X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/6c6bbe574725464d302f2cea71b22515c5d1ad1a..38798bba13d1257936e960517e0c9d16aee05cff:/nominatim/tokenizer/icu_tokenizer.py?ds=inline diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 1e3eab98..251f4da5 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -8,7 +8,8 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using libICU instead of the PostgreSQL module. """ -from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable +from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \ + Dict, Set, Iterable import itertools import json import logging @@ -22,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.data.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.place_sanitizer import PlaceSanitizer -from nominatim.tokenizer.sanitizers.base import PlaceName +from nominatim.data.place_name import PlaceName from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer @@ -30,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" LOG = logging.getLogger() +WORD_TYPES =(('country_names', 'C'), + ('postcodes', 'P'), + ('full_word', 'W'), + ('housenumbers', 'H')) + def create(dsn: str, data_dir: Path) -> 'ICUTokenizer': """ Create a new instance of the tokenizer provided by this module. """ @@ -37,7 +43,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer': class ICUTokenizer(AbstractTokenizer): - """ This tokenizer uses libICU to covert names and queries to ASCII. + """ This tokenizer uses libICU to convert names and queries to ASCII. Otherwise it uses the same algorithms and data structures as the normalization routines in Nominatim 3. """ @@ -61,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer): if init_db: self.update_sql_functions(config) - self._init_db_tables(config) + self._setup_db_tables(config) + self._create_base_indices(config, 'word') def init_from_project(self, config: Configuration) -> None: @@ -79,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + self._create_lookup_indices(config, 'word') def update_sql_functions(self, config: Configuration) -> None: @@ -99,24 +104,106 @@ class ICUTokenizer(AbstractTokenizer): self.init_from_project(config) - def update_statistics(self) -> None: + def update_statistics(self, config: Configuration, threads: int = 2) -> None: """ Recompute frequencies for all name words. """ with connect(self.dsn) as conn: - if conn.table_exists('search_name'): - with conn.cursor() as cur: - cur.drop_table("word_frequencies") - LOG.info("Computing word frequencies") + if not conn.table_exists('search_name'): + return + + with conn.cursor() as cur: + cur.execute('ANALYSE search_name') + if threads > 1: + cur.execute('SET max_parallel_workers_per_gather TO %s', + (min(threads, 6),)) + + if conn.server_version_tuple() < (12, 0): + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') cur.execute("""CREATE TEMP TABLE word_frequencies AS SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id""") - cur.execute("CREATE INDEX ON word_frequencies(id)") - LOG.info("Update word table with recomputed frequencies") - cur.execute("""UPDATE word - SET info = info || jsonb_build_object('count', count) - FROM word_frequencies WHERE word_id = id""") - cur.drop_table("word_frequencies") + cur.execute('CREATE INDEX ON word_frequencies(id)') + cur.execute("""CREATE TEMP TABLE addressword_frequencies AS + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id""") + cur.execute('CREATE INDEX ON addressword_frequencies(id)') + cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, + INOUT info JSONB) + AS $$ + DECLARE rec RECORD; + BEGIN + IF info is null THEN + info = '{}'::jsonb; + END IF; + FOR rec IN SELECT count FROM word_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('count', rec.count); + END LOOP; + FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid + LOOP + info = info || jsonb_build_object('addr_count', rec.count); + END LOOP; + IF info = '{}'::jsonb THEN + info = null; + END IF; + END; + $$ LANGUAGE plpgsql IMMUTABLE; + """) + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + word_freq_update(word_id, info) as info + FROM word + """) + cur.drop_table('word_frequencies') + cur.drop_table('addressword_frequencies') + else: + LOG.info('Computing word frequencies') + cur.drop_table('word_frequencies') + cur.execute(""" + CREATE TEMP TABLE word_frequencies AS + WITH word_freq AS MATERIALIZED ( + SELECT unnest(name_vector) as id, count(*) + FROM search_name GROUP BY id), + addr_freq AS MATERIALIZED ( + SELECT unnest(nameaddress_vector) as id, count(*) + FROM search_name GROUP BY id) + SELECT coalesce(a.id, w.id) as id, + (CASE WHEN w.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('count', w.count) END + || + CASE WHEN a.count is null THEN '{}'::JSONB + ELSE jsonb_build_object('addr_count', a.count) END) as info + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; + """) + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') + cur.execute('ANALYSE word_frequencies') + LOG.info('Update word table with recomputed frequencies') + cur.drop_table('tmp_word') + cur.execute("""CREATE TABLE tmp_word AS + SELECT word_id, word_token, type, word, + (CASE WHEN wf.info is null THEN word.info + ELSE coalesce(word.info, '{}'::jsonb) || wf.info + END) as info + FROM word LEFT JOIN word_frequencies wf + ON word.word_id = wf.id + """) + cur.drop_table('word_frequencies') + + with conn.cursor() as cur: + cur.execute('SET max_parallel_workers_per_gather TO 0') + + sqlp = SQLPreprocessor(conn, config) + sqlp.run_string(conn, + 'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"') conn.commit() + self._create_base_indices(config, 'tmp_word') + self._create_lookup_indices(config, 'tmp_word') + self._move_temporary_word_table('tmp_word') + def _cleanup_housenumbers(self) -> None: @@ -182,19 +269,32 @@ class ICUTokenizer(AbstractTokenizer): self.loader.make_token_analysis()) - def _install_php(self, phpdir: Path, overwrite: bool = True) -> None: + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + with conn.cursor() as cur: + cur.execute("""SELECT word, sum((info->>'count')::int) as count + FROM word WHERE type = 'W' + GROUP BY word + ORDER BY count DESC LIMIT %s""", (num,)) + return list(s[0].split('@')[0] for s in cur) + + + def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ - assert self.loader is not None - php_file = self.data_dir / "tokenizer.php" + if phpdir is not None: + assert self.loader is not None + php_file = self.data_dir / "tokenizer.php" - if not php_file.exists() or overwrite: - php_file.write_text(dedent(f"""\ - None: @@ -206,16 +306,84 @@ class ICUTokenizer(AbstractTokenizer): self.loader.save_config_to_db(conn) - def _init_db_tables(self, config: Configuration) -> None: + def _setup_db_tables(self, config: Configuration) -> None: """ Set up the word table and fill it with pre-computed word frequencies. """ with connect(self.dsn) as conn: + with conn.cursor() as cur: + cur.drop_table('word') sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql') + sqlp.run_string(conn, """ + CREATE TABLE word ( + word_id INTEGER, + word_token text NOT NULL, + type text NOT NULL, + word text, + info jsonb + ) {{db.tablespace.search_data}}; + GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; + + DROP SEQUENCE IF EXISTS seq_word; + CREATE SEQUENCE seq_word start 1; + GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}"; + """) conn.commit() + def _create_base_indices(self, config: Configuration, table_name: str) -> None: + """ Set up the word table and fill it with pre-computed word + frequencies. + """ + with connect(self.dsn) as conn: + sqlp = SQLPreprocessor(conn, config) + sqlp.run_string(conn, + """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}} + USING BTREE (word_token) {{db.tablespace.search_index}}""", + table_name=table_name) + for name, ctype in WORD_TYPES: + sqlp.run_string(conn, + """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}} + USING BTREE (word) {{db.tablespace.address_index}} + WHERE type = '{{column_type}}' + """, + table_name=table_name, idx_name=name, + column_type=ctype) + conn.commit() + + + def _create_lookup_indices(self, config: Configuration, table_name: str) -> None: + """ Create additional indexes used when running the API. + """ + with connect(self.dsn) as conn: + sqlp = SQLPreprocessor(conn, config) + # Index required for details lookup. + sqlp.run_string(conn, """ + CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id + ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}} + """, + table_name=table_name) + conn.commit() + + + def _move_temporary_word_table(self, old: str) -> None: + """ Rename all tables and indexes used by the tokenizer. + """ + with connect(self.dsn) as conn: + with conn.cursor() as cur: + cur.drop_table('word') + cur.execute(f"ALTER TABLE {old} RENAME TO word") + for idx in ('word_token', 'word_id'): + cur.execute(f"""ALTER INDEX idx_{old}_{idx} + RENAME TO idx_word_{idx}""") + for name, _ in WORD_TYPES: + cur.execute(f"""ALTER INDEX idx_{old}_{name} + RENAME TO idx_word_{name}""") + conn.commit() + + + + class ICUNameAnalyzer(AbstractAnalyzer): """ The ICU analyzer uses the ICU library for splitting names. @@ -323,7 +491,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): postcode_name = place.name.strip().upper() variant_base = None else: - postcode_name = analyzer.normalize(place.name) + postcode_name = analyzer.get_canonical_id(place) variant_base = place.get_attr("variant") if variant_base: @@ -358,7 +526,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): if analyzer is None: variants = [term] else: - variants = analyzer.get_variants_ascii(variant) + variants = analyzer.compute_variants(variant) if term not in variants: variants.append(term) else: @@ -374,7 +542,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): - def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], + def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: """ Replace the search index for special phrases with the new phrases. If `should_replace` is True, then the previous set of will be @@ -430,7 +598,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): def _remove_special_phrases(self, cursor: Cursor, new_phrases: Set[Tuple[str, str, str, str]], existing_phrases: Set[Tuple[str, str, str, str]]) -> int: - """ Remove all phrases from the databse that are no longer in the + """ Remove all phrases from the database that are no longer in the new phrase list. """ to_delete = existing_phrases - new_phrases @@ -565,24 +733,25 @@ class ICUNameAnalyzer(AbstractAnalyzer): result = self._cache.housenumbers.get(norm_name, result) if result[0] is None: with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call] + hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + + result = hid, norm_name self._cache.housenumbers[norm_name] = result else: # Otherwise use the analyzer to determine the canonical name. # Per convention we use the first variant as the 'lookup name', the # name that gets saved in the housenumber field of the place. - norm_name = analyzer.normalize(hnr.name) - if norm_name: - result = self._cache.housenumbers.get(norm_name, result) + word_id = analyzer.get_canonical_id(hnr) + if word_id: + result = self._cache.housenumbers.get(word_id, result) if result[0] is None: - variants = analyzer.get_variants_ascii(norm_name) + variants = analyzer.compute_variants(word_id) if variants: with self.conn.cursor() as cur: - cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", - (norm_name, list(variants))) - result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call] - self._cache.housenumbers[norm_name] = result + hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)", + (word_id, list(variants))) + result = hid, variants[0] + self._cache.housenumbers[word_id] = result return result @@ -619,7 +788,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): def _retrieve_full_tokens(self, name: str) -> List[int]: """ Get the full name token for the given name, if it exists. - The name is only retrived for the standard analyser. + The name is only retrieved for the standard analyser. """ assert self.conn is not None norm_name = self._search_normalized(name) @@ -649,23 +818,22 @@ class ICUNameAnalyzer(AbstractAnalyzer): for name in names: analyzer_id = name.get_attr('analyzer') analyzer = self.token_analysis.get_analyzer(analyzer_id) - norm_name = analyzer.normalize(name.name) + word_id = analyzer.get_canonical_id(name) if analyzer_id is None: - token_id = norm_name + token_id = word_id else: - token_id = f'{norm_name}@{analyzer_id}' + token_id = f'{word_id}@{analyzer_id}' full, part = self._cache.names.get(token_id, (None, None)) if full is None: - variants = analyzer.get_variants_ascii(norm_name) + variants = analyzer.compute_variants(word_id) if not variants: continue with self.conn.cursor() as cur: cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", (token_id, variants)) - full, part = cast(Tuple[int, List[int]], - cur.fetchone()) # type: ignore[no-untyped-call] + full, part = cast(Tuple[int, List[int]], cur.fetchone()) self._cache.names[token_id] = (full, part) @@ -687,7 +855,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): postcode_name = item.name.strip().upper() variant_base = None else: - postcode_name = analyzer.normalize(item.name) + postcode_name = analyzer.get_canonical_id(item) variant_base = item.get_attr("variant") if variant_base: @@ -702,7 +870,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): variants = {term} if analyzer is not None and variant_base: - variants.update(analyzer.get_variants_ascii(variant_base)) + variants.update(analyzer.compute_variants(variant_base)) with self.conn.cursor() as cur: cur.execute("SELECT create_postcode_word(%s, %s)", @@ -719,7 +887,7 @@ class _TokenInfo: self.names: Optional[str] = None self.housenumbers: Set[str] = set() self.housenumber_tokens: Set[int] = set() - self.street_tokens: Set[int] = set() + self.street_tokens: Optional[Set[int]] = None self.place_tokens: Set[int] = set() self.address_tokens: Dict[str, str] = {} self.postcode: Optional[str] = None @@ -741,7 +909,7 @@ class _TokenInfo: out['hnr'] = ';'.join(self.housenumbers) out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) - if self.street_tokens: + if self.street_tokens is not None: out['street'] = self._mk_array(self.street_tokens) if self.place_tokens: @@ -775,6 +943,8 @@ class _TokenInfo: def add_street(self, tokens: Iterable[int]) -> None: """ Add addr:street match terms. """ + if self.street_tokens is None: + self.street_tokens = set() self.street_tokens.update(tokens)