]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 31eaaf2958aef1411a8228462fee04d68507555b..c1821d7edc7b88b2aa1f95797be2ddfce0ee0c85 100644 (file)
@@ -23,7 +23,7 @@ from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
@@ -38,7 +43,7 @@ def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
 
 
 class ICUTokenizer(AbstractTokenizer):
 
 
 class ICUTokenizer(AbstractTokenizer):
-    """ This tokenizer uses libICU to covert names and queries to ASCII.
+    """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
 
         if init_db:
             self.update_sql_functions(config)
 
         if init_db:
             self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config)
+            self._create_base_indices(config, 'word')
 
 
     def init_from_project(self, config: Configuration) -> None:
 
 
     def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')
 
 
     def update_sql_functions(self, config: Configuration) -> None:
 
 
     def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,39 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)
 
 
         self.init_from_project(config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                  ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
+
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
             conn.commit()
             conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+
 
 
     def _cleanup_housenumbers(self) -> None:
 
 
     def _cleanup_housenumbers(self) -> None:
@@ -183,19 +202,32 @@ class ICUTokenizer(AbstractTokenizer):
                                self.loader.make_token_analysis())
 
 
                                self.loader.make_token_analysis())
 
 
-    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
+
+
+    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
         """ Install the php script for the tokenizer.
         """
-        assert self.loader is not None
-        php_file = self.data_dir / "tokenizer.php"
+        if phpdir is not None:
+            assert self.loader is not None
+            php_file = self.data_dir / "tokenizer.php"
 
 
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent(f"""\
-                <?php
-                @define('CONST_Max_Word_Frequency', 10000000);
-                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', 10000000);
+                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 
 
     def _save_config(self) -> None:
 
 
     def _save_config(self) -> None:
@@ -207,16 +239,84 @@ class ICUTokenizer(AbstractTokenizer):
             self.loader.save_config_to_db(conn)
 
 
             self.loader.save_config_to_db(conn)
 
 
-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                CREATE TABLE word (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_word;
+                CREATE SEQUENCE seq_word start 1;
+                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
+            """)
+            conn.commit()
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+            conn.commit()
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
             conn.commit()
 
 
             conn.commit()
 
 
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                    RENAME TO idx_word_{name}""")
+            conn.commit()
+
+
+
+
 class ICUNameAnalyzer(AbstractAnalyzer):
     """ The ICU analyzer uses the ICU library for splitting names.
 
 class ICUNameAnalyzer(AbstractAnalyzer):
     """ The ICU analyzer uses the ICU library for splitting names.
 
@@ -324,7 +424,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
                             postcode_name = place.name.strip().upper()
                             variant_base = None
                         else:
-                            postcode_name = analyzer.normalize(place.name)
+                            postcode_name = analyzer.get_canonical_id(place)
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
                             variant_base = place.get_attr("variant")
 
                         if variant_base:
@@ -359,7 +459,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 if analyzer is None:
                     variants = [term]
                 else:
                 if analyzer is None:
                     variants = [term]
                 else:
-                    variants = analyzer.get_variants_ascii(variant)
+                    variants = analyzer.compute_variants(variant)
                     if term not in variants:
                         variants.append(term)
             else:
                     if term not in variants:
                         variants.append(term)
             else:
@@ -431,7 +531,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
     def _remove_special_phrases(self, cursor: Cursor,
                              new_phrases: Set[Tuple[str, str, str, str]],
                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
-        """ Remove all phrases from the databse that are no longer in the
+        """ Remove all phrases from the database that are no longer in the
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
             new phrase list.
         """
         to_delete = existing_phrases - new_phrases
@@ -566,24 +666,25 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
                 result = self._cache.housenumbers.get(norm_name, result)
                 if result[0] is None:
                     with self.conn.cursor() as cur:
-                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                        result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+                        result = hid, norm_name
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
                         self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.
             # Per convention we use the first variant as the 'lookup name', the
             # name that gets saved in the housenumber field of the place.
-            norm_name = analyzer.normalize(hnr.name)
-            if norm_name:
-                result = self._cache.housenumbers.get(norm_name, result)
+            word_id = analyzer.get_canonical_id(hnr)
+            if word_id:
+                result = self._cache.housenumbers.get(word_id, result)
                 if result[0] is None:
                 if result[0] is None:
-                    variants = analyzer.get_variants_ascii(norm_name)
+                    variants = analyzer.compute_variants(word_id)
                     if variants:
                         with self.conn.cursor() as cur:
                     if variants:
                         with self.conn.cursor() as cur:
-                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
-                                        (norm_name, list(variants)))
-                            result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
-                            self._cache.housenumbers[norm_name] = result
+                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+                                             (word_id, list(variants)))
+                            result = hid, variants[0]
+                            self._cache.housenumbers[word_id] = result
 
         return result
 
 
         return result
 
@@ -620,7 +721,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
 
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
-            The name is only retrived for the standard analyser.
+            The name is only retrieved for the standard analyser.
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
         """
         assert self.conn is not None
         norm_name = self._search_normalized(name)
@@ -650,23 +751,22 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
         for name in names:
             analyzer_id = name.get_attr('analyzer')
             analyzer = self.token_analysis.get_analyzer(analyzer_id)
-            norm_name = analyzer.normalize(name.name)
+            word_id = analyzer.get_canonical_id(name)
             if analyzer_id is None:
             if analyzer_id is None:
-                token_id = norm_name
+                token_id = word_id
             else:
             else:
-                token_id = f'{norm_name}@{analyzer_id}'
+                token_id = f'{word_id}@{analyzer_id}'
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = analyzer.get_variants_ascii(norm_name)
+                variants = analyzer.compute_variants(word_id)
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
-                    full, part = cast(Tuple[int, List[int]],
-                                      cur.fetchone()) # type: ignore[no-untyped-call]
+                    full, part = cast(Tuple[int, List[int]], cur.fetchone())
 
                 self._cache.names[token_id] = (full, part)
 
 
                 self._cache.names[token_id] = (full, part)
 
@@ -688,7 +788,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
             postcode_name = item.name.strip().upper()
             variant_base = None
         else:
-            postcode_name = analyzer.normalize(item.name)
+            postcode_name = analyzer.get_canonical_id(item)
             variant_base = item.get_attr("variant")
 
         if variant_base:
             variant_base = item.get_attr("variant")
 
         if variant_base:
@@ -703,7 +803,7 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
             variants = {term}
             if analyzer is not None and variant_base:
 
             variants = {term}
             if analyzer is not None and variant_base:
-                variants.update(analyzer.get_variants_ascii(variant_base))
+                variants.update(analyzer.compute_variants(variant_base))
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
 
             with self.conn.cursor() as cur:
                 cur.execute("SELECT create_postcode_word(%s, %s)",
@@ -720,7 +820,7 @@ class _TokenInfo:
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
-        self.street_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
@@ -742,7 +842,7 @@ class _TokenInfo:
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
-        if self.street_tokens:
+        if self.street_tokens is not None:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
@@ -776,6 +876,8 @@ class _TokenInfo:
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
+        if self.street_tokens is None:
+            self.street_tokens = set()
         self.street_tokens.update(tokens)
 
 
         self.street_tokens.update(tokens)