git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
Merge pull request #3422 from lonvia/drop-non-parented-interpolations
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index cbbaf71fd8d898e10a1c483a80e8f1079a92c9e2..4b9dac69e18eb63760c07ff71a981f1343ab6ac3 100644 (file)
@@ -67,7 +67,7 @@ class ICUTokenizer(AbstractTokenizer):
 
         if init_db:
             self.update_sql_functions(config)
 
         if init_db:
             self.update_sql_functions(config)
-            self._setup_db_tables(config, 'word')
+            self._setup_db_tables(config)
             self._create_base_indices(config, 'word')
 
 
             self._create_base_indices(config, 'word')
 
 
@@ -104,7 +104,7 @@ class ICUTokenizer(AbstractTokenizer):
         self.init_from_project(config)
 
 
         self.init_from_project(config)
 
 
-    def update_statistics(self, config: Configuration) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
@@ -112,22 +112,93 @@ class ICUTokenizer(AbstractTokenizer):
                 return
 
             with conn.cursor() as cur:
                 return
 
             with conn.cursor() as cur:
-                LOG.info('Computing word frequencies')
-                cur.drop_table('word_frequencies')
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute('CREATE INDEX ON word_frequencies(id)')
-                LOG.info('Update word table with recomputed frequencies')
-                cur.drop_table('tmp_word')
-                cur.execute("""CREATE TABLE tmp_word AS
-                                SELECT word_id, word_token, type, word,
-                                       (CASE WHEN wf.count is null THEN info
-                                          ELSE info || jsonb_build_object('count', wf.count)
-                                        END) as info
-                                FROM word LEFT JOIN word_frequencies wf
-                                  ON word.word_id = wf.id""")
-                cur.drop_table('word_frequencies')
+                cur.execute('ANALYSE search_name')
+                if threads > 1:
+                    cur.execute('SET max_parallel_workers_per_gather TO %s',
+                                (min(threads, 6),))
+
+                if conn.server_version_tuple() < (12, 0):
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON word_frequencies(id)')
+                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
+                                     SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
+                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
+                                                                               INOUT info JSONB)
+                                   AS $$
+                                   DECLARE rec RECORD;
+                                   BEGIN
+                                   IF info is null THEN
+                                     info = '{}'::jsonb;
+                                   END IF;
+                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('count', rec.count);
+                                   END LOOP;
+                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
+                                   LOOP
+                                     info = info || jsonb_build_object('addr_count', rec.count);
+                                   END LOOP;
+                                   IF info = '{}'::jsonb THEN
+                                     info = null;
+                                   END IF;
+                                   END;
+                                   $$ LANGUAGE plpgsql IMMUTABLE;
+                                """)
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           word_freq_update(word_id, info) as info
+                                    FROM word
+                                """)
+                    cur.drop_table('word_frequencies')
+                    cur.drop_table('addressword_frequencies')
+                else:
+                    LOG.info('Computing word frequencies')
+                    cur.drop_table('word_frequencies')
+                    cur.execute("""
+                      CREATE TEMP TABLE word_frequencies AS
+                      WITH word_freq AS MATERIALIZED (
+                               SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id),
+                           addr_freq AS MATERIALIZED (
+                               SELECT unnest(nameaddress_vector) as id, count(*)
+                                     FROM search_name GROUP BY id)
+                      SELECT coalesce(a.id, w.id) as id,
+                             (CASE WHEN w.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('count', w.count) END
+                              ||
+                              CASE WHEN a.count is null THEN '{}'::JSONB
+                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
+                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
+                      """)
+                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
+                    cur.execute('ANALYSE word_frequencies')
+                    LOG.info('Update word table with recomputed frequencies')
+                    cur.drop_table('tmp_word')
+                    cur.execute("""CREATE TABLE tmp_word AS
+                                    SELECT word_id, word_token, type, word,
+                                           (CASE WHEN wf.info is null THEN word.info
+                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
+                                            END) as info
+                                    FROM word LEFT JOIN word_frequencies wf
+                                         ON word.word_id = wf.id
+                                """)
+                    cur.drop_table('word_frequencies')
+
+            with conn.cursor() as cur:
+                cur.execute('SET max_parallel_workers_per_gather TO 0')
+
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
             conn.commit()
         self._create_base_indices(config, 'tmp_word')
         self._create_lookup_indices(config, 'tmp_word')
             conn.commit()
         self._create_base_indices(config, 'tmp_word')
         self._create_lookup_indices(config, 'tmp_word')
@@ -210,19 +281,20 @@ class ICUTokenizer(AbstractTokenizer):
             return list(s[0].split('@')[0] for s in cur)
 
 
             return list(s[0].split('@')[0] for s in cur)
 
 
-    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
+    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
         """ Install the php script for the tokenizer.
         """
-        assert self.loader is not None
-        php_file = self.data_dir / "tokenizer.php"
+        if phpdir is not None:
+            assert self.loader is not None
+            php_file = self.data_dir / "tokenizer.php"
 
 
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent(f"""\
-                <?php
-                @define('CONST_Max_Word_Frequency', 10000000);
-                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', 10000000);
+                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 
 
     def _save_config(self) -> None:
 
 
     def _save_config(self) -> None:
@@ -234,28 +306,29 @@ class ICUTokenizer(AbstractTokenizer):
             self.loader.save_config_to_db(conn)
 
 
             self.loader.save_config_to_db(conn)
 
 
-    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
+    def _setup_db_tables(self, config: Configuration) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             with conn.cursor() as cur:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """
         with connect(self.dsn) as conn:
             with conn.cursor() as cur:
-                cur.drop_table(table_name)
+                cur.drop_table('word')
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn, """
             sqlp = SQLPreprocessor(conn, config)
             sqlp.run_string(conn, """
-                CREATE TABLE {{table_name}} (
+                CREATE TABLE word (
                       word_id INTEGER,
                       word_token text NOT NULL,
                       type text NOT NULL,
                       word text,
                       info jsonb
                     ) {{db.tablespace.search_data}};
                       word_id INTEGER,
                       word_token text NOT NULL,
                       type text NOT NULL,
                       word text,
                       info jsonb
                     ) {{db.tablespace.search_data}};
-                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 
 
-                DROP SEQUENCE IF EXISTS seq_{{table_name}};
-                CREATE SEQUENCE seq_{{table_name}} start 1;
-                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
-            """, table_name=table_name)
+                DROP SEQUENCE IF EXISTS seq_word;
+                CREATE SEQUENCE seq_word start 1;
+                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
+            """)
+            conn.commit()
 
 
     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 
 
     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
@@ -276,10 +349,11 @@ class ICUTokenizer(AbstractTokenizer):
                                 """,
                                 table_name=table_name, idx_name=name,
                                 column_type=ctype)
                                 """,
                                 table_name=table_name, idx_name=name,
                                 column_type=ctype)
+            conn.commit()
 
 
     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 
 
     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
-        """ Create addtional indexes used when running the API.
+        """ Create additional indexes used when running the API.
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
@@ -289,6 +363,7 @@ class ICUTokenizer(AbstractTokenizer):
                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
             """,
             table_name=table_name)
                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
             """,
             table_name=table_name)
+            conn.commit()
 
 
     def _move_temporary_word_table(self, old: str) -> None:
 
 
     def _move_temporary_word_table(self, old: str) -> None:
@@ -637,10 +712,11 @@ class ICUNameAnalyzer(AbstractAnalyzer):
                 token_info.add_street(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
                 if not item.suffix:
                 token_info.add_street(self._retrieve_full_tokens(item.name))
             elif item.kind == 'place':
                 if not item.suffix:
-                    token_info.add_place(self._compute_partial_tokens(item.name))
+                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
             elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full', 'inclusion'):
             elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full', 'inclusion'):
-                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
+                token_info.add_address_term(item.kind,
+                                            itertools.chain(*self._compute_name_tokens([item])))
 
 
     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
 
 
     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
@@ -681,36 +757,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         return result
 
 
         return result
 
 
-    def _compute_partial_tokens(self, name: str) -> List[int]:
-        """ Normalize the given term, split it into partial words and return
-            then token list for them.
-        """
-        assert self.conn is not None
-        norm_name = self._search_normalized(name)
-
-        tokens = []
-        need_lookup = []
-        for partial in norm_name.split():
-            token = self._cache.partials.get(partial)
-            if token:
-                tokens.append(token)
-            else:
-                need_lookup.append(partial)
-
-        if need_lookup:
-            with self.conn.cursor() as cur:
-                cur.execute("""SELECT word, getorcreate_partial_word(word)
-                               FROM unnest(%s) word""",
-                            (need_lookup, ))
-
-                for partial, token in cur:
-                    assert token is not None
-                    tokens.append(token)
-                    self._cache.partials[partial] = token
-
-        return tokens
-
-
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
             The name is only retrieved for the standard analyser.
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
             The name is only retrieved for the standard analyser.
@@ -882,8 +928,9 @@ class _TokenInfo:
     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
         """ Add additional address terms.
         """
     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
         """ Add additional address terms.
         """
-        if partials:
-            self.address_tokens[key] = self._mk_array(partials)
+        array = self._mk_array(partials)
+        if len(array) > 2:
+            self.address_tokens[key] = array
 
     def set_postcode(self, postcode: Optional[str]) -> None:
         """ Set the postcode to the given one.
 
     def set_postcode(self, postcode: Optional[str]) -> None:
         """ Set the postcode to the given one.