Merge pull request #3342 from mtmail/tyops

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index b6e646377b0e7f9d6dea691cf1ffcc6e1295632d..c1821d7edc7b88b2aa1f95797be2ddfce0ee0c85 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
  def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
  
          if init_db:
              self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config)
+            self._create_base_indices(config, 'word')
  
  
      def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')
  
  
      def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,39 @@ class ICUTokenizer(AbstractTokenizer):
          self.init_from_project(config)
  
  
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
          """ Recompute frequencies for all name words.
          """
          with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                  ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
+
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
              conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+
  
  
      def _cleanup_housenumbers(self) -> None:
@@ -183,19 +202,32 @@ class ICUTokenizer(AbstractTokenizer):
                                 self.loader.make_token_analysis())
  
  
-    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
+
+
+    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
          """ Install the php script for the tokenizer.
          """
-        assert self.loader is not None
-        php_file = self.data_dir / "tokenizer.php"
+        if phpdir is not None:
+            assert self.loader is not None
+            php_file = self.data_dir / "tokenizer.php"
  
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent(f"""\
-                <?php
-                @define('CONST_Max_Word_Frequency', 10000000);
-                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
+            if not php_file.exists() or overwrite:
+                php_file.write_text(dedent(f"""\
+                    <?php
+                    @define('CONST_Max_Word_Frequency', 10000000);
+                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
  
  
      def _save_config(self) -> None:
@@ -207,16 +239,84 @@ class ICUTokenizer(AbstractTokenizer):
              self.loader.save_config_to_db(conn)
  
  
-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
          """
          with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn, """
+                CREATE TABLE word (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_word;
+                CREATE SEQUENCE seq_word start 1;
+                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
+            """)
              conn.commit()
  
  
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+            conn.commit()
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create additional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
+            conn.commit()
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                    RENAME TO idx_word_{name}""")
+            conn.commit()
+
+
+
+
  class ICUNameAnalyzer(AbstractAnalyzer):
      """ The ICU analyzer uses the ICU library for splitting names.