recreate word table when refreshing counts

author Sarah Hoffmann <lonvia@denofr.de>

Sun, 4 Feb 2024 15:43:33 +0000 (16:43 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Sun, 4 Feb 2024 20:35:10 +0000 (21:35 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Feb 2024 15:43:33 +0000 (16:43 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Sun, 4 Feb 2024 20:35:10 +0000 (21:35 +0100)
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql

deleted file mode 100644 (file)

index 509f6f6..0000000
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ /dev/null
@@ -1,40 +0,0 @@
--- SPDX-License-Identifier: GPL-2.0-only
---
--- This file is part of Nominatim. (https://nominatim.org)
---
--- Copyright (C) 2022 by the Nominatim developer community.
--- For a full list of authors see the git log.
-
-DROP TABLE IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text NOT NULL,
-  type text NOT NULL,
-  word text,
-  info jsonb
-) {{db.tablespace.search_data}};
-
-CREATE INDEX idx_word_word_token ON word
-    USING BTREE (word_token) {{db.tablespace.search_index}};
--- Used when updating country names from the boundary relation.
-CREATE INDEX idx_word_country_names ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'C';
--- Used when inserting new postcodes on updates.
-CREATE INDEX idx_word_postcodes ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'P';
--- Used when inserting full words.
-CREATE INDEX idx_word_full_word ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'W';
--- Used when inserting analyzed housenumbers (exclude old-style entries).
-CREATE INDEX idx_word_housenumbers ON word
-    USING btree(word) {{db.tablespace.address_index}}
-    WHERE type = 'H' and word is not null;
-
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
-
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py

index 5e1b044e734336bf305f1c69c649bf18fda0fddc..afafe4a8305441bfa42cab62ec317c69fd513fc2 100644 (file)
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -110,7 +110,7 @@ class UpdateRefresh:
  
          if args.word_counts:
              LOG.warning('Recompute word statistics')
-            self._get_tokenizer(args.config).update_statistics()
+            self._get_tokenizer(args.config).update_statistics(args.config)
  
          if args.address_levels:
              LOG.warning('Updating address levels')
diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py

index 3d212ff980994d7b6511bc234215e1d3201cdd5e..67ca5bb10200aa1d095212898122569009e23d49 100644 (file)
--- a/nominatim/clicmd/setup.py
+++ b/nominatim/clicmd/setup.py
@@ -169,7 +169,7 @@ class SetupAll:
          tokenizer.finalize_import(args.config)
  
          LOG.warning('Recompute word counts')
-        tokenizer.update_statistics()
+        tokenizer.update_statistics(args.config)
  
          webdir = args.project_dir / 'website'
          LOG.warning('Setup website at %s', webdir)
diff --git a/nominatim/db/sql_preprocessor.py b/nominatim/db/sql_preprocessor.py

index 2e11f57195323e2e53aec9dc77d541f81b6522e6..3762d82eff1a8a0b0a8e17223eb3f9f9c154e329 100644 (file)
--- a/nominatim/db/sql_preprocessor.py
+++ b/nominatim/db/sql_preprocessor.py
@@ -90,6 +90,18 @@ class SQLPreprocessor:
          self.env.globals['postgres'] = _setup_postgresql_features(conn)
  
  
+    def run_string(self, conn: Connection, template: str, **kwargs: Any) -> None:
+        """ Execute the given SQL template string on the connection.
+            The keyword arguments may supply additional parameters
+            for preprocessing.
+        """
+        sql = self.env.from_string(template).render(**kwargs)
+
+        with conn.cursor() as cur:
+            cur.execute(sql)
+        conn.commit()
+
+
      def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
          """ Execute the given SQL file on the connection. The keyword arguments
              may supply additional parameters for preprocessing.
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py

index 061cff36b99f22273e55e350d410d4291c425b91..29bcc8e196cf29cf4ef110252fbad0a5f26da2b7 100644 (file)
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
  
  
      @abstractmethod
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
          """ Recompute any tokenizer statistics necessary for efficient lookup.
              This function is meant to be called from time to time by the user
              to improve performance. However, the tokenizer must not depend on
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 799ff559b94599c43e4f66270f82ec94ac0138cc..cbbaf71fd8d898e10a1c483a80e8f1079a92c9e2 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -31,6 +31,11 @@ DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  
+WORD_TYPES =(('country_names', 'C'),
+             ('postcodes', 'P'),
+             ('full_word', 'W'),
+             ('housenumbers', 'H'))
+
  def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
      """ Create a new instance of the tokenizer provided by this module.
      """
@@ -62,7 +67,8 @@ class ICUTokenizer(AbstractTokenizer):
  
          if init_db:
              self.update_sql_functions(config)
-            self._init_db_tables(config)
+            self._setup_db_tables(config, 'word')
+            self._create_base_indices(config, 'word')
  
  
      def init_from_project(self, config: Configuration) -> None:
@@ -80,9 +86,7 @@ class ICUTokenizer(AbstractTokenizer):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        self._create_lookup_indices(config, 'word')
  
  
      def update_sql_functions(self, config: Configuration) -> None:
@@ -100,24 +104,35 @@ class ICUTokenizer(AbstractTokenizer):
          self.init_from_project(config)
  
  
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration) -> None:
          """ Recompute frequencies for all name words.
          """
          with connect(self.dsn) as conn:
-            if conn.table_exists('search_name'):
-                with conn.cursor() as cur:
-                    cur.drop_table("word_frequencies")
-                    LOG.info("Computing word frequencies")
-                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                     SELECT unnest(name_vector) as id, count(*)
-                                     FROM search_name GROUP BY id""")
-                    cur.execute("CREATE INDEX ON word_frequencies(id)")
-                    LOG.info("Update word table with recomputed frequencies")
-                    cur.execute("""UPDATE word
-                                   SET info = info || jsonb_build_object('count', count)
-                                   FROM word_frequencies WHERE word_id = id""")
-                    cur.drop_table("word_frequencies")
+            if not conn.table_exists('search_name'):
+                return
+
+            with conn.cursor() as cur:
+                LOG.info('Computing word frequencies')
+                cur.drop_table('word_frequencies')
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute('CREATE INDEX ON word_frequencies(id)')
+                LOG.info('Update word table with recomputed frequencies')
+                cur.drop_table('tmp_word')
+                cur.execute("""CREATE TABLE tmp_word AS
+                                SELECT word_id, word_token, type, word,
+                                       (CASE WHEN wf.count is null THEN info
+                                          ELSE info || jsonb_build_object('count', wf.count)
+                                        END) as info
+                                FROM word LEFT JOIN word_frequencies wf
+                                  ON word.word_id = wf.id""")
+                cur.drop_table('word_frequencies')
              conn.commit()
+        self._create_base_indices(config, 'tmp_word')
+        self._create_lookup_indices(config, 'tmp_word')
+        self._move_temporary_word_table('tmp_word')
+
  
  
      def _cleanup_housenumbers(self) -> None:
@@ -219,16 +234,81 @@ class ICUTokenizer(AbstractTokenizer):
              self.loader.save_config_to_db(conn)
  
  
-    def _init_db_tables(self, config: Configuration) -> None:
+    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table(table_name)
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_string(conn, """
+                CREATE TABLE {{table_name}} (
+                      word_id INTEGER,
+                      word_token text NOT NULL,
+                      type text NOT NULL,
+                      word text,
+                      info jsonb
+                    ) {{db.tablespace.search_data}};
+                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
+
+                DROP SEQUENCE IF EXISTS seq_{{table_name}};
+                CREATE SEQUENCE seq_{{table_name}} start 1;
+                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
+            """, table_name=table_name)
+
+
+    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
          """ Set up the word table and fill it with pre-computed word
              frequencies.
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
+            sqlp.run_string(conn,
+                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
+                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
+                            table_name=table_name)
+            for name, ctype in WORD_TYPES:
+                sqlp.run_string(conn,
+                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
+                                   USING BTREE (word) {{db.tablespace.address_index}}
+                                   WHERE type = '{{column_type}}'
+                                """,
+                                table_name=table_name, idx_name=name,
+                                column_type=ctype)
+
+
+    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
+        """ Create addtional indexes used when running the API.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            # Index required for details lookup.
+            sqlp.run_string(conn, """
+                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
+                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
+            """,
+            table_name=table_name)
+
+
+    def _move_temporary_word_table(self, old: str) -> None:
+        """ Rename all tables and indexes used by the tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table('word')
+                cur.execute(f"ALTER TABLE {old} RENAME TO word")
+                for idx in ('word_token', 'word_id'):
+                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
+                                      RENAME TO idx_word_{idx}""")
+                for name, _ in WORD_TYPES:
+                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
+                                    RENAME TO idx_word_{name}""")
              conn.commit()
  
  
+
+
  class ICUNameAnalyzer(AbstractAnalyzer):
      """ The ICU analyzer uses the ICU library for splitting names.
  
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index 1b68a494383bb72804d662b8bcdff8456b87c6db..2d28a8b29891623e72e7f8c2f0f7b7c9cafbd160 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
              self._save_config(conn, config)
  
  
-    def update_statistics(self) -> None:
+    def update_statistics(self, _: Configuration) -> None:
          """ Recompute the frequency of full words.
          """
          with connect(self.dsn) as conn:
diff --git a/test/python/cli/conftest.py b/test/python/cli/conftest.py

index 7aea2c5917c8c716f679ec975df0bedd61a7f058..1bb393fb240613d1ed85f04d36269733c61469c8 100644 (file)
--- a/test/python/cli/conftest.py
+++ b/test/python/cli/conftest.py
@@ -38,10 +38,10 @@ class DummyTokenizer:
      def finalize_import(self, *args):
          self.finalize_import_called = True
  
-    def update_statistics(self):
+    def update_statistics(self, *args):
          self.update_statistics_called = True
  
-    def update_word_tokens(self):
+    def update_word_tokens(self, *args):
          self.update_word_tokens_called = True
  
  
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py

index 2d9da69a632d938ed733bbb44cc06435eec5f955..aa1afe160ca9010630b7b36502d14b173a453003 100644 (file)
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -7,7 +7,6 @@
  """
  Tests for ICU tokenizer.
  """
-import shutil
  import yaml
  import itertools
  
@@ -32,8 +31,6 @@ def test_config(project_env, tmp_path):
      sqldir.mkdir()
      (sqldir / 'tokenizer').mkdir()
      (sqldir / 'tokenizer' / 'icu_tokenizer.sql').write_text("SELECT 'a'")
-    shutil.copy(str(project_env.lib_dir.sql / 'tokenizer' / 'icu_tokenizer_tables.sql'),
-                str(sqldir / 'tokenizer' / 'icu_tokenizer_tables.sql'))
  
      project_env.lib_dir.sql = sqldir
  
@@ -204,16 +201,14 @@ def test_update_sql_functions(db_prop, temp_db_cursor,
  
  def test_finalize_import(tokenizer_factory, temp_db_conn,
                           temp_db_cursor, test_config, sql_preprocessor_cfg):
-    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_indices.sql'
-    func_file.write_text("""CREATE FUNCTION test() RETURNS TEXT
-                            AS $$ SELECT 'b'::text $$ LANGUAGE SQL""")
-
      tok = tokenizer_factory()
      tok.init_new_db(test_config)
  
+    assert not temp_db_conn.index_exists('idx_word_word_id')
+
      tok.finalize_import(test_config)
  
-    temp_db_cursor.scalar('SELECT test()') == 'b'
+    assert temp_db_conn.index_exists('idx_word_word_id')
  
  
  def test_check_database(test_config, tokenizer_factory,
@@ -224,19 +219,20 @@ def test_check_database(test_config, tokenizer_factory,
      assert tok.check_database(test_config) is None
  
  
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
      tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
  
  
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor,
+                           tokenizer_factory, test_config):
      word_table.add_full_word(1000, 'hello')
      table_factory('search_name',
                    'place_id BIGINT, name_vector INT[]',
                    [(12, [1000])])
      tok = tokenizer_factory()
  
-    tok.update_statistics()
+    tok.update_statistics(test_config)
  
      assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                      WHERE type = 'W' and
diff --git a/test/python/tokenizer/test_legacy.py b/test/python/tokenizer/test_legacy.py

index d63ee8e14624d5db3e652cfd677511f755b6a487..f7f04490b4c9ffc690221519bcb4389719b9f6dd 100644 (file)
--- a/test/python/tokenizer/test_legacy.py
+++ b/test/python/tokenizer/test_legacy.py
@@ -238,19 +238,19 @@ def test_check_database_bad_setup(test_config, tokenizer_factory, monkeypatch,
      assert tok.check_database(False) is not None
  
  
-def test_update_statistics_reverse_only(word_table, tokenizer_factory):
+def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_config):
      tok = tokenizer_factory()
-    tok.update_statistics()
+    tok.update_statistics(test_config)
  
  
-def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory):
+def test_update_statistics(word_table, table_factory, temp_db_cursor, tokenizer_factory, test_config):
      word_table.add_full_word(1000, 'hello')
      table_factory('search_name',
                    'place_id BIGINT, name_vector INT[]',
                    [(12, [1000])])
      tok = tokenizer_factory()
  
-    tok.update_statistics()
+    tok.update_statistics(test_config)
  
      assert temp_db_cursor.scalar("""SELECT count(*) FROM word
                                      WHERE word_token like ' %' and
author	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 4 Feb 2024 15:43:33 +0000 (16:43 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Sun, 4 Feb 2024 20:35:10 +0000 (21:35 +0100)
lib-sql/tokenizer/icu_tokenizer_tables.sql	[deleted file]	patch \| blob \| history
nominatim/clicmd/refresh.py		patch \| blob \| history
nominatim/clicmd/setup.py		patch \| blob \| history
nominatim/db/sql_preprocessor.py		patch \| blob \| history
nominatim/tokenizer/base.py		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
nominatim/tokenizer/legacy_tokenizer.py		patch \| blob \| history
test/python/cli/conftest.py		patch \| blob \| history
test/python/tokenizer/test_icu.py		patch \| blob \| history
test/python/tokenizer/test_legacy.py		patch \| blob \| history