From: Sarah Hoffmann
Date: Tue, 19 Oct 2021 09:21:16 +0000 (+0200)
Subject: make word recount a tokenizer-specific function
X-Git-Tag: v4.0.0~16^2~2
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/e8e2502e2f9d2275b8d567341400672adea9fea3?ds=inline

make word recount a tokenizer-specific function
---

diff --git a/lib-sql/words_from_search_name.sql b/lib-sql/words_from_search_name.sql
deleted file mode 100644
index b7727dc6..00000000
--- a/lib-sql/words_from_search_name.sql
+++ /dev/null
@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
-  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
-  WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index aa540f6b..e7d7d7ba 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -71,8 +71,8 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
             cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index 02bc312f..94fac1fc 100644
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -205,6 +205,16 @@ class AbstractTokenizer(ABC):
         pass
 
 
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it to be called in order to work.
+        """
+        pass
+
+
     @abstractmethod
     def name_analyzer(self) -> AbstractAnalyzer:
         """ Create a new analyzer for tokenizing names and queries
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 12d1eccd..686fbd79 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -93,6 +93,25 @@ class LegacyICUTokenizer(AbstractTokenizer):
         return None
 
 
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index c935f20d..d901a68d 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -186,6 +186,24 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word SET search_name_count = count
+                               FROM word_frequencies
+                               WHERE word_token like ' %' and word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokinzer. Analyzers are context managers and should
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py
index 5aaee0c8..00ae5dc9 100644
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -14,12 +14,6 @@ from nominatim.version import NOMINATIM_VERSION
 
 LOG = logging.getLogger()
 
-def recompute_word_counts(dsn, sql_dir):
-    """ Compute the frequency of full-word search terms.
-    """
-    execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
 def _add_address_level_rows_from_entry(rows, entry):
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a
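
Note on the change: recomputing word counts is no longer a fixed SQL file run by the refresh tool; each tokenizer now ships its own update_statistics() implementation and 'nominatim refresh --word-counts' simply dispatches to whichever tokenizer the database was set up with. The same hook can also be reached from Python. Below is a minimal sketch, assuming get_tokenizer_for_db() from nominatim.tokenizer.factory and an already constructed Configuration object, both of which are outside this diff.

    # Minimal sketch: trigger the tokenizer-specific word recount from Python.
    # Assumes 'config' is a Configuration for an existing Nominatim project;
    # get_tokenizer_for_db() loads the tokenizer the database was imported with
    # (ICU or legacy), so the matching update_statistics() implementation runs.
    from nominatim.tokenizer import factory as tokenizer_factory

    def refresh_word_counts(config):
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
        tokenizer.update_statistics()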