]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_tokenizer.py
do not count words when in reverse-only mode
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
index 686fbd7939ee70b10a5a7557cec334ba0e324733..3331a3210aaba70d49b602299c2ce9e88238a3a0 100644 (file)
@@ -2,7 +2,6 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
@@ -68,10 +67,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
 
 
-    def finalize_import(self, _):
+    def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 
 
     def update_sql_functions(self, config):
@@ -97,18 +99,19 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:
-            with conn.cursor() as cur:
-                cur.drop_table("word_frequencies")
-                LOG.info("Computing word frequencies")
-                cur.execute("""CREATE TEMP TABLE word_frequencies AS
-                                 SELECT unnest(name_vector) as id, count(*)
-                                 FROM search_name GROUP BY id""")
-                cur.execute("CREATE INDEX ON word_frequencies(id)")
-                LOG.info("Update word table with recomputed frequencies")
-                cur.execute("""UPDATE word
-                               SET info = info || jsonb_build_object('count', count)
-                               FROM word_frequencies WHERE word_id = id""")
-                cur.drop_table("word_frequencies")
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
             conn.commit()
 
 
@@ -161,43 +164,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-
-        return words
-
 
 class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.