]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_icu_tokenizer.py
switch housenumber tokens to new word table layout
[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
index c585c5afe0bf28bfa24590ed05cb165f6fd2dd01..9fbb9bb09688cf349bfb28a74c84d640fcd84767 100644 (file)
@@ -9,8 +9,6 @@ import re
 from textwrap import dedent
 from pathlib import Path
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
@@ -154,7 +152,7 @@ class LegacyICUTokenizer:
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
@@ -341,7 +339,7 @@ class LegacyICUNameAnalyzer:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
                     copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near')  else None, 0)
+                                oper if oper in ('in', 'near') else None, 0)
                     added += 1
 
             copystr.copy_out(cursor, 'word',
@@ -359,8 +357,7 @@ class LegacyICUNameAnalyzer:
         to_delete = existing_phrases - new_phrases
 
         if to_delete:
-            psycopg2.extras.execute_values(
-                cursor,
+            cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                     WHERE word = name and class = in_class and type = in_type
                           and ((op = '-' and operator is null) or op = operator)""",
@@ -374,21 +371,27 @@ class LegacyICUNameAnalyzer:
         """
         word_tokens = set()
         for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and info->>'cc'= %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT token, 'C', json_build_object('cc', %s)
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
@@ -411,33 +414,36 @@ class LegacyICUNameAnalyzer:
                 self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
-
         if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+            self._process_place_address(token_info, address)
 
         return token_info.data
 
 
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
@@ -595,7 +601,8 @@ class _TokenCache:
 
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []