bdd: run full import on tests
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 98a1daedc37ea142bde24264ebc46fb22f8b2219..9c25b6d7940fc145a2565a326d239463e32227cc 100644
@@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
         """
         # Make sure any name preprocessing for country names applies.
         info = PlaceInfo({'name': names, 'country_code': country_code,
                           'rank_address': 4, 'class': 'boundary',
                           'type': 'administrative'})
         self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
 
 
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
         """ Add names for the given country from an already sanitized
             name list.
         """
@@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                             FROM word
+                             WHERE type = 'C' and word = %s""",
                         (country_code, ))
-            existing_tokens = {t[0] for t in cur}
-
-            # Only add those names that are not yet in the list.
-            new_tokens = word_tokens - existing_tokens
-            if new_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(new_tokens)))
+            existing_tokens = {True: set(), False: set()} # internal/external names
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
 
             # Delete names that no longer exist.
-            gone_tokens = existing_tokens - word_tokens
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
             if gone_tokens:
                 cur.execute("""DELETE FROM word
                                USING unnest(%s) as token
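
The deletion rule above is worth spelling out: on an internal import, externally-added tokens that now also appear in the default list are deleted as well, so that the insert further down can recreate them with the internal flag set. A minimal, self-contained rehearsal of that set arithmetic with made-up tokens:

    # Plain-set sketch of the bookkeeping above; tokens are invented.
    existing_tokens = {True: {'de', 'deutschland'},   # rows flagged internal
                       False: {'germany'}}            # rows added from OSM data
    word_tokens = {'de', 'germany'}                   # incoming default names
    internal = True

    gone_tokens = existing_tokens[internal] - word_tokens   # {'deutschland'}
    if internal:
        # external duplicates go too, to be re-added as internal below
        gone_tokens.update(existing_tokens[False] & word_tokens)
    assert gone_tokens == {'deutschland', 'germany'}
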
@@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                                      and word_token = token""",
                             (list(gone_tokens), country_code))
 
+            # Only add those names that are not yet in the list.
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                  FROM unnest(%s) as token)
+                           """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                                   (SELECT token, 'C', %s
+                                    FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
+
 
     def process_place(self, place):
         """ Determine tokenizer information about the given place.