bdd: run full import on tests
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 98a1daedc37ea142bde24264ebc46fb22f8b2219..9c25b6d7940fc145a2565a326d239463e32227cc 100644
@@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
 
     def add_country_names(self, country_code, names):
-        """ Add names for the given country to the search index.
+        """ Add default names for the given country to the search index.
         """
         # Make sure any name preprocessing for country names applies.
         info = PlaceInfo({'name': names, 'country_code': country_code,
                           'rank_address': 4, 'class': 'boundary',
                           'type': 'administrative'})
         self._add_country_full_names(country_code,
-                                     self.sanitizer.process_names(info)[0])
+                                     self.sanitizer.process_names(info)[0],
+                                     internal=True)
 
 
-    def _add_country_full_names(self, country_code, names):
+    def _add_country_full_names(self, country_code, names, internal=False):
         """ Add names for the given country from an already sanitized
             name list.
         """
@@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and word = %s""",
+            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
+                             FROM word
+                             WHERE type = 'C' and word = %s""",
                         (country_code, ))
-            existing_tokens = {t[0] for t in cur}
-
-            # Only add those names that are not yet in the list.
-            new_tokens = word_tokens - existing_tokens
-            if new_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT token, 'C', %s
-                                FROM unnest(%s) as token)
-                            """, (country_code, list(new_tokens)))
+            existing_tokens = {True: set(), False: set()} # internal/external names
+            for word in cur:
+                existing_tokens[word[1]].add(word[0])
 
             # Delete names that no longer exist.
-            gone_tokens = existing_tokens - word_tokens
+            gone_tokens = existing_tokens[internal] - word_tokens
+            if internal:
+                gone_tokens.update(existing_tokens[False] & word_tokens)
             if gone_tokens:
                 cur.execute("""DELETE FROM word
                                USING unnest(%s) as token
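
The deletion rule above is worth spelling out: on an internal import, externally-added tokens that now also appear in the default list are deleted as well, so that the insert further down can recreate them with the internal flag set. A minimal, self-contained rehearsal of that set arithmetic with made-up tokens:

    # Plain-set sketch of the bookkeeping above; tokens are invented.
    existing_tokens = {True: {'de', 'deutschland'},   # rows flagged internal
                       False: {'germany'}}            # rows added from OSM data
    word_tokens = {'de', 'germany'}                   # incoming default names
    internal = True

    gone_tokens = existing_tokens[internal] - word_tokens   # {'deutschland'}
    if internal:
        # external duplicates go too, to be re-added as internal below
        gone_tokens.update(existing_tokens[False] & word_tokens)
    assert gone_tokens == {'deutschland', 'germany'}
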
@@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                                      and word_token = token""",
                             (list(gone_tokens), country_code))
 
+            # Only add those names that are not yet in the list.
+            new_tokens = word_tokens - existing_tokens[True]
+            if not internal:
+                new_tokens -= existing_tokens[False]
+            if new_tokens:
+                if internal:
+                    sql = """INSERT INTO word (word_token, type, word, info)
+                               (SELECT token, 'C', %s, '{"internal": "yes"}'
+                                  FROM unnest(%s) as token)
+                           """
+                else:
+                    sql = """INSERT INTO word (word_token, type, word)
+                                   (SELECT token, 'C', %s
+                                    FROM unnest(%s) as token)
+                          """
+                cur.execute(sql, (country_code, list(new_tokens)))
+
 
     def process_place(self, place):
         """ Determine tokenizer information about the given place.