Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 7205ddefab0c449ec33da6610fe98edb8cfb48ba..2bd22c7207cb3f3cbf2de920a0a5887d67afd04d 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -335,7 +335,7 @@ class LegacyICUNameAnalyzer:
                                         'search_name_count'])
  
  
                                         'search_name_count'])
  
  
-    def update_special_phrases(self, phrases):
+    def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
          """
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
          """ Replace the search index for special phrases with the new phrases.
          """
          norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
@@ -374,7 +374,7 @@ class LegacyICUNameAnalyzer:
                                columns=['word', 'word_token', 'class', 'type',
                                         'operator', 'search_name_count'])
  
                                columns=['word', 'word_token', 'class', 'type',
                                         'operator', 'search_name_count'])
  
-            if to_delete:
+            if to_delete and should_replace:
                  psycopg2.extras.execute_values(
                      cur,
                      """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                  psycopg2.extras.execute_values(
                      cur,
                      """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
@@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
          names = place.get('name')
  
          if names:
          names = place.get('name')
  
          if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            full_names = self._compute_full_names(names)
  
              token_info.add_names(self.conn, full_names)
  
  
              token_info.add_names(self.conn, full_names)
  
@@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
          return token_info.data
  
  
          return token_info.data
  
  
+    def _compute_full_names(self, names):
+        """ Return the set of non-empty make_standard_word() results for the
+            ';'/','-separated values of `names`, plus the text before any '('.
+        """
+        full_names = set()
+        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+            word = self.make_standard_word(name)
+            if word:
+                full_names.add(word)
+
+                brace_split = name.split('(', 2)  # element 0 is the text before the first '('
+                if len(brace_split) > 1:
+                    word = self.make_standard_word(brace_split[0])
+                    if word:
+                        full_names.add(word)
+
+        return full_names
+
+
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
@@ -519,8 +537,6 @@ class _TokenInfo:
          """
          # Start with all partial names
          terms = set((part for ns in names for part in ns.split()))
          """
          # Start with all partial names
          terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
          # Add the full names
          terms.update((' ' + n for n in names))
  
          # Add the full names
          terms.update((' ' + n for n in names))