X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/63e35574d4998ea64c1b26b1680d14ff4fd14036..87907916ff06741bd4627101d30e6e11b8ea1a1a:/nominatim/tokenizer/legacy_icu_tokenizer.py?ds=sidebyside diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 7205ddef..2bd22c72 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -335,7 +335,7 @@ class LegacyICUNameAnalyzer: 'search_name_count']) - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. """ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@ -374,7 +374,7 @@ class LegacyICUNameAnalyzer: columns=['word', 'word_token', 'class', 'type', 'operator', 'search_name_count']) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer: names = place.get('name') if names: - full_names = set((self.make_standard_word(name) for name in names.values())) - full_names.discard('') + full_names = self._compute_full_names(names) token_info.add_names(self.conn, full_names) @@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer: return token_info.data + def _compute_full_names(self, names): + """ Return the set of all full name word ids to be used with the + given dictionary of names. + """ + full_names = set() + for name in (n for ns in names.values() for n in re.split('[;,]', ns)): + word = self.make_standard_word(name) + if word: + full_names.add(word) + + brace_split = name.split('(', 2) + if len(brace_split) > 1: + word = self.make_standard_word(brace_split[0]) + if word: + full_names.add(word) + + return full_names + + def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ @@ -519,8 +537,6 @@ class _TokenInfo: """ # Start with all partial names terms = set((part for ns in names for part in ns.split())) - # Add partials for the full terms (TO BE REMOVED) - terms.update((n for n in names)) # Add the full names terms.update((' ' + n for n in names))