X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a263e54b9463c89addeb0ac613d0586378de22f2..87907916ff06741bd4627101d30e6e11b8ea1a1a:/nominatim/tokenizer/legacy_icu_tokenizer.py diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 696004f5..2bd22c72 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -127,7 +127,7 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. """ norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("normalizer", self.transliteration) + trans = Transliterator.createFromRules("trans", self.transliteration) return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) @@ -263,6 +263,16 @@ class LegacyICUNameAnalyzer: """ return self.normalizer.transliterate(phrase) + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + @functools.lru_cache(maxsize=1024) def make_standard_word(self, name): """ Create the normalised version of the input. @@ -285,28 +295,47 @@ class LegacyICUNameAnalyzer: return self.transliterator.transliterate(hnr) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ + to_delete = [] copystr = io.StringIO() with self.conn.cursor() as cur: - cur.execute("SELECT distinct(postcode) FROM location_postcode") - for (postcode, ) in cur: - copystr.write(postcode) - copystr.write('\t ') - copystr.write(self.transliterator.transliterate(postcode)) - copystr.write('\tplace\tpostcode\t0\n') - - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) - # Don't really need an ID for postcodes.... - # cur.execute("""UPDATE word SET word_id = nextval('seq_word') - # WHERE word_id is null and type = 'postcode'""") - - - def update_special_phrases(self, phrases): + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + copystr.write(postcode) + copystr.write('\t ') + copystr.write(self.transliterator.transliterate(postcode)) + copystr.write('\tplace\tpostcode\t0\n') + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + + if copystr.getvalue(): + copystr.seek(0) + cur.copy_from(copystr, 'word', + columns=['word', 'word_token', 'class', 'type', + 'search_name_count']) + + + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. """ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@ -345,7 +374,7 @@ class LegacyICUNameAnalyzer: columns=['word', 'word_token', 'class', 'type', 'operator', 'search_name_count']) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@ -394,8 +423,7 @@ class LegacyICUNameAnalyzer: names = place.get('name') if names: - full_names = set((self.make_standard_word(name) for name in names.values())) - full_names.discard('') + full_names = self._compute_full_names(names) token_info.add_names(self.conn, full_names) @@ -432,25 +460,47 @@ class LegacyICUNameAnalyzer: return token_info.data + def _compute_full_names(self, names): + """ Return the set of all full name word ids to be used with the + given dictionary of names. + """ + full_names = set() + for name in (n for ns in names.values() for n in re.split('[;,]', ns)): + word = self.make_standard_word(name) + if word: + full_names.add(word) + + brace_split = name.split('(', 2) + if len(brace_split) > 1: + word = self.make_standard_word(brace_split[0]) + if word: + full_names.add(word) + + return full_names + + def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes: - term = self.make_standard_word(postcode) - if not term: - return - - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) - self._cache.postcodes.add(postcode) + if re.search(r'[:,;]', postcode) is None: + postcode = self.normalize_postcode(postcode) + + if postcode not in self._cache.postcodes: + term = self.make_standard_word(postcode) + if not term: + return + + with self.conn.cursor() as cur: + # no word_id needed for postcodes + cur.execute("""INSERT INTO word (word, word_token, class, type, + search_name_count) + (SELECT pc, %s, 'place', 'postcode', 0 + FROM (VALUES (%s)) as v(pc) + WHERE NOT EXISTS + (SELECT * FROM word + WHERE word = pc and class='place' and type='postcode')) + """, (' ' + term, postcode)) + self._cache.postcodes.add(postcode) @staticmethod def _split_housenumbers(hnrs): @@ -487,8 +537,6 @@ class _TokenInfo: """ # Start with all partial names terms = set((part for ns in names for part in ns.split())) - # Add partials for the full terms (TO BE REMOVED) - terms.update((n for n in names)) # Add the full names terms.update((' ' + n for n in names))