X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/18c99a5c5f55636175a0b1baa2a8e3d426b0937c..d7f9d2bde90ccc12507598d87cc7fe57e225f5ea:/nominatim/tokenizer/legacy_icu_tokenizer.py?ds=sidebyside diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 6fba6c8d..7205ddef 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -35,7 +35,7 @@ def create(dsn, data_dir): class LegacyICUTokenizer: """ This tokenizer uses libICU to covert names and queries to ASCII. Otherwise it uses the same algorithms and data structures as the - normalization routines in Nominatm 3. + normalization routines in Nominatim 3. """ def __init__(self, dsn, data_dir): @@ -127,7 +127,7 @@ class LegacyICUTokenizer: Analyzers are not thread-safe. You need to instantiate one per thread. """ norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("normalizer", self.transliteration) + trans = Transliterator.createFromRules("trans", self.transliteration) return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) @@ -228,15 +228,54 @@ class LegacyICUNameAnalyzer: self.conn = None + def get_word_token_info(self, conn, words): + """ Return token information for the given list of words. + If a word starts with # it is assumed to be a full name + otherwise is a partial name. + + The function returns a list of tuples with + (original word, word token, word id). + + The function is used for testing and debugging only + and not necessarily efficient. + """ + tokens = {} + for word in words: + if word.startswith('#'): + tokens[word] = ' ' + self.make_standard_word(word[1:]) + else: + tokens[word] = self.make_standard_word(word) + + with conn.cursor() as cur: + cur.execute("""SELECT word_token, word_id + FROM word, (SELECT unnest(%s::TEXT[]) as term) t + WHERE word_token = t.term + and class is null and country_code is null""", + (list(tokens.values()), )) + ids = {r[0]: r[1] for r in cur} + + return [(k, v, ids[v]) for k, v in tokens.items()] + + def normalize(self, phrase): """ Normalize the given phrase, i.e. remove all properties that are irrelevant for search. """ return self.normalizer.transliterate(phrase) + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + @functools.lru_cache(maxsize=1024) def make_standard_word(self, name): - """ Create the normalised version of the name. + """ Create the normalised version of the input. """ norm = ' ' + self.transliterator.transliterate(name) + ' ' for full, abbr in self.abbreviations: @@ -256,25 +295,44 @@ class LegacyICUNameAnalyzer: return self.transliterator.transliterate(hnr) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ + to_delete = [] copystr = io.StringIO() with self.conn.cursor() as cur: - cur.execute("SELECT distinct(postcode) FROM location_postcode") - for (postcode, ) in cur: - copystr.write(postcode) - copystr.write('\t ') - copystr.write(self.transliterator.transliterate(postcode)) - copystr.write('\tplace\tpostcode\t0\n') - - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) - # Don't really need an ID for postcodes.... - # cur.execute("""UPDATE word SET word_id = nextval('seq_word') - # WHERE word_id is null and type = 'postcode'""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + copystr.write(postcode) + copystr.write('\t ') + copystr.write(self.transliterator.transliterate(postcode)) + copystr.write('\tplace\tpostcode\t0\n') + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + + if copystr.getvalue(): + copystr.seek(0) + cur.copy_from(copystr, 'word', + columns=['word', 'word_token', 'class', 'type', + 'search_name_count']) def update_special_phrases(self, phrases): @@ -333,24 +391,25 @@ class LegacyICUNameAnalyzer: """ full_names = set((self.make_standard_word(n) for n in names)) full_names.discard('') - self._add_normalised_country_names(country_code, full_names) + self._add_normalized_country_names(country_code, full_names) - def _add_normalised_country_names(self, country_code, names): + def _add_normalized_country_names(self, country_code, names): """ Add names for the given country to the search index. """ + word_tokens = set((' ' + name for name in names)) with self.conn.cursor() as cur: # Get existing names cur.execute("SELECT word_token FROM word WHERE country_code = %s", (country_code, )) - new_names = names.difference((t[0] for t in cur)) + word_tokens.difference_update((t[0] for t in cur)) - if new_names: + if word_tokens: cur.execute("""INSERT INTO word (word_id, word_token, country_code, search_name_count) (SELECT nextval('seq_word'), token, '{}', 0 FROM unnest(%s) as token) - """.format(country_code), (list(new_names),)) + """.format(country_code), (list(word_tokens),)) def process_place(self, place): @@ -371,7 +430,7 @@ class LegacyICUNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self._add_normalised_country_names(country_feature.lower(), + self._add_normalized_country_names(country_feature.lower(), full_names) address = place.get('address') @@ -405,22 +464,25 @@ class LegacyICUNameAnalyzer: def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes: - term = self.make_standard_word(postcode) - if not term: - return - - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) - self._cache.postcodes.add(postcode) + if re.search(r'[:,;]', postcode) is None: + postcode = self.normalize_postcode(postcode) + + if postcode not in self._cache.postcodes: + term = self.make_standard_word(postcode) + if not term: + return + + with self.conn.cursor() as cur: + # no word_id needed for postcodes + cur.execute("""INSERT INTO word (word, word_token, class, type, + search_name_count) + (SELECT pc, %s, 'place', 'postcode', 0 + FROM (VALUES (%s)) as v(pc) + WHERE NOT EXISTS + (SELECT * FROM word + WHERE word = pc and class='place' and type='postcode')) + """, (' ' + term, postcode)) + self._cache.postcodes.add(postcode) @staticmethod def _split_housenumbers(hnrs):