X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/206ee8718864d623507a0ae69070478dec411e84..a3e4e8e5cdffb0056bccb79e11690ed01c9aa5ea:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index cfbb44e3..98a1daed 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -112,6 +112,47 @@ class LegacyICUTokenizer(AbstractTokenizer): conn.commit() + def _cleanup_housenumbers(self): + """ Remove unused house numbers. + """ + with connect(self.dsn) as conn: + if not conn.table_exists('search_name'): + return + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT word_id, word_token FROM word + WHERE type = 'H' + AND NOT EXISTS(SELECT * FROM search_name + WHERE ARRAY[word.word_id] && name_vector) + AND (char_length(word_token) > 6 + OR word_token not similar to '\\d+') + """) + candidates = {token: wid for wid, token in cur} + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT housenumber FROM placex + WHERE housenumber is not null + AND (char_length(housenumber) > 6 + OR housenumber not similar to '\\d+') + """) + for row in cur: + for hnr in row[0].split(';'): + candidates.pop(hnr, None) + LOG.info("There are %s outdated housenumbers.", len(candidates)) + if candidates: + with conn.cursor() as cur: + cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", + (list(candidates.values()), )) + conn.commit() + + + + def update_word_tokens(self): + """ Remove unused tokens. + """ + LOG.warning("Cleaning up housenumber tokens.") + self._cleanup_housenumbers() + LOG.warning("Tokenizer house-keeping done.") + + def name_analyzer(self): """ Create a new analyzer for tokenizing names and queries using this tokinzer. Analyzers are context managers and should @@ -374,18 +415,24 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): cur.execute("""SELECT word_token FROM word WHERE type = 'C' and word = %s""", (country_code, )) - word_tokens.difference_update((t[0] for t in cur)) + existing_tokens = {t[0] for t in cur} # Only add those names that are not yet in the list. - if word_tokens: + new_tokens = word_tokens - existing_tokens + if new_tokens: cur.execute("""INSERT INTO word (word_token, type, word) (SELECT token, 'C', %s FROM unnest(%s) as token) - """, (country_code, list(word_tokens))) - - # No names are deleted at the moment. - # If deletion is made possible, then the static names from the - # initial 'country_name' table should be kept. + """, (country_code, list(new_tokens))) + + # Delete names that no longer exist. + gone_tokens = existing_tokens - word_tokens + if gone_tokens: + cur.execute("""DELETE FROM word + USING unnest(%s) as token + WHERE type = 'C' and word = %s + and word_token = token""", + (list(gone_tokens), country_code)) def process_place(self, place):