From: Sarah Hoffmann Date: Fri, 18 Feb 2022 15:43:17 +0000 (+0100) Subject: adapt housenumber cleanup to new word table structure X-Git-Tag: v4.1.0~75^2~3 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4a3bbd031951e0c2ff0cd88072701c313611ee11 adapt housenumber cleanup to new word table structure --- diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 7bc4720e..b20b32d9 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -119,12 +119,12 @@ class LegacyICUTokenizer(AbstractTokenizer): if not conn.table_exists('search_name'): return with conn.cursor(name="hnr_counter") as cur: - cur.execute("""SELECT word_id, word_token FROM word + cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) FROM word WHERE type = 'H' AND NOT EXISTS(SELECT * FROM search_name WHERE ARRAY[word.word_id] && name_vector) - AND (char_length(word_token) > 6 - OR word_token not similar to '\\d+') + AND (char_length(coalesce(word, word_token)) > 6 + OR coalesce(word, word_token) not similar to '\\d+') """) candidates = {token: wid for wid, token in cur} with conn.cursor(name="hnr_counter") as cur: @@ -137,6 +137,7 @@ class LegacyICUTokenizer(AbstractTokenizer): for hnr in row[0].split(';'): candidates.pop(hnr, None) LOG.info("There are %s outdated housenumbers.", len(candidates)) + LOG.debug("Outdated housenumbers: %s", candidates.keys()) if candidates: with conn.cursor() as cur: cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",