From a3e4e8e5cdffb0056bccb79e11690ed01c9aa5ea Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 22 Feb 2022 20:15:04 +0100 Subject: [PATCH] delete unused country name tokens --- nominatim/tokenizer/icu_tokenizer.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index f5addd3e..98a1daed 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -415,18 +415,24 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): cur.execute("""SELECT word_token FROM word WHERE type = 'C' and word = %s""", (country_code, )) - word_tokens.difference_update((t[0] for t in cur)) + existing_tokens = {t[0] for t in cur} # Only add those names that are not yet in the list. - if word_tokens: + new_tokens = word_tokens - existing_tokens + if new_tokens: cur.execute("""INSERT INTO word (word_token, type, word) (SELECT token, 'C', %s FROM unnest(%s) as token) - """, (country_code, list(word_tokens))) - - # No names are deleted at the moment. - # If deletion is made possible, then the static names from the - # initial 'country_name' table should be kept. + """, (country_code, list(new_tokens))) + + # Delete names that no longer exist. + gone_tokens = existing_tokens - word_tokens + if gone_tokens: + cur.execute("""DELETE FROM word + USING unnest(%s) as token + WHERE type = 'C' and word = %s + and word_token = token""", + (list(gone_tokens), country_code)) def process_place(self, place): -- 2.39.5