From: Sarah Hoffmann Date: Thu, 20 Mar 2025 19:01:46 +0000 (+0100) Subject: do not save word counts of 1 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/1705bb5f5758d6e5aa41a676a3b73902150e22f5?ds=inline;hp=--cc do not save word counts of 1 This is the default setting, which will be assumed when the count is missing. --- 1705bb5f5758d6e5aa41a676a3b73902150e22f5 diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index 3da1171f..2b17d611 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -121,10 +121,10 @@ class ICUTokenizer(AbstractTokenizer): SELECT unnest(nameaddress_vector) as id, count(*) FROM search_name GROUP BY id) SELECT coalesce(a.id, w.id) as id, - (CASE WHEN w.count is null THEN '{}'::JSONB + (CASE WHEN w.count is null or w.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('count', w.count) END || - CASE WHEN a.count is null THEN '{}'::JSONB + CASE WHEN a.count is null or a.count <= 1 THEN '{}'::JSONB ELSE jsonb_build_object('addr_count', a.count) END) as info FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; """) @@ -134,9 +134,10 @@ class ICUTokenizer(AbstractTokenizer): drop_tables(conn, 'tmp_word') cur.execute("""CREATE TABLE tmp_word AS SELECT word_id, word_token, type, word, - (CASE WHEN wf.info is null THEN word.info - ELSE coalesce(word.info, '{}'::jsonb) || wf.info - END) as info + coalesce(word.info, '{}'::jsonb) + - 'count' - 'addr_count' || + coalesce(wf.info, '{}'::jsonb) + as info FROM word LEFT JOIN word_frequencies wf ON word.word_id = wf.id """)