From: Sarah Hoffmann Date: Sat, 26 Jun 2021 09:57:09 +0000 (+0200) Subject: only consider partials in multi-words for initial count X-Git-Tag: v4.0.0~58^2~7 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/b9fbfeff67b420905a4176f4f5e9312746d0c42e?ds=sidebyside only consider partials in multi-words for initial count This ensures that it is less likely that we exclude meaningful words like 'hauptstrasse' just because they are frequent. --- diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 5f83b73d..6bf409cc 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -168,7 +168,8 @@ class LegacyICUTokenizer: for name, cnt in cur: terms = set() for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)): - terms.update(word.split()) + if ' ' in word: + terms.update(word.split()) for term in terms: words[term] += cnt diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py index 56c08e5a..39fc9fb4 100644 --- a/test/python/test_tokenizer_legacy_icu.py +++ b/test/python/test_tokenizer_legacy_icu.py @@ -150,9 +150,8 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table): tok = tokenizer_factory() tok.init_new_db(test_config) - assert word_table.get_partial_words() == {('test', 1), ('52', 1), + assert word_table.get_partial_words() == {('test', 1), ('no', 1), ('area', 2), - ('holzstrasse', 1), ('holzstr', 1), ('holz', 1), ('strasse', 1), ('str', 1)}