This makes it less likely that we exclude meaningful words
like 'hauptstrasse' just because they are frequent.
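
To illustrate the effect, here is a rough self-contained sketch; the helper function and the example names and counts are invented, while the real loop reads names and counts from the database and expands them with name_proc.get_variants_ascii:

    from collections import Counter

    # Made-up helper mirroring the counting loop in the diff below.
    def count_partial_terms(rows):
        words = Counter()
        for variants, cnt in rows:
            terms = set()
            for word in variants:
                if ' ' in word:              # only multi-word variants contribute
                    terms.update(word.split())
            for term in terms:
                words[term] += cnt
        return words

    # A street that is simply named 'hauptstrasse' no longer inflates the count;
    # only occurrences inside longer names are counted.
    words = count_partial_terms([(['hauptstrasse', 'hauptstr'], 1000),
                                 (['lange hauptstrasse', 'lange hauptstr'], 1)])
    assert words['hauptstrasse'] == 1
    assert words['hauptstr'] == 1    # counted via 'lange hauptstr' only

With the guard, the 1000 single-word 'hauptstrasse' entries contribute nothing, so the term is much less likely to cross a frequency cut-off.
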
for name, cnt in cur:
    terms = set()
    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-       terms.update(word.split())
+       if ' ' in word:
+           terms.update(word.split())
    for term in terms:
        words[term] += cnt
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert word_table.get_partial_words() == {('test', 1), ('52', 1),
-                                            ('holzstrasse', 1), ('holzstr', 1),
+ assert word_table.get_partial_words() == {('test', 1),
                                             ('holz', 1), ('strasse', 1),
                                             ('str', 1)}
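
For the test data this means '52', 'holzstrasse' and 'holzstr' drop out of the expected partial words: they presumably occur only as single-word variants, which are no longer counted, while 'holz', 'strasse' and 'str' still come from the spaced variants.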