only consider partials in multi-words for initial count

[nominatim.git] / test / python / test_tokenizer_legacy_icu.py
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py

index b86925ee5dd22b134bd66b7419db8b7c7b45f9d1..39fc9fb4c5a7f348c29ffe8c3b490caf458063f4 100644 (file)
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -60,13 +60,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
      monkeypatch.undo()
  
      def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     suffixes=('gasse', ), abbr=('street => st', )):
+                     variants=('~gasse -> gasse', 'street => st', )):
          cfgfile = tmp_path / 'analyser_test_config.yaml'
          with cfgfile.open('w') as stream:
              cfgstr = {'normalization' : list(norm),
                         'transliteration' : list(trans),
-                       'compound_suffixes' : list(suffixes),
-                       'abbreviations' : list(abbr)}
+                       'variants' : [ {'words': list(variants)}]}
              yaml.dump(cfgstr, stream)
          tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))
  
@@ -151,9 +150,8 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
      tok = tokenizer_factory()
      tok.init_new_db(test_config)
  
-    assert word_table.get_partial_words() == {('test', 1), ('52', 1),
+    assert word_table.get_partial_words() == {('test', 1),
                                                ('no', 1), ('area', 2),
-                                              ('holzstrasse', 1), ('holzstr', 1),
                                                ('holz', 1), ('strasse', 1),
                                                ('str', 1)}