This makes it less likely that we exclude meaningful words
like 'hauptstrasse' just because they are frequent.
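
To illustrate the effect, here is a rough self-contained sketch; the helper function and the example names and counts are invented, while the real loop reads names and counts from the database and expands them with name_proc.get_variants_ascii:

    from collections import Counter

    # Made-up helper mirroring the counting loop in the diff below.
    def count_partial_terms(rows):
        words = Counter()
        for variants, cnt in rows:
            terms = set()
            for word in variants:
                if ' ' in word:              # only multi-word variants contribute
                    terms.update(word.split())
            for term in terms:
                words[term] += cnt
        return words

    # A street that is simply named 'hauptstrasse' no longer inflates the count;
    # only occurrences inside longer names are counted.
    words = count_partial_terms([(['hauptstrasse', 'hauptstr'], 1000),
                                 (['lange hauptstrasse', 'lange hauptstr'], 1)])
    assert words['hauptstrasse'] == 1
    assert words['hauptstr'] == 1    # counted via 'lange hauptstr' only

With the guard, the 1000 single-word 'hauptstrasse' entries contribute nothing, so the term is much less likely to cross a frequency cut-off.
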
for name, cnt in cur:
    terms = set()
    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-       terms.update(word.split())
+       if ' ' in word:
+           terms.update(word.split())
    for term in terms:
        words[term] += cnt
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert word_table.get_partial_words() == {('test', 1), ('52', 1),
-                                            ('holzstrasse', 1), ('holzstr', 1),
+ assert word_table.get_partial_words() == {('test', 1),
                                             ('holz', 1), ('strasse', 1),
                                             ('str', 1)}
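
For the test data this means '52', 'holzstrasse' and 'holzstr' drop out of the expected partial words: they presumably occur only as single-word variants, which are no longer counted, while 'holz', 'strasse' and 'str' still come from the spaced variants.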