monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
- suffixes=('gasse', ), abbr=('street => st', )):
+ variants=('~gasse -> gasse', 'street => st', )):
cfgfile = tmp_path / 'analyser_test_config.yaml'
with cfgfile.open('w') as stream:
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
- 'compound_suffixes' : list(suffixes),
- 'abbreviations' : list(abbr)}
+ 'variants' : [ {'words': list(variants)}]}
yaml.dump(cfgstr, stream)
tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert word_table.get_partial_words() == {('te', 1), ('st', 1), ('52', 1),
+ assert word_table.get_partial_words() == {('test', 1),
('no', 1), ('area', 2),
('holz', 1), ('strasse', 1),
('str', 1)}