2 Tests for converting a config file to ICU rules.
5 from textwrap import dedent
7 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
8 from nominatim.errors import UsageError
10 from icu import Transliterator
# Helper for building a rule configuration file below tmp_path.
# The inner _create_config(suffixes, abbr) appends a "compound_suffixes:"
# list and an "abbreviations:" list (one " - <item>" line per element) to the
# config text and writes the result to tmp_path / ('test_config' + suffix).
# NOTE(review): the start of the config text, the return statements and any
# decorator are not visible in this view - confirm against the full file.
13 def cfgfile(tmp_path, suffix='.yaml'):
14 def _create_config(suffixes, abbr):
18 - "[[:Nonspacing Mark:] [:Cf:]] >"
20 - "[[:Punctuation:][:Space:]]+ > ' '"
24 - "[[:Punctuation:][:Space:]]+ > ' '"
26 content += "compound_suffixes:\n"
27 content += '\n'.join((" - " + s for s in suffixes)) + '\n'
28 content += "abbreviations:\n"
29 content += '\n'.join((" - " + s for s in abbr)) + '\n'
# The file name is 'test_config' plus the requested suffix; the assembled
# text is passed through dedent() before it is written.
30 fpath = tmp_path / ('test_config' + suffix)
31 fpath.write_text(dedent(content))
# Checks the loader output for a rule file written by this test:
# get_search_rules(), get_normalization_rules() and
# get_transliteration_rules() must each return '' and
# get_replacement_pairs() must return [].
# NOTE(review): the text written to test_config.yaml is not visible in this
# view; the test name suggests that every section is left empty - confirm.
37 def test_empty_rule_file(tmp_path):
38 fpath = tmp_path / ('test_config.yaml')
39 fpath.write_text(dedent("""\
46 rules = ICURuleLoader(fpath)
47 assert rules.get_search_rules() == ''
48 assert rules.get_normalization_rules() == ''
49 assert rules.get_transliteration_rules() == ''
50 assert rules.get_replacement_pairs() == []
# Section names used by the tests in this module: each one is written into
# the test configuration file as a "<name>:" line and is also used as a
# parametrization value.
CONFIG_SECTIONS = ('normalization', 'transliteration',
                   'compound_suffixes', 'abbreviations')
# Parametrized over every name in CONFIG_SECTIONS. Writes section headers
# ("<name>:" lines) into tmp_path/test_config.yaml and then enters a
# pytest.raises(UsageError) block.
# NOTE(review): the condition selecting which headers are written and the
# body of the pytest.raises block are not visible in this view; presumably
# the parametrized section is left out and loading the file is expected to
# fail - confirm against the full file.
55 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
56 def test_missing_normalization(tmp_path, section):
57 fpath = tmp_path / ('test_config.yaml')
58 with fpath.open('w') as fd:
59 for name in CONFIG_SECTIONS:
61 fd.write(name + ':\n')
63 with pytest.raises(UsageError):
# Compiles the rules returned by get_search_rules() into an ICU
# transliterator and checks its output on a set of sample names.
# NOTE(review): the end of the cfgfile(...) call is not visible in this view.
67 def test_get_search_rules(cfgfile):
68 fpath = cfgfile(['strasse', 'straße', 'weg'],
69 ['strasse,straße => str',
72 loader = ICURuleLoader(fpath)
74 rules = loader.get_search_rules()
75 trans = Transliterator.createFromRules("test", rules)
# Expected outputs are lowercased, and a trailing 'straße'/'strasse'/'str'
# is separated from the preceding word by a space.
77 assert trans.transliterate(" Baum straße ") == " baum straße "
78 assert trans.transliterate(" Baumstraße ") == " baum straße "
79 assert trans.transliterate(" Baumstrasse ") == " baum strasse "
80 assert trans.transliterate(" Baumstr ") == " baum str "
81 assert trans.transliterate(" Baumwegstr ") == " baumweg str "
# Greek and Cyrillic input is expected to come back in Latin script.
82 assert trans.transliterate(" Αθήνα ") == " athēna "
83 assert trans.transliterate(" проспект ") == " prospekt "
def test_get_normalization_rules(cfgfile):
    """Check the normalization rules produced by the loader.

    Creates a config file through the ``cfgfile`` fixture, compiles the
    result of ``get_normalization_rules()`` into an ICU transliterator and
    asserts on its output for a mixed Cyrillic/Latin sample.
    """
    fpath = cfgfile(['strasse', 'straße', 'weg'],
                    ['strasse,straße => str'])

    loader = ICURuleLoader(fpath)
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    # Expected: the hyphen becomes a space and the Latin word is lowercased,
    # while the Cyrillic word stays in its original script.
    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
def test_get_transliteration_rules(cfgfile):
    """Check the transliteration rules produced by the loader.

    Creates a config file through the ``cfgfile`` fixture, compiles the
    result of ``get_transliteration_rules()`` into an ICU transliterator and
    asserts on its output for a mixed Cyrillic/Latin sample.
    """
    fpath = cfgfile(['strasse', 'straße', 'weg'],
                    ['strasse,straße => str'])

    loader = ICURuleLoader(fpath)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    # Expected: the Cyrillic word is converted to Latin script and the hyphen
    # becomes a space, while the case of the Latin word is kept.
    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
# Checks get_replacement_pairs() for a config created with the lists
# ['Weg', 'Strasse'] and ['Strasse => str,st'].
# NOTE(review): part of the expected list may not be visible in this view -
# confirm against the full file.
108 def test_get_synonym_pairs(cfgfile):
109 fpath = cfgfile(['Weg', 'Strasse'],
110 ['Strasse => str,st'])
112 loader = ICURuleLoader(fpath)
114 repl = loader.get_replacement_pairs()
# The comparison ignores ordering: the list of pairs and the replacement list
# inside each pair are both sorted before they are compared.
116 assert sorted(((a, sorted(b)) for a, b in repl)) == \
117 sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
118 ('strasse ', [' st ', ' str ', ' strasse ']),
120 ('str ' , [' str ']),
121 ('weg ', [' weg '])])