2 Tests for converting a config file to ICU rules.
5 from textwrap import dedent
7 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
8 from nominatim.errors import UsageError
10 from icu import Transliterator
# Config-file factory that the tests below receive as their `cfgfile`
# argument. The inner helper appends a "compound_suffixes:" list and an
# "abbreviations:" list (one " - " entry per given item) to `content`, then
# writes the dedented text to tmp_path / ('test_config' + suffix).
# NOTE(review): several lines of this definition are not visible in this
# view (how `content` starts, parts of the rule text, and what the two
# functions return) - confirm against the full file before relying on this.
13 def cfgfile(tmp_path, suffix='.yaml'):
14 def _create_config(suffixes, abbr):
18 - "[[:Nonspacing Mark:] [:Cf:]] >"
20 - "[[:Punctuation:][:Space:]]+ > ' '"
24 - "[[:Punctuation:][:Space:]]+ > ' '"
26 content += "compound_suffixes:\n"
27 content += '\n'.join((" - " + s for s in suffixes)) + '\n'
28 content += "abbreviations:\n"
29 content += '\n'.join((" - " + s for s in abbr)) + '\n'
30 fpath = tmp_path / ('test_config' + suffix)
31 fpath.write_text(dedent(content))
# Writes a config file, loads it with ICURuleLoader and checks that the
# search, normalization and transliteration rules are all the empty string
# and that there are no replacement pairs.
# NOTE(review): the text written to the file is not visible in this view;
# presumably it contains only empty sections - confirm.
37 def test_empty_rule_file(tmp_path):
38 fpath = tmp_path / ('test_config.yaml')
39 fpath.write_text(dedent("""\
46 rules = ICURuleLoader(fpath)
47 assert rules.get_search_rules() == ''
48 assert rules.get_normalization_rules() == ''
49 assert rules.get_transliteration_rules() == ''
50 assert rules.get_replacement_pairs() == []
# Section names used both as the parametrization values and as the section
# headers written by the missing-section test.
CONFIG_SECTIONS = (
    'normalization',
    'transliteration',
    'compound_suffixes',
    'abbreviations',
)
# Runs once per entry of CONFIG_SECTIONS: writes section headers
# ("<name>:") to a config file and expects a UsageError.
# NOTE(review): the condition deciding which headers get written and the
# statement inside `pytest.raises` are not visible in this view. Presumably
# the parametrized `section` is left out and ICURuleLoader(fpath) is
# constructed - confirm against the full file.
55 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
56 def test_missing_normalization(tmp_path, section):
57 fpath = tmp_path / ('test_config.yaml')
58 with fpath.open('w') as fd:
59 for name in CONFIG_SECTIONS:
61 fd.write(name + ':\n')
63 with pytest.raises(UsageError):
# Expects constructing ICURuleLoader to raise UsageError for each
# parametrized `abbr` value.
# NOTE(review): the config text is not visible in this view; presumably
# `abbr` is formatted into its abbreviations section - confirm.
# NOTE(review): a second function named test_bad_abbreviation_syntax is
# defined further down in this file. At module level the later definition
# replaces this one, so pytest would most likely never collect this test.
# Consider giving one of the two a distinct name.
66 @pytest.mark.parametrize("abbr", ["simple",
67 "double => arrow => bad",
69 def test_bad_abbreviation_syntax(tmp_path, abbr):
70 fpath = tmp_path / ('test_config.yaml')
71 fpath.write_text(dedent("""\
79 with pytest.raises(UsageError):
80 rules = ICURuleLoader(fpath)
# Builds an ICU Transliterator from the loader's search rules and checks its
# output on German, Greek and Cyrillic samples: per the assertions the text
# is lowercased and non-Latin scripts are romanized, while 'straße',
# 'strasse' and 'str' come out unabbreviated and unexpanded.
# NOTE(review): the end of the abbreviation list passed to cfgfile is not
# visible in this view.
83 def test_get_search_rules(cfgfile):
84 fpath = cfgfile(['strasse', 'straße', 'weg'],
85 ['strasse,straße => str',
88 loader = ICURuleLoader(fpath)
90 rules = loader.get_search_rules()
91 trans = Transliterator.createFromRules("test", rules)
93 assert trans.transliterate(" Baum straße ") == " baum straße "
94 assert trans.transliterate(" Baumstraße ") == " baumstraße "
95 assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
96 assert trans.transliterate(" Baumstr ") == " baumstr "
97 assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
98 assert trans.transliterate(" Αθήνα ") == " athēna "
99 assert trans.transliterate(" проспект ") == " prospekt "
def test_get_normalization_rules(cfgfile):
    """Check the normalization rules on a mixed Cyrillic/Latin sample.

    The expected output keeps the Cyrillic word as is, lowercases the Latin
    word and replaces the hyphen between them with a space.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    normalizer = Transliterator.createFromRules(
        "test", ICURuleLoader(config_path).get_normalization_rules())

    result = normalizer.transliterate(" проспект-Prospekt ")
    assert result == " проспект prospekt "
def test_get_transliteration_rules(cfgfile):
    """Check the transliteration rules on a mixed Cyrillic/Latin sample.

    The expected output romanizes the Cyrillic word, keeps the case of the
    Latin word and replaces the hyphen between them with a space.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    transliterator = Transliterator.createFromRules(
        "test", ICURuleLoader(config_path).get_transliteration_rules())

    result = transliterator.transliterate(" проспект-Prospekt ")
    assert result == " prospekt Prospekt "
# Checks transliteration rules when the main config pulls in a second file
# via `!include transliteration.yaml`; that second file holds the single
# rule "x > y".
# NOTE(review): most of the main config text is not visible in this view.
# The expected result ' byt ' cannot come from "x > y" alone, so it depends
# on rules defined in the missing lines - confirm against the full file.
124 def test_transliteration_rules_from_file(tmp_path):
125 cfgpath = tmp_path / ('test_config.yaml')
126 cfgpath.write_text(dedent("""\
130 - !include transliteration.yaml
134 transpath = tmp_path / ('transliteration.yaml')
135 transpath.write_text('- "x > y"')
137 loader = ICURuleLoader(cfgpath)
138 rules = loader.get_transliteration_rules()
139 trans = Transliterator.createFromRules("test", rules)
141 assert trans.transliterate(" axxt ") == " byt "
def test_get_replacement_pairs_multi_to(cfgfile):
    """Check replacement pairs for an abbreviation with two targets.

    The replacement lists are sorted before comparison, so only the order of
    the pairs themselves is checked, not the order inside each list.
    """
    config_path = cfgfile(['Pfad', 'Strasse'], ['Strasse => str,st'])
    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    strasse_variants = [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']
    pfad_variants = [' pfad ', 'pfad ']
    expected = [
        (' strasse ', strasse_variants),
        ('strasse ', strasse_variants),
        (' pfad ', pfad_variants),
        ('pfad ', pfad_variants),
    ]

    observed = [(source, sorted(targets)) for source, targets in pairs]
    assert observed == expected
def test_get_replacement_pairs_multi_from(cfgfile):
    """Check replacement pairs for an abbreviation with two source terms.

    The replacement lists are sorted before comparison, so only the order of
    the pairs themselves is checked, not the order inside each list.
    """
    config_path = cfgfile([], ['saint,Sainte => st'])
    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    observed = [(source, sorted(targets)) for source, targets in pairs]
    expected = [
        (' sainte ', [' sainte ', ' st ']),
        (' saint ', [' saint ', ' st ']),
    ]
    assert observed == expected
# Checks the replacement pairs produced when more than one abbreviation
# entry is configured; replacement lists are sorted before comparison.
# NOTE(review): the end of the abbreviation list is not visible in this
# view; the expected ' ste ' variant for ' sainte ' presumably comes from
# the missing entry - confirm against the full file.
167 def test_get_replacement_pairs_cross_abbreviations(cfgfile):
168 fpath = cfgfile([], ['saint,Sainte => st',
171 repl = ICURuleLoader(fpath).get_replacement_pairs()
173 assert [(a, sorted(b)) for a, b in repl] == \
174 [(' sainte ', [' sainte ', ' st ', ' ste ']),
175 (' saint ', [' saint ', ' st '])]
# Loads a config built from the parametrized `abbr` value and fetches the
# replacement pairs from the loader.
# NOTE(review): the config text, the remaining parametrize values and any
# assertion on `repl` are not visible in this view.
# NOTE(review): this function has the same name as an earlier test in this
# file (the one expecting UsageError). This later definition replaces the
# earlier one at module level, so the earlier test would most likely not
# run. Consider giving this test a distinct name.
178 @pytest.mark.parametrize("abbr", ["missing to =>",
181 def test_bad_abbreviation_syntax(tmp_path, abbr):
182 fpath = tmp_path / ('test_config.yaml')
183 fpath.write_text(dedent("""\
191 repl = ICURuleLoader(fpath).get_replacement_pairs()