2 Tests for converting a config file to ICU rules.
5 from textwrap import dedent
7 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
8 from nominatim.errors import UsageError
10 from icu import Transliterator
13 def cfgfile(tmp_path, suffix='.yaml'):
14 def _create_config(suffixes, abbr):
18 - "[[:Nonspacing Mark:] [:Cf:]] >"
20 - "[[:Punctuation:][:Space:]]+ > ' '"
24 - "[[:Punctuation:][:Space:]]+ > ' '"
26 content += "compound_suffixes:\n"
27 content += '\n'.join((" - " + s for s in suffixes)) + '\n'
28 content += "abbreviations:\n"
29 content += '\n'.join((" - " + s for s in abbr)) + '\n'
30 fpath = tmp_path / ('test_config' + suffix)
31 fpath.write_text(dedent(content))
37 def test_empty_rule_file(tmp_path):
38 fpath = tmp_path / ('test_config.yaml')
39 fpath.write_text(dedent("""\
46 rules = ICURuleLoader(fpath)
47 assert rules.get_search_rules() == ''
48 assert rules.get_normalization_rules() == ''
49 assert rules.get_transliteration_rules() == ''
50 assert rules.get_replacement_pairs() == []
# Names of the top-level sections written into the test configuration file.
CONFIG_SECTIONS = (
    'normalization',
    'transliteration',
    'compound_suffixes',
    'abbreviations',
)
55 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
56 def test_missing_normalization(tmp_path, section):
57 fpath = tmp_path / ('test_config.yaml')
58 with fpath.open('w') as fd:
59 for name in CONFIG_SECTIONS:
61 fd.write(name + ':\n')
63 with pytest.raises(UsageError):
66 @pytest.mark.parametrize("abbr", ["simple",
67 "double => arrow => bad",
69 def test_bad_abbreviation_syntax(tmp_path, abbr):
70 fpath = tmp_path / ('test_config.yaml')
71 fpath.write_text(dedent("""\
79 with pytest.raises(UsageError):
80 rules = ICURuleLoader(fpath)
83 def test_get_search_rules(cfgfile):
84 fpath = cfgfile(['strasse', 'straße', 'weg'],
85 ['strasse,straße => str',
88 loader = ICURuleLoader(fpath)
90 rules = loader.get_search_rules()
91 trans = Transliterator.createFromRules("test", rules)
93 assert trans.transliterate(" Baum straße ") == " baum straße "
94 assert trans.transliterate(" Baumstraße ") == " baumstraße "
95 assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
96 assert trans.transliterate(" Baumstr ") == " baumstr "
97 assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
98 assert trans.transliterate(" Αθήνα ") == " athēna "
99 assert trans.transliterate(" проспект ") == " prospekt "
def test_get_normalization_rules(cfgfile):
    """ Check the output of a transliterator built from the normalization
        rules: the Latin word comes back lower-cased, the hyphen becomes
        a space and the Cyrillic word stays in its original script.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    rule_text = ICURuleLoader(config_path).get_normalization_rules()
    normalizer = Transliterator.createFromRules("test", rule_text)

    result = normalizer.transliterate(" проспект-Prospekt ")
    assert result == " проспект prospekt "
def test_get_transliteration_rules(cfgfile):
    """ Check the output of a transliterator built from the transliteration
        rules: the Cyrillic word is converted to Latin script, the hyphen
        becomes a space and the capital letter of the Latin word is kept.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    rule_text = ICURuleLoader(config_path).get_transliteration_rules()
    transliterator = Transliterator.createFromRules("test", rule_text)

    result = transliterator.transliterate(" проспект-Prospekt ")
    assert result == " prospekt Prospekt "
def test_get_replacement_pairs_multi_to(cfgfile):
    """ Check the replacement pairs produced for an abbreviation rule with
        one source term and two target terms, combined with compound
        suffixes.
    """
    config_path = cfgfile(['Pfad', 'Strasse'], ['Strasse => str,st'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    # Both spellings of a source (with and without the leading space)
    # are expected to carry the same sorted list of replacements.
    strasse_variants = [' st ', ' str ', ' strasse ', 'st ', 'str ', 'strasse ']
    pfad_variants = [' pfad ', 'pfad ']
    expected = [
        (' strasse ', strasse_variants),
        ('strasse ', strasse_variants),
        (' pfad ', pfad_variants),
        ('pfad ', pfad_variants),
    ]

    actual = [(source, sorted(targets)) for source, targets in pairs]
    assert actual == expected
def test_get_replacement_pairs_multi_from(cfgfile):
    """ Check the replacement pairs produced for an abbreviation rule with
        two source terms sharing one target term and no compound suffixes.
    """
    config_path = cfgfile([], ['saint,Sainte => st'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    expected = [
        (' sainte ', [' sainte ', ' st ']),
        (' saint ', [' saint ', ' st ']),
    ]

    actual = [(source, sorted(targets)) for source, targets in pairs]
    assert actual == expected
147 def test_get_replacement_pairs_cross_abbreviations(cfgfile):
148 fpath = cfgfile([], ['saint,Sainte => st',
151 repl = ICURuleLoader(fpath).get_replacement_pairs()
153 assert [(a, sorted(b)) for a, b in repl] == \
154 [(' sainte ', [' sainte ', ' st ', ' ste ']),
155 (' saint ', [' saint ', ' st '])]
158 @pytest.mark.parametrize("abbr", ["missing to =>",
161 def test_bad_abbreviation_syntax(tmp_path, abbr):
162 fpath = tmp_path / ('test_config.yaml')
163 fpath.write_text(dedent("""\
171 repl = ICURuleLoader(fpath).get_replacement_pairs()