2 Tests for converting a config file to ICU rules.
5 from textwrap import dedent
7 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
8 from nominatim.errors import UsageError
10 from icu import Transliterator
13 def cfgfile(tmp_path, suffix='.yaml'):
14 def _create_config(suffixes, abbr):
18 - "[[:Nonspacing Mark:] [:Cf:]] >"
20 - "[[:Punctuation:][:Space:]]+ > ' '"
24 - "[[:Punctuation:][:Space:]]+ > ' '"
26 content += "compound_suffixes:\n"
27 content += '\n'.join((" - " + s for s in suffixes)) + '\n'
28 content += "abbreviations:\n"
29 content += '\n'.join((" - " + s for s in abbr)) + '\n'
30 fpath = tmp_path / ('test_config' + suffix)
31 fpath.write_text(dedent(content))
37 def test_empty_rule_file(tmp_path):
38 fpath = tmp_path / ('test_config.yaml')
39 fpath.write_text(dedent("""\
46 rules = ICURuleLoader(fpath)
47 assert rules.get_search_rules() == ''
48 assert rules.get_normalization_rules() == ''
49 assert rules.get_transliteration_rules() == ''
50 assert rules.get_replacement_pairs() == []
# Top-level section names written into the test rule file; the
# missing-section test is parametrized over each of them in turn.
CONFIG_SECTIONS = (
    'normalization',
    'transliteration',
    'compound_suffixes',
    'abbreviations',
)
55 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
56 def test_missing_normalization(tmp_path, section):
57 fpath = tmp_path / ('test_config.yaml')
58 with fpath.open('w') as fd:
59 for name in CONFIG_SECTIONS:
61 fd.write(name + ':\n')
63 with pytest.raises(UsageError):
66 @pytest.mark.parametrize("abbr", ["simple",
67 "double => arrow => bad",
69 def test_bad_abbreviation_syntax(tmp_path, abbr):
70 fpath = tmp_path / ('test_config.yaml')
71 fpath.write_text(dedent("""\
79 with pytest.raises(UsageError):
80 rules = ICURuleLoader(fpath)
83 def test_get_search_rules(cfgfile):
84 fpath = cfgfile(['strasse', 'straße', 'weg'],
85 ['strasse,straße => str',
88 loader = ICURuleLoader(fpath)
90 rules = loader.get_search_rules()
91 trans = Transliterator.createFromRules("test", rules)
93 assert trans.transliterate(" Baum straße ") == " baum straße "
94 assert trans.transliterate(" Baumstraße ") == " baum straße "
95 assert trans.transliterate(" Baumstrasse ") == " baum strasse "
96 assert trans.transliterate(" Baumstr ") == " baum str "
97 assert trans.transliterate(" Baumwegstr ") == " baumweg str "
98 assert trans.transliterate(" Αθήνα ") == " athēna "
99 assert trans.transliterate(" проспект ") == " prospekt "
def test_get_normalization_rules(cfgfile):
    """Check the normalization rules produced from a generated config file.

    The rules are compiled into an ICU transliterator and applied to a
    mixed Cyrillic/Latin sample: the Cyrillic part must stay as it is,
    the Latin part must be lower-cased and the hyphen must become a space.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    normalization = ICURuleLoader(config_path).get_normalization_rules()
    normalizer = Transliterator.createFromRules("test", normalization)

    normalized = normalizer.transliterate(" проспект-Prospekt ")
    assert normalized == " проспект prospekt "
def test_get_transliteration_rules(cfgfile):
    """Check the transliteration rules produced from a generated config file.

    The rules are compiled into an ICU transliterator and applied to a
    mixed Cyrillic/Latin sample: the Cyrillic part must come out in Latin
    script, the case of the Latin part must be kept and the hyphen must
    become a space.
    """
    suffixes = ['strasse', 'straße', 'weg']
    abbreviations = ['strasse,straße => str']
    config_path = cfgfile(suffixes, abbreviations)

    transliteration = ICURuleLoader(config_path).get_transliteration_rules()
    transliterator = Transliterator.createFromRules("test", transliteration)

    converted = transliterator.transliterate(" проспект-Prospekt ")
    assert converted == " prospekt Prospekt "
124 def test_get_replacement_pairs_multi_to(cfgfile):
125 fpath = cfgfile(['Pfad', 'Strasse'],
126 ['Strasse => str,st'])
128 repl = ICURuleLoader(fpath).get_replacement_pairs()
130 assert [(a, sorted(b)) for a, b in repl] == \
131 [(' strasse ', [' st ', ' str ', ' strasse ']),
132 ('strasse ', [' st ', ' str ', ' strasse ']),
133 ('pfad ', [' pfad ']),
134 ('str ' , [' str ']),
def test_get_replacement_pairs_multi_from(cfgfile):
    """Check replacement pairs for an abbreviation with several source terms.

    With no compound suffixes and the single rule 'saint,Sainte => st',
    every source term must map to itself and to the abbreviation.
    """
    config_path = cfgfile([], ['saint,Sainte => st'])

    pairs = ICURuleLoader(config_path).get_replacement_pairs()

    expected = [
        (' sainte ', [' sainte ', ' st ']),
        (' saint ', [' saint ', ' st ']),
    ]
    # Sort each group of replacements so the comparison does not depend on
    # the order in which the loader yields them.
    actual = [(source, sorted(targets)) for source, targets in pairs]
    assert actual == expected
148 def test_get_replacement_pairs_cross_abbreviations(cfgfile):
149 fpath = cfgfile([], ['saint,Sainte => st',
152 repl = ICURuleLoader(fpath).get_replacement_pairs()
154 assert [(a, sorted(b)) for a, b in repl] == \
155 [(' sainte ', [' sainte ', ' st ', ' ste ']),
156 (' saint ', [' saint ', ' st '])]
159 @pytest.mark.parametrize("abbr", ["missing to =>",
162 def test_bad_abbreviation_syntax(tmp_path, abbr):
163 fpath = tmp_path / ('test_config.yaml')
164 fpath.write_text(dedent("""\
172 repl = ICURuleLoader(fpath).get_replacement_pairs()