complete tests for rule loader

[nominatim.git] / test / python / test_tokenizer_icu_rule_loader.py
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py

index d89e13b519d196de4422650e011dbcd103397866..51927eaacf420e0745f21ff2f76b82c29ed7f3dc 100644 (file)
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -21,6 +21,7 @@ def cfgfile(tmp_path, suffix='.yaml'):
              - ":: NFC ()"
          transliteration:
              - "::  Latin ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
          """)
          content += "compound_suffixes:\n"
          content += '\n'.join(("    - " + s for s in suffixes)) + '\n'
@@ -32,16 +33,52 @@ def cfgfile(tmp_path, suffix='.yaml'):
  
      return _create_config
  
-def test_missing_normalization(tmp_path):
+
+def test_empty_rule_file(tmp_path):
      fpath = tmp_path / ('test_config.yaml')
      fpath.write_text(dedent("""\
-        normalizatio:
-            - ":: NFD ()"
+        normalization:
+        transliteration:
+        compound_suffixes:
+        abbreviations:
          """))
  
+    rules = ICURuleLoader(fpath)
+    assert rules.get_search_rules() == ''
+    assert rules.get_normalization_rules() == ''
+    assert rules.get_transliteration_rules() == ''
+    assert rules.get_replacement_pairs() == []
+
+CONFIG_SECTIONS = ('normalization', 'transliteration',
+                   'compound_suffixes', 'abbreviations')
+
+@pytest.mark.parametrize("section", CONFIG_SECTIONS)
+def test_missing_normalization(tmp_path, section):
+    fpath = tmp_path / ('test_config.yaml')
+    with fpath.open('w') as fd:
+        for name in CONFIG_SECTIONS:
+            if name != section:
+                fd.write(name + ':\n')
+
      with pytest.raises(UsageError):
          ICURuleLoader(fpath)
  
+@pytest.mark.parametrize("abbr", ["simple",
+                                  "double => arrow => bad",
+                                  "bad = > arrow"])
+def test_bad_abbreviation_syntax(tmp_path, abbr):
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalization:
+        transliteration:
+        compound_suffixes:
+        abbreviations:
+         - {}
+        """.format(abbr)))
+
+    with pytest.raises(UsageError):  # constructing the loader must fail
+        ICURuleLoader(fpath)
+
  
  def test_get_search_rules(cfgfile):
      fpath = cfgfile(['strasse', 'straße', 'weg'],
@@ -53,6 +90,7 @@ def test_get_search_rules(cfgfile):
      rules = loader.get_search_rules()
      trans = Transliterator.createFromRules("test", rules)
  
+    assert trans.transliterate(" Baum straße ") == " baum straße "
      assert trans.transliterate(" Baumstraße ") == " baum straße "
      assert trans.transliterate(" Baumstrasse ") == " baum strasse "
      assert trans.transliterate(" Baumstr ") == " baum str "
@@ -61,15 +99,76 @@ def test_get_search_rules(cfgfile):
      assert trans.transliterate(" проспект ") == " prospekt "
  
  
-def test_get_synonym_pairs(cfgfile):
-    fpath = cfgfile(['Weg', 'Strasse'],
-                    ['Strasse => str,st'])
+def test_get_normalization_rules(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str'])
  
      loader = ICURuleLoader(fpath)
+    rules = loader.get_normalization_rules()
+    trans = Transliterator.createFromRules("test", rules)
+
+    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
+
+
+def test_get_transliteration_rules(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str'])
  
-    repl = loader.get_replacement_pairs()
+    loader = ICURuleLoader(fpath)
+    rules = loader.get_transliteration_rules()
+    trans = Transliterator.createFromRules("test", rules)
+
+    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
+
+
+def test_get_replacement_pairs_multi_to(cfgfile):
+    fpath = cfgfile(['Pfad', 'Strasse'],
+                    ['Strasse => str,st'])
+
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' strasse ', [' st ', ' str ', ' strasse ']),
+              ('strasse ', [' st ', ' str ', ' strasse ']),
+              ('pfad ', [' pfad ']),
+              ('str ' , [' str ']),
+              ('st ' , [' st '])]
+
+
+def test_get_replacement_pairs_multi_from(cfgfile):
+    fpath = cfgfile([], ['saint,Sainte => st'])
+
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' sainte ', [' sainte ', ' st ']),
+              (' saint ', [' saint ', ' st '])]
+
+
+def test_get_replacement_pairs_cross_abbreviations(cfgfile):
+    fpath = cfgfile([], ['saint,Sainte => st',
+                         'sainte => ste'])
+
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
+
+    assert [(a, sorted(b)) for a, b in repl] == \
+             [(' sainte ', [' sainte ', ' st ', ' ste ']),
+              (' saint ', [' saint ', ' st '])]
+
+
+@pytest.mark.parametrize("abbr", ["missing to =>",
+                                  "  => missing from",
+                                  "=>"])
+def test_get_replacement_pairs_incomplete_rule(tmp_path, abbr):
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalization:
+        transliteration:
+        compound_suffixes:
+        abbreviations:
+         - {}
+        """.format(abbr)))
  
-    assert repl == [(' strasse ', {' strasse ', ' str ', ' st '}),
-                    ('strasse ', {' strasse ', ' str ', ' st '}),
-                    ('weg ', {' weg '})]
+    repl = ICURuleLoader(fpath).get_replacement_pairs()
  
+    assert repl == []  # a rule with an empty side yields no pairs