test/python/test_tokenizer_icu_rule_loader.py

   1 """
   2 Tests for converting a config file to ICU rules.
   3 """
   4 import pytest
   5 from textwrap import dedent
   6
   7 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
   8 from nominatim.errors import UsageError
   9
  10 from icu import Transliterator
  11
  12 @pytest.fixture
  13 def cfgfile(tmp_path, suffix='.yaml'):
  14     def _create_config(suffixes, abbr):
  15         content = dedent("""\
  16         normalization:
  17             - ":: NFD ()"
  18             - "[[:Nonspacing Mark:] [:Cf:]] >"
  19             - ":: lower ()"
  20             - "[[:Punctuation:][:Space:]]+ > ' '"
  21             - ":: NFC ()"
  22         transliteration:
  23             - "::  Latin ()"
  24             - "[[:Punctuation:][:Space:]]+ > ' '"
  25         """)
  26         content += "compound_suffixes:\n"
  27         content += '\n'.join(("    - " + s for s in suffixes)) + '\n'
  28         content += "abbreviations:\n"
  29         content += '\n'.join(("    - " + s for s in abbr)) + '\n'
  30         fpath = tmp_path / ('test_config' + suffix)
  31         fpath.write_text(dedent(content))
  32         return fpath
  33
  34     return _create_config
  35
  36
  37 def test_empty_rule_file(tmp_path):
  38     fpath = tmp_path / ('test_config.yaml')
  39     fpath.write_text(dedent("""\
  40         normalization:
  41         transliteration:
  42         compound_suffixes:
  43         abbreviations:
  44         """))
  45
  46     rules = ICURuleLoader(fpath)
  47     assert rules.get_search_rules() == ''
  48     assert rules.get_normalization_rules() == ''
  49     assert rules.get_transliteration_rules() == ''
  50     assert rules.get_replacement_pairs() == []
  51
  52 CONFIG_SECTIONS = ('normalization', 'transliteration',
  53                    'compound_suffixes', 'abbreviations')
  54
  55 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
  56 def test_missing_normalization(tmp_path, section):
  57     fpath = tmp_path / ('test_config.yaml')
  58     with fpath.open('w') as fd:
  59         for name in CONFIG_SECTIONS:
  60             if name != section:
  61                 fd.write(name + ':\n')
  62
  63     with pytest.raises(UsageError):
  64         ICURuleLoader(fpath)
  65
  66
  67 def test_get_search_rules(cfgfile):
  68     fpath = cfgfile(['strasse', 'straße', 'weg'],
  69                     ['strasse,straße => str',
  70                      'prospekt => pr'])
  71
  72     loader = ICURuleLoader(fpath)
  73
  74     rules = loader.get_search_rules()
  75     trans = Transliterator.createFromRules("test", rules)
  76
  77     assert trans.transliterate(" Baum straße ") == " baum straße "
  78     assert trans.transliterate(" Baumstraße ") == " baum straße "
  79     assert trans.transliterate(" Baumstrasse ") == " baum strasse "
  80     assert trans.transliterate(" Baumstr ") == " baum str "
  81     assert trans.transliterate(" Baumwegstr ") == " baumweg str "
  82     assert trans.transliterate(" Αθήνα ") == " athēna "
  83     assert trans.transliterate(" проспект ") == " prospekt "
  84
  85
  86 def test_get_normalization_rules(cfgfile):
  87     fpath = cfgfile(['strasse', 'straße', 'weg'],
  88                     ['strasse,straße => str'])
  89
  90     loader = ICURuleLoader(fpath)
  91     rules = loader.get_normalization_rules()
  92     trans = Transliterator.createFromRules("test", rules)
  93
  94     assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
  95
  96
  97 def test_get_transliteration_rules(cfgfile):
  98     fpath = cfgfile(['strasse', 'straße', 'weg'],
  99                     ['strasse,straße => str'])
 100
 101     loader = ICURuleLoader(fpath)
 102     rules = loader.get_transliteration_rules()
 103     trans = Transliterator.createFromRules("test", rules)
 104
 105     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 106
 107
 108 def test_get_synonym_pairs(cfgfile):
 109     fpath = cfgfile(['Weg', 'Strasse'],
 110                     ['Strasse => str,st'])
 111
 112     loader = ICURuleLoader(fpath)
 113
 114     repl = loader.get_replacement_pairs()
 115
 116     assert sorted(((a, sorted(b)) for a, b in repl)) == \
 117              sorted([(' strasse ', [' st ', ' str ', ' strasse ']),
 118                      ('strasse ', [' st ', ' str ', ' strasse ']),
 119                      ('st ' , [' st ']),
 120                      ('str ' , [' str ']),
 121                      ('weg ', [' weg '])])
 122