From: Sarah Hoffmann
Date: Wed, 6 Oct 2021 10:29:25 +0000 (+0200)
Subject: add tests for sanitizer tagging language
X-Git-Tag: v4.0.0~22^2~3
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/b18d042832bed3639314168fc64bf9928ef76fcd?hp=-c

add tests for sanitizer tagging language
---

b18d042832bed3639314168fc64bf9928ef76fcd
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index b3e9c4c7..b8551038 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -124,7 +124,7 @@ class ICURuleLoader:
             else:
                 LOG.fatal("ICU tokenizer configuration has two token "
                           "analyzers with id '%s'.", name)
-                UsageError("Syntax error in ICU tokenizer config.")
+                raise UsageError("Syntax error in ICU tokenizer config.")
 
             self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
index c98c825d..c0ddd836 100644
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -15,30 +15,30 @@ class _AnalyzerByLanguage:
         else:
             self.regexes = None
 
-        self.use_defaults = config.get('use-defaults', 'no')
-        if self.use_defaults not in ('mono', 'all'):
-            self.use_defaults = False
-
         self.replace = config.get('mode', 'replace') != 'append'
         self.whitelist = config.get('whitelist')
 
-        # Compute the languages to use when no suffix is given.
+        self.__compute_default_languages(config.get('use-defaults', 'no'))
+
+
+    def __compute_default_languages(self, use_defaults):
         self.deflangs = {}
-        for ccode, prop in country_info.iterate():
-            clangs = prop['languages']
-            if len(clangs) == 1 or self.use_defaults == 'all':
-                if self.whitelist:
-                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
-                else:
-                    self.deflangs[ccode] = clangs
+        if use_defaults in ('mono', 'all'):
+            for ccode, prop in country_info.iterate():
+                clangs = prop['languages']
+                if len(clangs) == 1 or use_defaults == 'all':
+                    if self.whitelist:
+                        self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                    else:
+                        self.deflangs[ccode] = clangs
 
 
     def _kind_matches(self, kind):
         if self.regexes is None:
             return True
 
-        return any(regex.search(kind) for regex in self.regexes)
+        return any(regex.fullmatch(kind) for regex in self.regexes)
 
 
     def _suffix_matches(self, suffix):
@@ -59,10 +59,8 @@ class _AnalyzerByLanguage:
             if name.suffix:
                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
             else:
-                if self.use_defaults:
-                    langs = self.deflangs.get(obj.place.country_code)
-                    if self.use_defaults == 'mono' and len(langs) > 1:
-                        langs = None
+                langs = self.deflangs.get(obj.place.country_code)
+
 
             if langs:
                 if self.replace:
diff --git a/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py b/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
new file mode 100644
index 00000000..e4a836fa
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
@@ -0,0 +1,259 @@
+"""
+Tests for the sanitizer that enables language-dependent analyzers.
+"""
+import pytest
+
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tools.country_info import setup_country_config
+
+class TestWithDefaults:
+
+    @staticmethod
+    def run_sanitizer_on(country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)
+
+        return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
+
+
+    def test_no_names(self):
+        assert self.run_sanitizer_on('de') == []
+
+
+    def test_simple(self):
+        res = self.run_sanitizer_on('fr', name='Foo', name_de='Zoo', ref_abc='M')
+
+        assert res == [('Foo', 'name', None, {}),
+                       ('M', 'ref', 'abc', {'analyzer': 'abc'}),
+                       ('Zoo', 'name', 'de', {'analyzer': 'de'})]
+
+
+    @pytest.mark.parametrize('suffix', ['DE', 'asbc'])
+    def test_illegal_suffix(self, suffix):
+        assert self.run_sanitizer_on('fr', **{'name_' + suffix: 'Foo'}) \
+               == [('Foo', 'name', suffix, {})]
+
+
+class TestFilterKind:
+
+    @staticmethod
+    def run_sanitizer_on(filt, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': 'de'})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'filter-kind': filt}]).process_names(place)
+
+        return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
+
+
+    def test_single_exact_name(self):
+        res = self.run_sanitizer_on(['name'], name_fr='A', ref_fr='12',
+                                    shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref', 'fr', {}),
+                       ('A', 'name', 'fr', {'analyzer': 'fr'}),
+                       ('C', 'shortname', 'fr', {}),
+                       ('D', 'name', None, {})]
+
+
+    def test_single_pattern(self):
+        res = self.run_sanitizer_on(['.*name'],
+                                    name_fr='A', ref_fr='12', namexx_fr='B',
+                                    shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref', 'fr', {}),
+                       ('A', 'name', 'fr', {'analyzer': 'fr'}),
+                       ('B', 'namexx', 'fr', {}),
+                       ('C', 'shortname', 'fr', {'analyzer': 'fr'}),
+                       ('D', 'name', None, {})]
+
+
+    def test_multiple_patterns(self):
+        res = self.run_sanitizer_on(['.*name', 'ref'],
+                                    name_fr='A', ref_fr='12', oldref_fr='X',
+                                    namexx_fr='B', shortname_fr='C', name='D')
+
+        assert res == [('12', 'ref', 'fr', {'analyzer': 'fr'}),
+                       ('A', 'name', 'fr', {'analyzer': 'fr'}),
+                       ('B', 'namexx', 'fr', {}),
+                       ('C', 'shortname', 'fr', {'analyzer': 'fr'}),
+                       ('D', 'name', None, {}),
+                       ('X', 'oldref', 'fr', {})]
+
+
+class TestDefaultCountry:
+
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        setup_country_config(def_config)
+
+    @staticmethod
+    def run_sanitizer_append(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'append'}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    @staticmethod
+    def run_sanitizer_replace(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'replace'}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_missing_country(self):
+        place = PlaceInfo({'name': {'name': 'something'}})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': 'all',
+                                   'mode': 'replace'}]).process_names(place)
+
+        assert len(name) == 1
+        assert name[0].name == 'something'
+        assert name[0].suffix is None
+        assert 'analyzer' not in name[0].attr
+
+
+    def test_mono_unknown_country(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('mono', 'xx', name='XX') == expect
+        assert self.run_sanitizer_append('mono', 'xx', name='XX') == expect
+
+
+    def test_mono_monoling_replace(self):
+        res = self.run_sanitizer_replace('mono', 'de', name='Foo')
+
+        assert res == [('Foo', 'de')]
+
+
+    def test_mono_monoling_append(self):
+        res = self.run_sanitizer_append('mono', 'de', name='Foo')
+
+        assert res == [('Foo', ''), ('Foo', 'de')]
+
+
+    def test_mono_multiling(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('mono', 'ch', name='XX') == expect
+        assert self.run_sanitizer_append('mono', 'ch', name='XX') == expect
+
+
+    def test_all_unknown_country(self):
+        expect = [('XX', '')]
+
+        assert self.run_sanitizer_replace('all', 'xx', name='XX') == expect
+        assert self.run_sanitizer_append('all', 'xx', name='XX') == expect
+
+
+    def test_all_monoling_replace(self):
+        res = self.run_sanitizer_replace('all', 'de', name='Foo')
+
+        assert res == [('Foo', 'de')]
+
+
+    def test_all_monoling_append(self):
+        res = self.run_sanitizer_append('all', 'de', name='Foo')
+
+        assert res == [('Foo', ''), ('Foo', 'de')]
+
+
+    def test_all_multiling_append(self):
+        res = self.run_sanitizer_append('all', 'ch', name='XX')
+
+        assert res == [('XX', ''),
+                       ('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
+
+
+    def test_all_multiling_replace(self):
+        res = self.run_sanitizer_replace('all', 'ch', name='XX')
+
+        assert res == [('XX', 'de'), ('XX', 'fr'), ('XX', 'it'), ('XX', 'rm')]
+
+
+class TestCountryWithWhitelist:
+
+    @staticmethod
+    def run_sanitizer_on(mode, country, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
+                           'country_code': country})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'use-defaults': mode,
+                                   'mode': 'replace',
+                                   'whitelist': ['de', 'fr', 'ru']}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_mono_monoling(self):
+        assert self.run_sanitizer_on('mono', 'de', name='Foo') == [('Foo', 'de')]
+        assert self.run_sanitizer_on('mono', 'pt', name='Foo') == [('Foo', '')]
+
+
+    def test_mono_multiling(self):
+        assert self.run_sanitizer_on('mono', 'ca', name='Foo') == [('Foo', '')]
+
+
+    def test_all_monoling(self):
+        assert self.run_sanitizer_on('all', 'de', name='Foo') == [('Foo', 'de')]
+        assert self.run_sanitizer_on('all', 'pt', name='Foo') == [('Foo', '')]
+
+
+    def test_all_multiling(self):
+        assert self.run_sanitizer_on('all', 'ca', name='Foo') == [('Foo', 'fr')]
+        assert self.run_sanitizer_on('all', 'ch', name='Foo') \
+               == [('Foo', 'de'), ('Foo', 'fr')]
+
+
+class TestWhiteList:
+
+    @staticmethod
+    def run_sanitizer_on(whitelist, **kwargs):
+        place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
+                                   'mode': 'replace',
+                                   'whitelist': whitelist}]).process_names(place)
+
+        assert all(isinstance(p.attr, dict) for p in name)
+        assert all(len(p.attr) <= 1 for p in name)
+        assert all(not p.attr or ('analyzer' in p.attr and p.attr['analyzer'])
+                   for p in name)
+
+        return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
+
+
+    def test_in_whitelist(self):
+        assert self.run_sanitizer_on(['de', 'xx'], ref_xx='123') == [('123', 'xx')]
+
+
+    def test_not_in_whitelist(self):
+        assert self.run_sanitizer_on(['de', 'xx'], ref_yy='123') == [('123', '')]
+
+
+    def test_empty_whitelist(self):
+        assert self.run_sanitizer_on([], ref_yy='123') == [('123', '')]
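
Usage note (not part of the commit): the minimal sketch below mirrors the test helpers above, driving the sanitizer step directly through PlaceSanitizer and PlaceInfo with no extra options, so names carrying a recognised language suffix gain an 'analyzer' attribute while plain names pass through unchanged. It assumes a Nominatim checkout where these modules are importable.

    # Illustrative sketch only; mirrors TestWithDefaults.run_sanitizer_on above.
    from nominatim.indexer.place_info import PlaceInfo
    from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

    place = PlaceInfo({'name': {'name': 'Foo', 'name:de': 'Zoo'},
                       'country_code': 'fr'})
    names, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)

    for name in names:
        # expected, along the lines of test_simple above:
        #   Foo  suffix=None  analyzer=None
        #   Zoo  suffix=de    analyzer=de
        print(name.name, name.suffix, name.attr.get('analyzer'))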