From 6d41046b152bb9380766edce12379b7c3c585c0a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 25 Jul 2022 16:10:19 +0200
Subject: [PATCH] add support for external sanitizer modules

---
 nominatim/tokenizer/icu_rule_loader.py        |  3 +-
 nominatim/tokenizer/place_sanitizer.py        | 15 ++--
 test/python/config/test_config_load_module.py |  5 ++
 .../sanitizers/test_clean_housenumbers.py     | 12 +--
 .../sanitizers/test_clean_postcodes.py        |  2 +-
 .../sanitizers/test_split_name_list.py        | 78 ++++++++++---------
 .../sanitizers/test_strip_brace_terms.py      | 48 +++++++-----
 .../test_tag_analyzer_by_language.py          | 61 ++++++++++-----
 test/python/tokenizer/test_place_sanitizer.py | 12 +--
 9 files changed, 142 insertions(+), 94 deletions(-)

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 84040ddc..cf9fdb88 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -45,6 +45,7 @@ class ICURuleLoader:
     """
 
     def __init__(self, config: Configuration) -> None:
+        self.config = config
        rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
@@ -92,7 +93,7 @@ class ICURuleLoader:
     def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
-        return PlaceSanitizer(self.sanitizer_rules)
+        return PlaceSanitizer(self.sanitizer_rules, self.config)
 
 
     def make_token_analysis(self) -> ICUTokenAnalysis:
diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py
index 3f548e06..c7dfd1ba 100644
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -9,9 +9,9 @@ Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
 from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
-import importlib
 
 from nominatim.errors import UsageError
+from nominatim.config import Configuration
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
 from nominatim.data.place_info import PlaceInfo
@@ -22,16 +22,21 @@ class PlaceSanitizer:
         names and address before they are used by the token analysers.
     """
 
-    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
+                 config: Configuration) -> None:
         self.handlers: List[Callable[[ProcessInfo], None]] = []
 
         if rules:
             for func in rules:
                 if 'step' not in func:
                     raise UsageError("Sanitizer rule is missing the 'step' attribute.")
-                module_name = 'nominatim.tokenizer.sanitizers.' \
-                              + func['step'].replace('-', '_')
-                handler_module: SanitizerHandler = importlib.import_module(module_name)
-                self.handlers.append(handler_module.create(SanitizerConfig(func)))
+                if not isinstance(func['step'], str):
+                    raise UsageError("'step' attribute must be a simple string.")
+
+                module: SanitizerHandler = \
+                    config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
+
+                self.handlers.append(module.create(SanitizerConfig(func)))
 
 
     def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
diff --git a/test/python/config/test_config_load_module.py b/test/python/config/test_config_load_module.py
index cee88c82..df6c4794 100644
--- a/test/python/config/test_config_load_module.py
+++ b/test/python/config/test_config_load_module.py
@@ -31,6 +31,11 @@ def test_load_default_module(test_config):
     assert isinstance(module.NOMINATIM_VERSION, tuple)
 
 
+def test_load_default_module_with_hyphen(test_config):
+    module = test_config.load_plugin_module('place-info', 'nominatim.data')
+
+    assert isinstance(module.PlaceInfo, object)
+
 def test_load_plugin_module(test_config, tmp_path):
     (tmp_path / 'project' / 'testpath').mkdir()
diff --git a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
index 128e1201..11a71a5f 100644
--- a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
+++ b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py
@@ -13,14 +13,14 @@
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 
 @pytest.fixture
-def sanitize(request):
+def sanitize(request, def_config):
     sanitizer_args = {'step': 'clean-housenumbers'}
     for mark in request.node.iter_markers(name="sanitizer_params"):
         sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
 
     def _run(**kwargs):
         place = PlaceInfo({'address': kwargs})
-        _, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+        _, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
 
         return sorted([(p.kind, p.name) for p in address])
@@ -45,24 +45,24 @@ def test_filter_kind(sanitize):
 
 
 @pytest.mark.parametrize('number', ('6523', 'n/a', '4'))
-def test_convert_to_name_converted(number):
+def test_convert_to_name_converted(def_config, number):
     sanitizer_args = {'step': 'clean-housenumbers',
                       'convert-to-name': (r'\d+', 'n/a')}
 
     place = PlaceInfo({'address': {'housenumber': number}})
-    names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+    names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
 
     assert ('housenumber', number) in set((p.kind, p.name) for p in names)
     assert 'housenumber' not in set(p.kind for p in address)
 
 
 @pytest.mark.parametrize('number', ('a54', 'n.a', 'bow'))
-def test_convert_to_name_unconverted(number):
+def test_convert_to_name_unconverted(def_config, number):
     sanitizer_args = {'step': 'clean-housenumbers',
                       'convert-to-name': (r'\d+', 'n/a')}
 
     place = PlaceInfo({'address': {'housenumber': number}})
-    names, address = PlaceSanitizer([sanitizer_args]).process_names(place)
+    names, address = PlaceSanitizer([sanitizer_args], def_config).process_names(place)
 
     assert 'housenumber' not in set(p.kind for p in names)
     assert ('housenumber', number) in set((p.kind, p.name) for p in address)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index 237527f1..f2c965ad 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -25,7 +25,7 @@ def sanitize(def_config, request):
     if country is not None:
         pi['country_code'] = country
 
-    _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+    _, address = PlaceSanitizer([sanitizer_args], def_config).process_names(PlaceInfo(pi))
 
     return sorted([(p.kind, p.name) for p in address])
diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py
index 67157fba..9ca539d5 100644
--- a/test/python/tokenizer/sanitizers/test_split_name_list.py
+++ b/test/python/tokenizer/sanitizers/test_split_name_list.py
@@ -14,58 +14,66 @@ from nominatim.data.place_info import PlaceInfo
 from nominatim.errors import UsageError
 
 
-def run_sanitizer_on(**kwargs):
-    place = PlaceInfo({'name': kwargs})
-    name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+class TestSplitName:
 
-    return sorted([(p.name, p.kind, p.suffix) for p in name])
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
 
 
-def sanitize_with_delimiter(delimiter, name):
-    place = PlaceInfo({'name': {'name': name}})
-    san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
-    name, _ = san.process_names(place)
+    def run_sanitizer_on(self, **kwargs):
+        place = PlaceInfo({'name': kwargs})
+        name, _ = PlaceSanitizer([{'step': 'split-name-list'}], self.config).process_names(place)
 
-    return sorted([p.name for p in name])
+        return sorted([(p.name, p.kind, p.suffix) for p in name])
 
 
-def test_simple():
-    assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
-    assert run_sanitizer_on(name='') == [('', 'name', None)]
+    def sanitize_with_delimiter(self, delimiter, name):
+        place = PlaceInfo({'name': {'name': name}})
+        san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}],
+                             self.config)
+        name, _ = san.process_names(place)
+
+        return sorted([p.name for p in name])
 
 
-def test_splits():
-    assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
-                                              ('B', 'name', None),
-                                              ('C', 'name', None)]
-    assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
-                                                            ('boat', 'short_name', None)]
+    def test_simple(self):
+        assert self.run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
+        assert self.run_sanitizer_on(name='') == [('', 'name', None)]
 
 
-def test_empty_fields():
-    assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
-                                             ('B', 'name', None)]
-    assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
-                                              ('B', 'name', None)]
-    assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
-    assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
+    def test_splits(self):
+        assert self.run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
+                                                       ('B', 'name', None),
+                                                       ('C', 'name', None)]
+        assert self.run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
+                                                                     ('boat', 'short_name', None)]
 
 
-def test_custom_delimiters():
-    assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
-    assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
-    assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
-    assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
+    def test_empty_fields(self):
+        assert self.run_sanitizer_on(name='A;;B') == [('A', 'name', None),
+                                                      ('B', 'name', None)]
+        assert self.run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
+                                                       ('B', 'name', None)]
+        assert self.run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
+        assert self.run_sanitizer_on(name='B,') == [('B', 'name', None)]
 
 
-def test_empty_delimiter_set():
-    with pytest.raises(UsageError):
-        sanitize_with_delimiter('', 'abc')
+    def test_custom_delimiters(self):
+        assert self.sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
+        assert self.sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
+        assert self.sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
+        assert self.sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
 
 
-def test_no_name_list():
+
+    def test_empty_delimiter_set(self):
+        with pytest.raises(UsageError):
+            self.sanitize_with_delimiter('', 'abc')
+
+
+def test_no_name_list(def_config):
     place = PlaceInfo({'address': {'housenumber': '3'}})
-    name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+    name, address = PlaceSanitizer([{'step': 'split-name-list'}], def_config).process_names(place)
 
     assert not name
     assert len(address) == 1
diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py
index eb554364..7fa0a018 100644
--- a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py
+++ b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py
@@ -12,39 +12,45 @@ import pytest
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.data.place_info import PlaceInfo
 
 
-def run_sanitizer_on(**kwargs):
-    place = PlaceInfo({'name': kwargs})
-    name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+class TestStripBrace:
 
-    return sorted([(p.name, p.kind, p.suffix) for p in name])
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
 
+    def run_sanitizer_on(self, **kwargs):
+        place = PlaceInfo({'name': kwargs})
+        name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}], self.config).process_names(place)
 
-def test_no_braces():
-    assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
-                                                      ('foo', 'name', None)]
+        return sorted([(p.name, p.kind, p.suffix) for p in name])
 
 
-def test_simple_braces():
-    assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
-        == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
-    assert run_sanitizer_on(name='ack ( bar')\
-        == [('ack', 'name', None), ('ack ( bar', 'name', None)]
+    def test_no_braces(self):
+        assert self.run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
+                                                               ('foo', 'name', None)]
 
 
-def test_only_braces():
-    assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
+    def test_simple_braces(self):
+        assert self.run_sanitizer_on(name='Halle (Saale)', ref='3')\
+            == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+        assert self.run_sanitizer_on(name='ack ( bar')\
+            == [('ack', 'name', None), ('ack ( bar', 'name', None)]
 
 
-def test_double_braces():
-    assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
-                                               ('a((b))', 'name', None)]
-    assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
-                                                  ('a (b) (c)', 'name', None)]
+    def test_only_braces(self):
+        assert self.run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
 
 
-def test_no_names():
+    def test_double_braces(self):
+        assert self.run_sanitizer_on(name='a((b))') == [('a', 'name', None),
+                                                        ('a((b))', 'name', None)]
+        assert self.run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
+                                                           ('a (b) (c)', 'name', None)]
+
+
+def test_no_names(def_config):
     place = PlaceInfo({'address': {'housenumber': '3'}})
-    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}], def_config).process_names(place)
 
     assert not name
     assert len(address) == 1
diff --git a/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py b/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
index 306b8027..1feecf3f 100644
--- a/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
+++ b/test/python/tokenizer/sanitizers/test_tag_analyzer_by_language.py
@@ -15,11 +15,16 @@ from nominatim.data.country_info import setup_country_config
 
 class TestWithDefaults:
 
-    @staticmethod
-    def run_sanitizer_on(country, **kwargs):
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+
+    def run_sanitizer_on(self, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
-        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}]).process_names(place)
+        name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language'}],
+                                 self.config).process_names(place)
 
         return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
 
@@ -44,12 +49,17 @@
 
 class TestFilterKind:
 
-    @staticmethod
-    def run_sanitizer_on(filt, **kwargs):
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+
+    def run_sanitizer_on(self, filt, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': 'de'})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
-                                   'filter-kind': filt}]).process_names(place)
+                                   'filter-kind': filt}],
+                                  self.config).process_names(place)
 
         return sorted([(p.name, p.kind, p.suffix, p.attr) for p in name])
 
@@ -94,14 +104,16 @@ class TestDefaultCountry:
 
     @pytest.fixture(autouse=True)
     def setup_country(self, def_config):
         setup_country_config(def_config)
+        self.config = def_config
+
 
-    @staticmethod
-    def run_sanitizer_append(mode, country, **kwargs):
+    def run_sanitizer_append(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
                                    'use-defaults': mode,
-                                   'mode': 'append'}]).process_names(place)
+                                   'mode': 'append'}],
+                                  self.config).process_names(place)
 
         assert all(isinstance(p.attr, dict) for p in name)
         assert all(len(p.attr) <= 1 for p in name)
@@ -111,13 +123,13 @@ class TestDefaultCountry:
         return sorted([(p.name, p.attr.get('analyzer', '')) for p in name])
 
 
-    @staticmethod
-    def run_sanitizer_replace(mode, country, **kwargs):
+    def run_sanitizer_replace(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
                                    'use-defaults': mode,
-                                   'mode': 'replace'}]).process_names(place)
+                                   'mode': 'replace'}],
+                                  self.config).process_names(place)
 
         assert all(isinstance(p.attr, dict) for p in name)
         assert all(len(p.attr) <= 1 for p in name)
@@ -131,7 +143,8 @@ class TestDefaultCountry:
         place = PlaceInfo({'name': {'name': 'something'}})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
                                    'use-defaults': 'all',
-                                   'mode': 'replace'}]).process_names(place)
+                                   'mode': 'replace'}],
+                                  self.config).process_names(place)
 
         assert len(name) == 1
         assert name[0].name == 'something'
@@ -199,14 +212,19 @@ class TestDefaultCountry:
 
 class TestCountryWithWhitelist:
 
-    @staticmethod
-    def run_sanitizer_on(mode, country, **kwargs):
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+
+    def run_sanitizer_on(self, mode, country, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()},
                            'country_code': country})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
                                    'use-defaults': mode,
                                    'mode': 'replace',
-                                   'whitelist': ['de', 'fr', 'ru']}]).process_names(place)
+                                   'whitelist': ['de', 'fr', 'ru']}],
+                                  self.config).process_names(place)
 
         assert all(isinstance(p.attr, dict) for p in name)
         assert all(len(p.attr) <= 1 for p in name)
@@ -238,12 +256,17 @@ class TestCountryWithWhitelist:
 
 class TestWhiteList:
 
-    @staticmethod
-    def run_sanitizer_on(whitelist, **kwargs):
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+
+    def run_sanitizer_on(self, whitelist, **kwargs):
         place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}})
         name, _ = PlaceSanitizer([{'step': 'tag-analyzer-by-language',
                                    'mode': 'replace',
-                                   'whitelist': whitelist}]).process_names(place)
+                                   'whitelist': whitelist}],
+                                  self.config).process_names(place)
 
         assert all(isinstance(p.attr, dict) for p in name)
         assert all(len(p.attr) <= 1 for p in name)
diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py
index 31401bd1..3dd3033c 100644
--- a/test/python/tokenizer/test_place_sanitizer.py
+++ b/test/python/tokenizer/test_place_sanitizer.py
@@ -47,8 +47,8 @@ def test_placeinfo_has_attr():
     assert not place.has_attr('whatever')
 
 
-def test_sanitizer_default():
-    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
+def test_sanitizer_default(def_config):
+    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}], def_config)
 
     name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
                                                  'address': {'street': 'Bald'}}))
@@ -63,8 +63,8 @@
 
 
 @pytest.mark.parametrize('rules', [None, []])
-def test_sanitizer_empty_list(rules):
-    san = sanitizer.PlaceSanitizer(rules)
+def test_sanitizer_empty_list(def_config, rules):
+    san = sanitizer.PlaceSanitizer(rules, def_config)
 
     name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
 
@@ -72,6 +72,6 @@
     assert all(isinstance(n, sanitizer.PlaceName) for n in name)
 
 
-def test_sanitizer_missing_step_definition():
+def test_sanitizer_missing_step_definition(def_config):
     with pytest.raises(UsageError):
-        san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
+        san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}], def_config)
-- 
2.39.5
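
Note (not part of the patch itself): the change replaces the hard-wired
importlib lookup with Configuration.load_plugin_module(), so the 'step'
attribute of a sanitizer rule is no longer limited to modules inside the
nominatim.tokenizer.sanitizers package. Judging from the code above, an
external module only has to satisfy the SanitizerHandler protocol: expose a
create() function that takes the SanitizerConfig and returns a callable
operating on a ProcessInfo. A minimal sketch of such a module follows; the
file name my_sanitizer.py and the lower-casing behaviour are hypothetical,
purely for illustration:

    # my_sanitizer.py - hypothetical external sanitizer module, referenced
    # via the 'step' attribute of a sanitizer rule and resolved through
    # config.load_plugin_module().
    from typing import Callable

    from nominatim.tokenizer.sanitizers.base import ProcessInfo
    from nominatim.tokenizer.sanitizers.config import SanitizerConfig


    def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
        """ Create a handler that lower-cases all name values of a place.
        """
        def _process(obj: ProcessInfo) -> None:
            # ProcessInfo carries the mutable lists of names and address
            # parts; sanitizers rewrite them in place.
            for name in obj.names:
                name.name = name.name.lower()

        return _process

The tests exercise exactly this contract: PlaceSanitizer now receives the
project Configuration and delegates module resolution to it, while the new
isinstance() check rejects rules whose 'step' is not a plain string before
any import is attempted.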