From 610f2cc254cf442c895351907be6405f03026903 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 7 Feb 2022 10:48:00 +0100 Subject: [PATCH] sanitizer: move helpers into a configuration class --- nominatim/tokenizer/icu_rule_loader.py | 2 +- nominatim/tokenizer/place_sanitizer.py | 3 +- .../sanitizers/clean_housenumbers.py | 15 ++-- nominatim/tokenizer/sanitizers/config.py | 82 +++++++++++++++++++ nominatim/tokenizer/sanitizers/helpers.py | 52 ------------ .../tokenizer/sanitizers/split_name_list.py | 6 +- .../sanitizers/tag_analyzer_by_language.py | 7 +- settings/icu_tokenizer.yaml | 2 +- .../sanitizers/test_clean_housenumbers.py | 13 +++ .../{test_helpers.py => test_config.py} | 22 ++--- 10 files changed, 124 insertions(+), 80 deletions(-) create mode 100644 nominatim/tokenizer/sanitizers/config.py delete mode 100644 nominatim/tokenizer/sanitizers/helpers.py rename test/python/tokenizer/sanitizers/{test_helpers.py => test_config.py} (75%) diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index dcf119a3..8a564355 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -45,7 +45,7 @@ class ICURuleLoader: rules = config.load_sub_configuration('icu_tokenizer.yaml', config='TOKENIZER_CONFIG') - # Make sure country information is available to analyzers and sanatizers. + # Make sure country information is available to analyzers and sanitizers. nominatim.tools.country_info.setup_country_config(config) self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py index 1d47a8b4..913b363c 100644 --- a/nominatim/tokenizer/place_sanitizer.py +++ b/nominatim/tokenizer/place_sanitizer.py @@ -11,6 +11,7 @@ is handed to the token analysis. 
import importlib from nominatim.errors import UsageError +from nominatim.tokenizer.sanitizers.config import SanitizerConfig class PlaceName: """ A searchable name for a place together with properties. @@ -117,7 +118,7 @@ class PlaceSanitizer: raise UsageError("Sanitizer rule is missing the 'step' attribute.") module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') handler_module = importlib.import_module(module_name) - self.handlers.append(handler_module.create(func)) + self.handlers.append(handler_module.create(SanitizerConfig(func))) def process_names(self, place): diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py index 48021793..c229716f 100644 --- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py +++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py @@ -19,17 +19,20 @@ Arguments: where each string is a regular expression. An address item is considered a house number if the 'kind' fully matches any of the given regular expressions. (default: 'housenumber') - + convert-to-name: Define house numbers that should be treated as a name + instead of a house number. Either takes a single string + or a list of strings, where each string is a regular + expression that must match the full house number value. 
""" -from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter +import re class _HousenumberSanitizer: def __init__(self, config): - self.filter_kind = create_kind_filter(config, 'housenumber') - self.split_regexp = create_split_regex(config) + self.filter_kind = config.get_filter_kind('housenumber') + self.split_regexp = config.get_delimiter() - nameregexps = config.get('is-a-name', []) + nameregexps = config.get_string_list('convert-to-name', []) self.is_name_regexp = [re.compile(r) for r in nameregexps] @@ -41,7 +44,7 @@ class _HousenumberSanitizer: new_address = [] for item in obj.address: if self.filter_kind(item): - if self.treat_as_name(item.name): + if self._treat_as_name(item.name): obj.names.append(item.clone(kind='housenumber')) else: new_address.extend(item.clone(kind='housenumber', name=n) diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py new file mode 100644 index 00000000..3ff681b9 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/config.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Configuration for Sanitizers. +""" +from collections import UserDict +import re + +from nominatim.errors import UsageError + +class SanitizerConfig(UserDict): + """ Dictionary with configuration options for a sanitizer. + + In addition to the usual dictionary functions, the class provides + accessors to standard sanitizer options that are used by many of the + sanitizers. + """ + + def get_string_list(self, param, default=tuple()): + """ Extract a configuration parameter as a string list. + If the parameter value is a simple string, it is returned as a + one-item list. If the parameter value does not exist, the given + default is returned. 
If the parameter value is a list, it is checked + to contain only strings before being returned. + """ + values = self.data.get(param, None) + + if values is None: + return None if default is None else list(default) + + if isinstance(values, str): + return [values] + + if not isinstance(values, (list, tuple)): + raise UsageError(f"Parameter '{param}' must be string or list of strings.") + + if any(not isinstance(value, str) for value in values): + raise UsageError(f"Parameter '{param}' must be string or list of strings.") + + return values + + + def get_delimiter(self, default=',;'): + """ Return the 'delimiters' parameter in the configuration as a + compiled regular expression that can be used to split the names on the + delimiters. The regular expression makes sure that the resulting names + are stripped and that repeated delimiters + are ignored but it will still create empty fields on occasion. The + code needs to filter those. + + The 'default' parameter defines the delimiter set to be used when + not explicitly configured. + """ + delimiter_set = set(self.data.get('delimiters', default)) + if not delimiter_set: + raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.") + + return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set))) + + + def get_filter_kind(self, *default): + """ Return a filter function for the name kind from the 'filter-kind' + config parameter. The filter function takes a name item and returns + True when the item passes the filter. + + If the parameter is empty, the filter lets all items pass. If the + parameter is a string, it is interpreted as a single regular expression + that must match the full kind string. If the parameter is a list then + any of the regular expressions in the list must match to pass. 
+ """ + filters = self.get_string_list('filter-kind', default) + + if not filters: + return lambda _: True + + regexes = [re.compile(regex) for regex in filters] + + return lambda name: any(regex.fullmatch(name.kind) for regex in regexes) diff --git a/nominatim/tokenizer/sanitizers/helpers.py b/nominatim/tokenizer/sanitizers/helpers.py deleted file mode 100644 index b92914e1..00000000 --- a/nominatim/tokenizer/sanitizers/helpers.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# This file is part of Nominatim. (https://nominatim.org) -# -# Copyright (C) 2022 by the Nominatim developer community. -# For a full list of authors see the git log. -""" -Helper functions for sanitizers. -""" -import re - -from nominatim.errors import UsageError - -def create_split_regex(config, default=',;'): - """ Converts the 'delimiter' parameter in the configuration into a - compiled regular expression that can be used to split the names on the - delimiters. The regular expression makes sure that the resulting names - are stripped and that repeated delimiters - are ignored but it will still create empty fields on occasion. The - code needs to filter those. - - The 'default' parameter defines the delimiter set to be used when - not explicitly configured. - """ - delimiter_set = set(config.get('delimiters', default)) - if not delimiter_set: - raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.") - - return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set))) - - -def create_kind_filter(config, default=None): - """ Create a filter function for the name kind from the 'filter-kind' - config parameter. The filter functions takes a name item and returns - True when the item passes the filter. - - If the parameter is empty, the filter lets all items pass. If the - paramter is a string, it is interpreted as a single regular expression - that must match the full kind string. 
If the parameter is a list then - any of the regular expressions in the list must match to pass. - """ - filters = config.get('filter-kind', default) - - if not filters: - return lambda _: True - - if isinstance(filters, str): - regex = re.compile(filters) - return lambda name: regex.fullmatch(name.kind) - - regexes = [re.compile(regex) for regex in filters] - return lambda name: any(regex.fullmatch(name.kind) for regex in regexes) diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py index 13921c3e..c9db0a9d 100644 --- a/nominatim/tokenizer/sanitizers/split_name_list.py +++ b/nominatim/tokenizer/sanitizers/split_name_list.py @@ -11,13 +11,11 @@ Arguments: delimiters: Define the set of characters to be used for splitting the list. (default: ',;') """ -from nominatim.tokenizer.sanitizers.helpers import create_split_regex - -def create(func): +def create(config): """ Create a name processing function that splits name values with multiple values into their components. """ - regexp = create_split_regex(func) + regexp = config.get_delimiter() def _process(obj): if not obj.names: diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py index cbf32179..7898b1c6 100644 --- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py +++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py @@ -31,21 +31,20 @@ Arguments: """ from nominatim.tools import country_info -from nominatim.tokenizer.sanitizers.helpers import create_kind_filter class _AnalyzerByLanguage: """ Processor for tagging the language of names in a place. 
""" def __init__(self, config): - self.filter_kind = create_kind_filter(config) + self.filter_kind = config.get_filter_kind() self.replace = config.get('mode', 'replace') != 'append' self.whitelist = config.get('whitelist') - self.__compute_default_languages(config.get('use-defaults', 'no')) + self._compute_default_languages(config.get('use-defaults', 'no')) - def __compute_default_languages(self, use_defaults): + def _compute_default_languages(self, use_defaults): self.deflangs = {} if use_defaults in ('mono', 'all'): diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 7f53c5a7..50bb72d2 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -30,7 +30,7 @@ sanitizers: - housenumber - conscriptionnumber - streetnumber - is-a-name: + convert-to-name: - (\A|.*,)[^\d,]{3,}(,.*|\Z) - step: split-name-list - step: strip-brace-terms diff --git a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py index 5784619b..8a27de6f 100644 --- a/test/python/tokenizer/sanitizers/test_clean_housenumbers.py +++ b/test/python/tokenizer/sanitizers/test_clean_housenumbers.py @@ -42,3 +42,16 @@ def test_housenumber_lists(sanitize, number): def test_filter_kind(sanitize): assert sanitize(housenumber='34', number='4', badnumber='65') == \ [('badnumber', '65'), ('housenumber', '34'), ('housenumber', '4')] + + +@pytest.mark.parametrize('number', ('6523', 'n/a', '4')) +def test_convert_to_name_converted(number): + sanitizer_args = {'step': 'clean-housenumbers', + 'convert-to-name': (r'\d+', 'n/a')} + + place = PlaceInfo({'address': {'housenumber': number}}) + names, address = PlaceSanitizer([sanitizer_args]).process_names(place) + + assert ('housenumber', number) in set((p.kind, p.name) for p in names) + assert 'housenumber' not in set(p.kind for p in address) + diff --git a/test/python/tokenizer/sanitizers/test_helpers.py b/test/python/tokenizer/sanitizers/test_config.py 
similarity index 75% rename from test/python/tokenizer/sanitizers/test_helpers.py rename to test/python/tokenizer/sanitizers/test_config.py index 911fbdd7..612df5d9 100644 --- a/test/python/tokenizer/sanitizers/test_helpers.py +++ b/test/python/tokenizer/sanitizers/test_config.py @@ -5,17 +5,17 @@ # Copyright (C) 2022 by the Nominatim developer community. # For a full list of authors see the git log. """ -Tests for sanitizer helper functions. +Tests for sanitizer configuration helper functions. """ import pytest from nominatim.errors import UsageError from nominatim.tokenizer.place_sanitizer import PlaceName -import nominatim.tokenizer.sanitizers.helpers as helpers +from nominatim.tokenizer.sanitizers.config import SanitizerConfig @pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78')) def test_create_split_regex_no_params_unsplit(inp): - regex = helpers.create_split_regex({}) + regex = SanitizerConfig().get_delimiter() assert list(regex.split(inp)) == [inp] @@ -26,14 +26,14 @@ def test_create_split_regex_no_params_unsplit(inp): ('1, 3 ,5', ['1', '3', '5']) ]) def test_create_split_regex_no_params_split(inp, outp): - regex = helpers.create_split_regex({}) + regex = SanitizerConfig().get_delimiter() assert list(regex.split(inp)) == outp @pytest.mark.parametrize('delimiter', ['.', '\\', '[]', ' ', '/.*+']) def test_create_split_regex_custom(delimiter): - regex = helpers.create_split_regex({'delimiters': delimiter}) + regex = SanitizerConfig({'delimiters': delimiter}).get_delimiter() assert list(regex.split(f'out{delimiter}house')) == ['out', 'house'] assert list(regex.split('out,house')) == ['out,house'] @@ -41,39 +41,39 @@ def test_create_split_regex_custom(delimiter): def test_create_split_regex_empty_delimiter(): with pytest.raises(UsageError): - regex = helpers.create_split_regex({'delimiters': ''}) + regex = SanitizerConfig({'delimiters': ''}).get_delimiter() @pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*')) def 
test_create_kind_filter_no_params(inp): - filt = helpers.create_kind_filter({}) + filt = SanitizerConfig().get_filter_kind() assert filt(PlaceName('something', inp, '')) @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende')) def test_create_kind_filter_custom_regex_positive(kind): - filt = helpers.create_kind_filter({'filter-kind': '.*de'}) + filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind() assert filt(PlaceName('something', kind, '')) @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece')) def test_create_kind_filter_custom_regex_negative(kind): - filt = helpers.create_kind_filter({'filter-kind': '.*de'}) + filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind() assert not filt(PlaceName('something', kind, '')) @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34')) def test_create_kind_filter_many_positive(kind): - filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']}) + filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind() assert filt(PlaceName('something', kind, '')) @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\')) def test_create_kind_filter_many_negative(kind): - filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']}) + filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind() assert not filt(PlaceName('something', kind, '')) -- 2.39.5