From 3741afa6dc5eced78483b2c2793a8eead0a2396e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 20 Jan 2022 15:42:42 +0100 Subject: [PATCH] generalize filter-kind parameter for sanitizers Now behaves the same for tag_analyzer_by_language and clean_housenumbers. Adds tests. --- .../sanitizers/clean_housenumbers.py | 15 +++++--- nominatim/tokenizer/sanitizers/helpers.py | 23 ++++++++++++ .../sanitizers/tag_analyzer_by_language.py | 16 ++------- .../tokenizer/sanitizers/test_helpers.py | 36 +++++++++++++++++++ 4 files changed, 72 insertions(+), 18 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py index b65880c3..9777a7fc 100644 --- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py +++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py @@ -5,19 +5,24 @@ # Copyright (C) 2022 by the Nominatim developer community. # For a full list of authors see the git log. """ -Sanitizer that cleans and normalizes housenumbers. +Sanitizer that cleans and normalizes house numbers. Arguments: delimiters: Define the set of characters to be used for - splitting a list of housenumbers into parts. (default: ',;') + splitting a list of house numbers into parts. (default: ',;') + filter-kind: Define the address tags that are considered to be a + house number. Either takes a single string or a list of strings, + where each string is a regular expression. An address item + is considered a house number if the 'kind' fully matches any + of the given regular expressions. 
(default: 'housenumber') """ -from nominatim.tokenizer.sanitizers.helpers import create_split_regex +from nominatim.tokenizer.sanitizers.helpers import create_split_regex, create_kind_filter class _HousenumberSanitizer: def __init__(self, config): - self.kinds = config.get('filter-kind', ('housenumber', )) + self.filter_kind = create_kind_filter(config, 'housenumber') self.split_regexp = create_split_regex(config) @@ -27,7 +32,7 @@ class _HousenumberSanitizer: new_address = [] for item in obj.address: - if item.kind in self.kinds: + if self.filter_kind(item): new_address.extend(item.clone(kind='housenumber', name=n) for n in self.sanitize(item.name)) else: # Don't touch other address items. diff --git a/nominatim/tokenizer/sanitizers/helpers.py b/nominatim/tokenizer/sanitizers/helpers.py index 78b9a831..b92914e1 100644 --- a/nominatim/tokenizer/sanitizers/helpers.py +++ b/nominatim/tokenizer/sanitizers/helpers.py @@ -27,3 +27,26 @@ def create_split_regex(config, default=',;'): raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.") return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set))) + + +def create_kind_filter(config, default=None): + """ Create a filter function for the name kind from the 'filter-kind' + config parameter. The filter function takes a name item and returns + True when the item passes the filter. + + If the parameter is empty, the filter lets all items pass. If the + parameter is a string, it is interpreted as a single regular expression + that must match the full kind string. If the parameter is a list then + any of the regular expressions in the list must match to pass. 
+ """ + filters = config.get('filter-kind', default) + + if not filters: + return lambda _: True + + if isinstance(filters, str): + regex = re.compile(filters) + return lambda name: regex.fullmatch(name.kind) + + regexes = [re.compile(regex) for regex in filters] + return lambda name: any(regex.fullmatch(name.kind) for regex in regexes) diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py index 1305029a..964a9016 100644 --- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py +++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py @@ -33,17 +33,14 @@ Arguments: import re from nominatim.tools import country_info +from nominatim.tokenizer.sanitizers.helpers import create_kind_filter class _AnalyzerByLanguage: """ Processor for tagging the language of names in a place. """ def __init__(self, config): - if 'filter-kind' in config: - self.regexes = [re.compile(regex) for regex in config['filter-kind']] - else: - self.regexes = None - + self.filter_kind = create_kind_filter(config) self.replace = config.get('mode', 'replace') != 'append' self.whitelist = config.get('whitelist') @@ -63,13 +60,6 @@ class _AnalyzerByLanguage: self.deflangs[ccode] = clangs - def _kind_matches(self, kind): - if self.regexes is None: - return True - - return any(regex.fullmatch(kind) for regex in self.regexes) - - def _suffix_matches(self, suffix): if self.whitelist is None: return len(suffix) in (2, 3) and suffix.islower() @@ -84,7 +74,7 @@ class _AnalyzerByLanguage: more_names = [] for name in (n for n in obj.names - if not n.has_attr('analyzer') and self._kind_matches(n.kind)): + if not n.has_attr('analyzer') and self.filter_kind(n)): if name.suffix: langs = [name.suffix] if self._suffix_matches(name.suffix) else None else: diff --git a/test/python/tokenizer/sanitizers/test_helpers.py b/test/python/tokenizer/sanitizers/test_helpers.py index a0a1d29c..911fbdd7 100644 --- 
a/test/python/tokenizer/sanitizers/test_helpers.py +++ b/test/python/tokenizer/sanitizers/test_helpers.py @@ -10,6 +10,7 @@ Tests for sanitizer helper functions. import pytest from nominatim.errors import UsageError +from nominatim.tokenizer.place_sanitizer import PlaceName import nominatim.tokenizer.sanitizers.helpers as helpers @pytest.mark.parametrize('inp', ('fg34', 'f\\f', 'morning [glory]', '56.78')) @@ -41,3 +42,38 @@ def test_create_split_regex_custom(delimiter): def test_create_split_regex_empty_delimiter(): with pytest.raises(UsageError): regex = helpers.create_split_regex({'delimiters': ''}) + + +@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*')) +def test_create_kind_filter_no_params(inp): + filt = helpers.create_kind_filter({}) + + assert filt(PlaceName('something', inp, '')) + + +@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende')) +def test_create_kind_filter_custom_regex_positive(kind): + filt = helpers.create_kind_filter({'filter-kind': '.*de'}) + + assert filt(PlaceName('something', kind, '')) + + +@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece')) +def test_create_kind_filter_custom_regex_negative(kind): + filt = helpers.create_kind_filter({'filter-kind': '.*de'}) + + assert not filt(PlaceName('something', kind, '')) + + +@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34')) +def test_create_kind_filter_many_positive(kind): + filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']}) + + assert filt(PlaceName('something', kind, '')) + + +@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\')) +def test_create_kind_filter_many_negative(kind): + filt = helpers.create_kind_filter({'filter-kind': ['.*fr', 'name', r'\d+']}) + + assert not filt(PlaceName('something', kind, '')) -- 2.39.5