From ca149fb796b1c5e6705a25005683548b816d20f2 Mon Sep 17 00:00:00 2001 From: biswajit-k Date: Thu, 2 Mar 2023 20:25:06 +0530 Subject: [PATCH] Adds sanitizer for preventing certain tags to enter search index based on parameters fix: pylint error added docs for delete tags sanitizer fixed typos in docs and code comments fix: python typechecking error fixed rank address type Revert "fixed typos in docs and code comments" This reverts commit 6839eea755a87f557895f30524fb5c03dd983d60. added default parameters and refactored code added test for all parameters --- docs/customize/Tokenizers.md | 10 +- nominatim/tokenizer/sanitizers/delete_tags.py | 144 ++++++++ .../tokenizer/sanitizers/test_delete_tags.py | 327 ++++++++++++++++++ 3 files changed, 479 insertions(+), 2 deletions(-) create mode 100644 nominatim/tokenizer/sanitizers/delete_tags.py create mode 100644 test/python/tokenizer/sanitizers/test_delete_tags.py diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index 58606c29..11c27e38 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -102,7 +102,7 @@ Here is an example configuration file: ``` yaml normalization: - ":: lower ()" - - "ß > 'ss'" # German szet is unimbigiously equal to double ss + - "ß > 'ss'" # German szet is unambiguously equal to double ss transliteration: - !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml - ":: Ascii ()" @@ -128,7 +128,7 @@ The configuration file contains four sections: The normalization and transliteration sections each define a set of ICU rules that are applied to the names. -The **normalisation** rules are applied after sanitation. They should remove +The **normalization** rules are applied after sanitation. They should remove any information that is not relevant for search at all. Usual rules to be applied here are: lower-casing, removing of special characters, cleanup of spaces. 
@@ -221,7 +221,13 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +#### delete-tags +::: nominatim.tokenizer.sanitizers.delete_tags + selection: + members: False + rendering: + heading_level: 6 #### Token Analysis diff --git a/nominatim/tokenizer/sanitizers/delete_tags.py b/nominatim/tokenizer/sanitizers/delete_tags.py new file mode 100644 index 00000000..fd35de48 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/delete_tags.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Sanitizer which prevents certain tags from getting into the search index. +It removes tags which match all of the properties given below. + + +Arguments: + type: Define which type of tags should be considered for removal. + There are two types of tags: 'name' and 'address' tags. + Takes a string 'name' or 'address'. (default: 'name') + + filter-kind: Define which 'kind' of tags should be removed. + Takes a string or list of strings where each + string is a regular expression. A tag is considered + to be a candidate for removal if its 'kind' property + fully matches any of the given regular expressions. + Note that by default all kinds of tags are considered. + + suffix: Define the 'suffix' property of the tags which should be + removed. Takes a string or list of strings where each + string is a regular expression. A tag is considered to be a + candidate for removal if its 'suffix' property fully + matches any of the given regular expressions. Note that by + default tags with any suffix value are considered including + those which don't have a suffix at all. + + name: Define the 'name' property corresponding to the 'kind' property + of the tag. Takes a string or list of strings where each string + is a regular expression. 
class _TagSanitizer:
    """ Sanitizer step that deletes name or address tags of a place when
        they match every configured criterion.

        A place is only processed when its country code and address rank
        pass the place-level filters. Of its tags, those whose kind, suffix
        and name all match the tag-level filters are removed.
    """

    def __init__(self, config: SanitizerConfig) -> None:
        """ Read all filter parameters from the sanitizer configuration.

            Raises a ValueError when 'rank_address' contains an entry that
            is not a rank or a rank range within 0-30.
        """
        # Every value other than 'name' selects the address tags.
        self.type = config.get('type', 'name')
        self.filter_kind = config.get_filter_kind()
        self.country_codes = config.get_string_list('country_code', [])
        self.allowed_ranks = self._set_allowed_ranks(
            config.get_string_list('rank_address', ['0-30']))

        # Remember whether the parameter was given at all: an explicitly
        # configured empty list excludes every place, while a missing
        # parameter means that the country code is not checked.
        self.has_country_code = config.get('country_code', None) is not None

        # The default pattern fully matches any string, also the empty one.
        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
        self.suffix_regexp = [re.compile(r) for r in suffixregexps]

        nameregexps = config.get_string_list('name', [r'[\s\S]*'])
        self.name_regexp = [re.compile(r) for r in nameregexps]


    def __call__(self, obj: ProcessInfo) -> None:
        """ Remove all matching tags from the names or the address of the
            processed place. The filtered list is assigned back to `obj`.
        """
        tags = obj.names if self.type == 'name' else obj.address

        # Nothing to do for places without tags and for places that are
        # excluded by their country code or their address rank.
        if (not tags or
                (self.has_country_code and
                 obj.place.country_code not in self.country_codes) or
                not self.allowed_ranks[obj.place.rank_address]):
            return

        # A tag is only dropped when kind, suffix and name all match.
        filtered_tags: List[PlaceName] = [
            tag for tag in tags
            if not (self.filter_kind(tag.kind) and
                    self._matches(tag.suffix, self.suffix_regexp) and
                    self._matches(tag.name, self.name_regexp))]

        if self.type == 'name':
            obj.names = filtered_tags
        else:
            obj.address = filtered_tags


    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Returns a tuple of 31 boolean values corresponding to the
            address ranks 0-30. Value at index 'i' is True if rank 'i'
            is listed in `ranks` or lies within one of its ranges,
            otherwise the value is False.

            Each entry must either be a single rank ('30') or an inclusive
            range of the form '<start>-<end>' ('26-30') with
            0 <= start <= end <= 30. Any other entry raises a ValueError
            that names the offending entry.
        """
        allowed_ranks = [False] * 31

        for rank in ranks:
            try:
                bounds = [int(x) for x in rank.split('-')]
            except ValueError:
                # Non-numeric parts are reported with the message below.
                bounds = []

            # bounds[-1] equals bounds[0] for a single rank.
            if len(bounds) not in (1, 2) or not 0 <= bounds[0] <= bounds[-1] <= 30:
                raise ValueError(f"Invalid value '{rank}' for 'rank_address': expected "
                                 "a rank or a range '<start>-<end>' within 0-30.")

            for i in range(bounds[0], bounds[-1] + 1):
                allowed_ranks[i] = True

        return tuple(allowed_ranks)


    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
        """ Returns True if the given value fully matches any of the regular
            expression patterns in the list. Otherwise, returns False.

            Note that if the value is None, it is taken as an empty string.
        """
        target = '' if value is None else value
        return any(r.fullmatch(target) is not None for r in patterns)



def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a function to process removal of certain tags.
    """

    return _TagSanitizer(config)
(https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for the sanitizer that deletes tags. +""" +import pytest + + +from nominatim.data.place_info import PlaceInfo +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer + + +class TestWithDefault: + + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + def run_sanitizer_on(self, type, **kwargs): + + place = PlaceInfo({type: {k.replace('_', ':'): v for k, v in kwargs.items()}, + 'country_code': 'de', 'rank_address': 30}) + + sanitizer_args = {'step': 'delete-tags'} + + name, address = PlaceSanitizer([sanitizer_args], + self.config).process_names(place) + + return { + 'name': sorted([(p.name, p.kind, p.suffix or '') for p in name]), + 'address': sorted([(p.name, p.kind, p.suffix or '') for p in address]) + } + + + def test_on_name(self): + res = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz') + + assert res.get('name') == [] + + def test_on_address(self): + res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz') + + assert res.get('address') == [('bar', 'ref', ''), ('baz', 'ref', 'abc'), + ('foo', 'name', '')] + + +class TestTypeField: + + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + def run_sanitizer_on(self, type, **kwargs): + + place = PlaceInfo({'name': {k.replace('_', ':'): v for k, v in kwargs.items()}, + 'country_code': 'de', 'rank_address': 30}) + + sanitizer_args = { + 'step': 'delete-tags', + 'type': type, + } + + name, _ = PlaceSanitizer([sanitizer_args], + self.config).process_names(place) + + return sorted([(p.name, p.kind, p.suffix or '') for p in name]) + + def test_name_type(self): + res = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz') + + assert res == [] + + def test_address_type(self): + res = self.run_sanitizer_on('address', 
class TestFilterKind:
    """ Tests for the 'filter-kind' parameter of the delete-tags step.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, filt, **kwargs):
        """ Run a delete-tags step with the given kind filter on a place
            whose name tags are built from `kwargs` (with '_' in the keys
            replaced by ':'). Returns the remaining names as a sorted
            list of (name, kind, suffix) tuples.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'filter-kind': filt}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_single_exact_name(self):
        expected = [('bar', 'ref', 'abc'), ('foo', 'ref', '')]

        result = self.run_sanitizer_on(['name'], ref='foo', name='foo',
                                       name_abc='bar', ref_abc='bar')

        assert result == expected

    def test_single_pattern(self):
        expected = [('bar', 'namexx', 'fr'), ('foo', 'ref', 'fr')]

        result = self.run_sanitizer_on(['.*name'],
                                       name_fr='foo', ref_fr='foo', namexx_fr='bar',
                                       shortname_fr='bar', name='bar')

        assert result == expected

    def test_multiple_patterns(self):
        expected = [('bar', 'namexx', 'fr'), ('foo', 'oldref', 'fr')]

        result = self.run_sanitizer_on(['.*name', 'ref'],
                                       name_fr='foo', ref_fr='foo', oldref_fr='foo',
                                       namexx_fr='bar', shortname_fr='baz', name='baz')

        assert result == expected
class TestSuffix:
    """ Tests for the 'suffix' parameter of the delete-tags step.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, suffix, **kwargs):
        """ Run a delete-tags step with the given suffix filter on a place
            whose name tags are built from `kwargs` (with '_' in the keys
            replaced by ':'). Returns the remaining names as a sorted
            list of (name, kind, suffix) tuples.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'suffix': suffix}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_single_suffix(self):
        expected = [('bar', 'name', 'pqr'), ('bar', 'ref', ''), ('foo', 'name', '')]

        result = self.run_sanitizer_on('abc', name='foo', name_abc='foo',
                                       name_pqr='bar', ref='bar', ref_abc='baz')

        assert result == expected

    def test_multiple_suffix(self):
        expected = [('baz', 'name', 'pqrxx'), ('foo', 'name', '')]

        result = self.run_sanitizer_on(['abc.*', 'pqr'], name='foo', name_abcxx='foo',
                                       ref_pqr='bar', name_pqrxx='baz')

        assert result == expected
class TestAllParameters:
    """ Tests for the delete-tags step with every parameter set at once.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):
        """ Run a fully parameterised delete-tags step on a place in 'de'
            with address rank 30 whose name tags are built from `kwargs`
            (with '_' in the keys replaced by ':'). Returns the remaining
            names as a sorted list of (name, kind, suffix) tuples.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        step = {
            'step': 'delete-tags',
            'type': 'name',
            'filter-kind': ['name', 'ref'],
            'country_code': country_code,
            'rank_address': rank_addr,
            'suffix': suffix,
            'name': r'[\s\S]*',
        }
        remaining, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_string_arguments_pass(self):
        result = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        assert result == []

    def test_string_arguments_fail(self):
        expected = [('bar', 'name', 'abc'), ('baz', 'ref', 'abc'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]

        result = self.run_sanitizer_on('in', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        assert result == expected

    def test_list_arguments_pass(self):
        result = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_list_arguments_fail(self):
        expected = [('bar', 'name', 'abcxx'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', 'abc')]

        result = self.run_sanitizer_on(['de', 'in'], ['14', '20-29'], [r'abc.*', r'pqr'],
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert result == expected

    def test_mix_arguments_pass(self):
        result = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_mix_arguments_fail(self):
        expected = [('bar', 'name', 'pqr'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]

        result = self.run_sanitizer_on(['de', 'in'], ['10', '20-28', '30'], r'abc.*',
                                       name='foo', ref='foo', name_pqr='bar', ref_pqr='baz')

        assert result == expected