--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only\r
+#\r
+# This file is part of Nominatim. (https://nominatim.org)\r
+#\r
+# Copyright (C) 2023 by the Nominatim developer community.\r
+# For a full list of authors see the git log.\r
+"""\r
+Sanitizer which prevents certain tags from getting into the search index.\r
+It removes tags which match all properties given below.\r
+\r
+\r
+Arguments:\r
+ type: Define which type of tags should be considered for removal.\r
+ There are two types of tags 'name' and 'address' tags.\r
+ Takes a string 'name' or 'address'. (default: 'name')\r
+\r
+ filter-kind: Define which 'kind' of tags should be removed.\r
+ Takes a string or list of strings where each\r
+ string is a regular expression. A tag is considered\r
+ to be a candidate for removal if its 'kind' property\r
+ fully matches any of the given regular expressions.\r
+ Note that by default all 'kind' of tags are considered.\r
+\r
+ suffix: Define the 'suffix' property of the tags which should be\r
+ removed. Takes a string or list of strings where each\r
+ string is a regular expression. A tag is considered to be a\r
+ candidate for removal if its 'suffix' property fully\r
+ matches any of the given regular expressions. Note that by\r
+ default tags with any suffix value are considered including\r
+ those which don't have a suffix at all.\r
+\r
+ name: Define the 'name' property corresponding to the 'kind' property\r
+ of the tag. Takes a string or list of strings where each string\r
+ is a regular expression. A tag is considered to be a candidate\r
+ for removal if its name fully matches any of the given regular\r
+ expressions. Note that by default tags with any 'name' are\r
+ considered.\r
+\r
+ country_code: Define the country code of places whose tags should be\r
+ considered for removal. Takes a string or list of strings\r
+ where each string is a two-letter lower-case country code.\r
+ Note that by default tags of places with any country code\r
+ are considered including those which don't have a country\r
+ code at all.\r
+\r
+ rank_address: Define the address rank of places whose tags should be\r
+ considered for removal. Takes a string or list of strings\r
+ where each string is a number or a range of numbers of\r
+ the form <from>-<to>.\r
+ Note that default is '0-30', which means that tags of all\r
+ places are considered.\r
+ See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank\r
+ to learn more about address rank.\r
+\r
+\r
+"""\r
+from typing import Callable, List, Optional, Pattern, Tuple, Sequence\r
+import re\r
+\r
+from nominatim.tokenizer.sanitizers.base import ProcessInfo\r
+from nominatim.data.place_name import PlaceName\r
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig\r
+\r
class _TagSanitizer:
    """ Callable sanitizer that drops tags matching all configured filters.

        A tag is removed only when the place passes the country-code and
        address-rank restrictions and the tag's kind, suffix and name each
        match their respective filter.
    """

    def __init__(self, config: SanitizerConfig) -> None:
        """ Read the filter settings from the sanitizer configuration.

            Raises a ValueError when 'rank_address' contains an entry that
            is not a rank or a range of ranks between 0 and 30.
        """
        self.type = config.get('type', 'name')
        # Presumably a predicate built from the 'filter-kind' option; it is
        # called with the tag's kind in __call__ (verify against the config).
        self.filter_kind = config.get_filter_kind()
        self.country_codes = config.get_string_list('country_code', [])
        self.allowed_ranks = self._set_allowed_ranks(
            config.get_string_list('rank_address', ['0-30']))

        # Distinguishes "option absent" (no restriction) from an explicitly
        # configured list, which restricts even when it is empty.
        self.has_country_code = config.get('country_code', None) is not None

        # The default pattern matches any string, including the empty one.
        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
        self.suffix_regexp = [re.compile(r) for r in suffixregexps]

        nameregexps = config.get_string_list('name', [r'[\s\S]*'])
        self.name_regexp = [re.compile(r) for r in nameregexps]

    def __call__(self, obj: ProcessInfo) -> None:
        """ Remove all matching tags from the names or the address tags
            of the given object, depending on the configured type.

            The tag list is left untouched when it is empty or when the
            place does not pass the country-code or address-rank filter.
        """
        tags = obj.names if self.type == 'name' else obj.address

        if not tags:
            return
        if self.has_country_code and obj.place.country_code not in self.country_codes:
            return
        if not self.allowed_ranks[obj.place.rank_address]:
            return

        # Keep every tag that fails at least one of the three tag filters.
        filtered_tags = [tag for tag in tags
                         if not (self.filter_kind(tag.kind)
                                 and self._matches(tag.suffix, self.suffix_regexp)
                                 and self._matches(tag.name, self.name_regexp))]

        if self.type == 'name':
            obj.names = filtered_tags
        else:
            obj.address = filtered_tags

    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Returns a tuple of 31 boolean values corresponding to the
            address ranks 0-30. Value at index 'i' is True if rank 'i'
            is present in the ranks or lies in the range of any of the
            ranks provided in the sanitizer configuration, otherwise
            the value is False.

            Raises a ValueError if an entry is neither a number nor a
            range of the form <from>-<to>, if it lies outside 0-30 or
            if the start of a range is larger than its end.
        """
        allowed_ranks = [False] * 31

        for rank in ranks:
            first, separator, last = rank.partition('-')

            try:
                start = int(first)
                end = int(last) if separator else start
            except ValueError as err:
                raise ValueError(f"Invalid rank_address '{rank}': expected "
                                 "a number or a range <from>-<to>.") from err

            # Reject bad bounds explicitly instead of failing with an
            # IndexError or silently selecting no rank at all.
            if not 0 <= start <= end <= 30:
                raise ValueError(f"Invalid rank_address '{rank}': ranks must "
                                 "lie between 0 and 30 with <from> <= <to>.")

            for i in range(start, end + 1):
                allowed_ranks[i] = True

        return tuple(allowed_ranks)

    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
        """ Returns True if the given value fully matches any of the regular
            expression pattern in the list. Otherwise, returns False.

            Note that if the value is None, it is taken as an empty string.
        """
        target = '' if value is None else value
        return any(r.fullmatch(target) is not None for r in patterns)
+\r
+\r
+\r
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Build the sanitizer callable which removes the tags selected
        by the given sanitizer configuration.
    """
    sanitizer = _TagSanitizer(config)
    return sanitizer
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only\r
+#\r
+# This file is part of Nominatim. (https://nominatim.org)\r
+#\r
+# Copyright (C) 2023 by the Nominatim developer community.\r
+# For a full list of authors see the git log.\r
+"""\r
+Tests for the sanitizer that deletes tags.\r
+"""\r
+import pytest\r
+\r
+\r
+from nominatim.data.place_info import PlaceInfo\r
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer\r
+\r
+\r
class TestWithDefault:
    """ Runs the 'delete-tags' step without any further options.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, tag_type, **kwargs):
        """ Run the sanitizer on a place whose tags are stored under the
            given tag type ('name' or 'address'). Underscores in the
            keyword names are turned into colons to form the tag keys.
            Returns the sorted (name, kind, suffix) tuples of the
            resulting names and address parts.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({tag_type: tags, 'country_code': 'de', 'rank_address': 30})

        sanitizer = PlaceSanitizer([{'step': 'delete-tags'}], self.config)
        names, address = sanitizer.process_names(place)

        def as_tuples(items):
            return sorted((p.name, p.kind, p.suffix or '') for p in items)

        return {'name': as_tuples(names), 'address': as_tuples(address)}

    def test_on_name(self):
        result = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        assert result['name'] == []

    def test_on_address(self):
        result = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        expected = [('bar', 'ref', ''), ('baz', 'ref', 'abc'), ('foo', 'name', '')]
        assert result['address'] == expected
+\r
+\r
class TestTypeField:
    """ Checks the effect of the 'type' option on the name tags of a place.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, tag_type, **kwargs):
        """ Run the sanitizer with the given 'type' option on a place
            whose name tags are built from the keyword arguments and
            return the sorted (name, kind, suffix) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'type': tag_type}
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in names)

    def test_name_type(self):
        result = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        assert result == []

    def test_address_type(self):
        result = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        expected = [('bar', 'ref', ''), ('baz', 'ref', 'abc'), ('foo', 'name', '')]
        assert result == expected
+\r
class TestFilterKind:
    """ Checks the selection of tags through the 'filter-kind' option.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, filt, **kwargs):
        """ Run the sanitizer with the given 'filter-kind' value on a place
            whose name tags are built from the keyword arguments and
            return the sorted (name, kind, suffix) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'filter-kind': filt}
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in names)

    def test_single_exact_name(self):
        result = self.run_sanitizer_on(['name'], ref='foo', name='foo',
                                       name_abc='bar', ref_abc='bar')

        assert result == [('bar', 'ref', 'abc'), ('foo', 'ref', '')]

    def test_single_pattern(self):
        result = self.run_sanitizer_on(['.*name'],
                                       name_fr='foo', ref_fr='foo', namexx_fr='bar',
                                       shortname_fr='bar', name='bar')

        assert result == [('bar', 'namexx', 'fr'), ('foo', 'ref', 'fr')]

    def test_multiple_patterns(self):
        result = self.run_sanitizer_on(['.*name', 'ref'],
                                       name_fr='foo', ref_fr='foo', oldref_fr='foo',
                                       namexx_fr='bar', shortname_fr='baz', name='baz')

        assert result == [('bar', 'namexx', 'fr'), ('foo', 'oldref', 'fr')]
+\r
+\r
class TestRankAddress:
    """ Checks the 'rank_address' option against a place of address rank 30.
    """

    # Expected names when the place's rank is not covered and nothing is removed.
    UNTOUCHED = [('bar', 'ref', ''), ('foo', 'name', '')]

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, rank_addr, **kwargs):
        """ Run the sanitizer with the given 'rank_address' value on a place
            whose name tags are built from the keyword arguments and
            return the sorted (name, kind, suffix) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'rank_address': rank_addr}
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in names)

    def test_single_rank(self):
        result = self.run_sanitizer_on('30', name='foo', ref='bar')

        assert result == []

    def test_single_rank_fail(self):
        result = self.run_sanitizer_on('28', name='foo', ref='bar')

        assert result == self.UNTOUCHED

    def test_ranged_rank_pass(self):
        result = self.run_sanitizer_on('26-30', name='foo', ref='bar')

        assert result == []

    def test_ranged_rank_fail(self):
        result = self.run_sanitizer_on('26-29', name='foo', ref='bar')

        assert result == self.UNTOUCHED

    def test_mixed_rank_pass(self):
        result = self.run_sanitizer_on(['4', '20-28', '30', '10-12'], name='foo', ref='bar')

        assert result == []

    def test_mixed_rank_fail(self):
        result = self.run_sanitizer_on(['4-8', '10', '26-29', '18'], name='foo', ref='bar')

        assert result == self.UNTOUCHED
+\r
+\r
class TestSuffix:
    """ Checks the selection of tags through the 'suffix' option.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, suffix, **kwargs):
        """ Run the sanitizer with the given 'suffix' value on a place
            whose name tags are built from the keyword arguments and
            return the sorted (name, kind, suffix) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'suffix': suffix}
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in names)

    def test_single_suffix(self):
        result = self.run_sanitizer_on('abc', name='foo', name_abc='foo',
                                       name_pqr='bar', ref='bar', ref_abc='baz')

        expected = [('bar', 'name', 'pqr'), ('bar', 'ref', ''), ('foo', 'name', '')]
        assert result == expected

    def test_multiple_suffix(self):
        result = self.run_sanitizer_on(['abc.*', 'pqr'], name='foo', name_abcxx='foo',
                                       ref_pqr='bar', name_pqrxx='baz')

        assert result == [('baz', 'name', 'pqrxx'), ('foo', 'name', '')]
+\r
+\r
+\r
class TestCountryCodes:
    """ Checks the 'country_code' option against a place in country 'de'.
    """

    # Expected names when the place's country is not covered and nothing is removed.
    UNTOUCHED = [('bar', 'ref'), ('foo', 'name')]

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, country_code, **kwargs):
        """ Run the sanitizer with the given 'country_code' value on a place
            whose name tags are built from the keyword arguments and
            return the sorted (name, kind) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'country_code': country_code}
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind) for p in names)

    def test_single_country_code_pass(self):
        result = self.run_sanitizer_on('de', name='foo', ref='bar')

        assert result == []

    def test_single_country_code_fail(self):
        result = self.run_sanitizer_on('in', name='foo', ref='bar')

        assert result == self.UNTOUCHED

    def test_empty_country_code_list(self):
        result = self.run_sanitizer_on([], name='foo', ref='bar')

        assert result == self.UNTOUCHED

    def test_multiple_country_code_pass(self):
        result = self.run_sanitizer_on(['in', 'de', 'fr'], name='foo', ref='bar')

        assert result == []

    def test_multiple_country_code_fail(self):
        result = self.run_sanitizer_on(['in', 'au', 'fr'], name='foo', ref='bar')

        assert result == self.UNTOUCHED
+\r
class TestAllParameters:
    """ Checks the sanitizer with all options set at the same time, given
        either as plain strings, as lists or as a mix of both.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):
        """ Run the sanitizer with the given 'country_code', 'rank_address'
            and 'suffix' values (the remaining options are fixed) on a
            place whose name tags are built from the keyword arguments.
            Returns the sorted (name, kind, suffix) tuples of the names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {
            'step': 'delete-tags',
            'type': 'name',
            'filter-kind': ['name', 'ref'],
            'country_code': country_code,
            'rank_address': rank_addr,
            'suffix': suffix,
            'name': r'[\s\S]*',
        }
        names, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in names)

    def test_string_arguments_pass(self):
        result = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        assert result == []

    def test_string_arguments_fail(self):
        result = self.run_sanitizer_on('in', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        expected = [('bar', 'name', 'abc'), ('baz', 'ref', 'abc'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]
        assert result == expected

    def test_list_arguments_pass(self):
        result = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_list_arguments_fail(self):
        result = self.run_sanitizer_on(['de', 'in'], ['14', '20-29'], [r'abc.*', r'pqr'],
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        expected = [('bar', 'name', 'abcxx'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', 'abc')]
        assert result == expected

    def test_mix_arguments_pass(self):
        result = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
                                       name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_mix_arguments_fail(self):
        result = self.run_sanitizer_on(['de', 'in'], ['10', '20-28', '30'], r'abc.*',
                                       name='foo', ref='foo', name_pqr='bar', ref_pqr='baz')

        expected = [('bar', 'name', 'pqr'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]
        assert result == expected
\ No newline at end of file