expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
-import re
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.data.place_name import PlaceName
class _HousenumberSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
- self.filter_kind = config.get_filter_kind('housenumber')
+ self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()
- nameregexps = config.get_string_list('convert-to-name', [])
- self.is_name_regexp = [re.compile(r) for r in nameregexps]
-
+ self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
def __call__(self, obj: ProcessInfo) -> None:
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
- if self._treat_as_name(item.name):
+ if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
yield hnr
- def _treat_as_name(self, housenumber: str) -> bool:
- return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
-
-
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
"""
Configuration for Sanitizers.
"""
-from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
+from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
from collections import UserDict
import re
Arguments:
param: Name of the configuration parameter.
- default: Value to return, when the parameter is missing.
+ default: Takes a tuple or list of strings which will
+ be returned if the parameter is missing in the
+ sanitizer configuration.
+ Note that if this default parameter is not
+ provided then an empty list is returned.
Returns:
If the parameter value is a simple string, it is returned as a
values = self.data.get(param, None)
if values is None:
- return None if default is None else list(default)
+ return list(default)
if isinstance(values, str):
return [values] if values else []
value = self.data.get(param, default)
if not isinstance(value, bool):
- raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.")
+ raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
return value
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
- def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
- """ Return a filter function for the name kind from the 'filter-kind'
- config parameter.
+ def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
+ ) -> Callable[[str], bool]:
+ """ Returns a filter function for the given parameter of the sanitizer
+ configuration.
- If the 'filter-kind' parameter is empty, the filter lets all items
- pass. If the parameter is a string, it is interpreted as a single
- regular expression that must match the full kind string.
- If the parameter is a list then
- any of the regular expressions in the list must match to pass.
+ The value provided for the parameter in the sanitizer configuration
+ should be a string or a list of strings, where each string is a
+ regular expression. These regular expressions will later be used by
+ the filter function to filter strings.
Arguments:
- default: Filters to be used, when the 'filter-kind' parameter
- is not specified. If omitted then the default is to
- let all names pass.
+ param: The parameter for which the filter function
+ will be created.
+ default: Defines the behaviour of the filter function if the
+ parameter is missing in the sanitizer configuration.
+ Takes a string (PASS_ALL or FAIL_ALL) or a list of strings.
+ Any other string value or an empty list is not allowed
+ and will raise a ValueError. If the value is PASS_ALL, the
+ filter function will let all strings pass; if the value is
+ FAIL_ALL, the filter function will let no strings pass.
+ If the value provided is a list of strings, each string
+ is treated as a regular expression. In this case these regular
+ expressions will be used by the filter function.
+ By default the filter function lets all strings pass.
Returns:
- A filter function which takes a name string and returns
- True when the item passes the filter.
+ A filter function that takes a target string as the argument and
+ returns True if it fully matches any of the regular expressions,
+ otherwise returns False.
"""
- filters = self.get_string_list('filter-kind', default)
+ # A missing or empty configuration value yields an empty list here,
+ # in which case the caller-supplied default decides the behaviour.
+ filters = self.get_string_list(param) or default
- if not filters:
+ if filters == 'PASS_ALL':
return lambda _: True
+ if filters == 'FAIL_ALL':
+ return lambda _: False
- regexes = [re.compile(regex) for regex in filters]
+ if filters and isinstance(filters, (list, tuple)):
+ regexes = [re.compile(regex) for regex in filters]
+ return lambda target: any(regex.fullmatch(target) for regex in regexes)
- return lambda name: any(regex.fullmatch(name) for regex in regexes)
+ # Adjacent literals are used instead of a backslash continuation inside
+ # the string so that source indentation does not leak into the message.
+ raise ValueError("Default parameter must be a non-empty list or a string value "
+ "('PASS_ALL' or 'FAIL_ALL').")
\r
\r
"""\r
-from typing import Callable, List, Optional, Pattern, Tuple, Sequence\r
-import re\r
+from typing import Callable, List, Tuple, Sequence\r
\r
from nominatim.tokenizer.sanitizers.base import ProcessInfo\r
from nominatim.data.place_name import PlaceName\r
\r
def __init__(self, config: SanitizerConfig) -> None:\r
self.type = config.get('type', 'name')\r
- self.filter_kind = config.get_filter_kind()\r
+ self.filter_kind = config.get_filter('filter-kind')\r
self.country_codes = config.get_string_list('country_code', [])\r
- self.allowed_ranks = self._set_allowed_ranks( \\r
- config.get_string_list('rank_address', ['0-30']))\r
+ self.filter_suffix = config.get_filter('suffix')\r
+ self.filter_name = config.get_filter('name')\r
+ self.allowed_ranks = self._set_allowed_ranks(\r
+ config.get_string_list("rank_address", ["0-30"])\r
+ )\r
\r
self.has_country_code = config.get('country_code', None) is not None\r
\r
- suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])\r
- self.suffix_regexp = [re.compile(r) for r in suffixregexps]\r
-\r
- nameregexps = config.get_string_list('name', [r'[\s\S]*'])\r
- self.name_regexp = [re.compile(r) for r in nameregexps]\r
-\r
-\r
\r
def __call__(self, obj: ProcessInfo) -> None:\r
tags = obj.names if self.type == 'name' else obj.address\r
\r
- if (not tags or\r
- self.has_country_code and\r
- obj.place.country_code not in self.country_codes or\r
- not self.allowed_ranks[obj.place.rank_address]):\r
+ if not tags \\r
+ or not self.allowed_ranks[obj.place.rank_address] \\r
+ or self.has_country_code \\r
+ and obj.place.country_code not in self.country_codes:\r
return\r
\r
filtered_tags: List[PlaceName] = []\r
\r
for tag in tags:\r
\r
- if (not self.filter_kind(tag.kind) or\r
- not self._matches(tag.suffix, self.suffix_regexp) or\r
- not self._matches(tag.name, self.name_regexp)):\r
+ if not self.filter_kind(tag.kind) \\r
+ or not self.filter_suffix(tag.suffix or '') \\r
+ or not self.filter_name(tag.name):\r
filtered_tags.append(tag)\r
\r
\r
for rank in ranks:\r
intvl = [int(x) for x in rank.split('-')]\r
\r
- start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])\r
+ start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]\r
\r
for i in range(start, end + 1):\r
allowed_ranks[i] = True\r
return tuple(allowed_ranks)\r
\r
\r
- def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:\r
- """ Returns True if the given value fully matches any of the regular\r
- expression pattern in the list. Otherwise, returns False.\r
-\r
- Note that if the value is None, it is taken as an empty string.\r
- """\r
- target = '' if value is None else value\r
- return any(r.fullmatch(target) is not None for r in patterns)\r
-\r
-\r
-\r
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:\r
""" Create a function to process removal of certain tags.\r
"""\r
"""
def __init__(self, config: SanitizerConfig) -> None:
- self.filter_kind = config.get_filter_kind()
+ self.filter_kind = config.get_filter('filter-kind')
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
\r
def test_list_arguments_pass(self):\r
res = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],\r
- name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
+ name='foo', ref='foo', name_abcxx='bar', ref_pqr='baz')\r
\r
assert res == []\r
\r
\r
def test_mix_arguments_pass(self):\r
res = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',\r
- name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
+ name_abc='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
\r
assert res == []\r
\r
import pytest
from nominatim.errors import UsageError
-from nominatim.tokenizer.place_sanitizer import PlaceName
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
def test_string_list_default_empty():
assert SanitizerConfig().get_string_list('op') == []
-def test_string_list_default_none():
- assert SanitizerConfig().get_string_list('op', default=None) is None
-
-
def test_string_list_default_something():
assert SanitizerConfig().get_string_list('op', default=['a', 'b']) == ['a', 'b']
regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
-@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
-def test_create_kind_filter_no_params(inp):
- filt = SanitizerConfig().get_filter_kind()
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_no_default(inp):
+ filt = SanitizerConfig({'filter-kind': 'place'}).get_filter('name')
+
+ assert filt(inp)
+
+
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_default_pass_all(inp):
+ filt = SanitizerConfig().get_filter('name', 'PASS_ALL')
assert filt(inp)
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_default_fail_all(inp):
+ filt = SanitizerConfig().get_filter('name', 'FAIL_ALL')
+
+ assert not filt(inp)
+
+
+def test_create_name_filter_no_param_default_invalid_string():
+ # A default string other than PASS_ALL/FAIL_ALL must be rejected when
+ # the filter is created; the call raises, so no result is bound.
+ with pytest.raises(ValueError):
+ SanitizerConfig().get_filter('name', 'abc')
+
+
+def test_create_name_filter_no_param_default_empty_list():
+ # An empty list is not a valid default; the call raises, so no result
+ # is bound.
+ with pytest.raises(ValueError):
+ SanitizerConfig().get_filter('name', [])
+
+
@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_default_positive(kind):
+ filt = SanitizerConfig().get_filter('filter-kind', ['.*de'])
+
+ assert filt(kind)
+
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_default_negative(kind):
+ # With no 'filter-kind' configured, the default regex list is used and
+ # none of the parametrized kinds fully match '.*fr'.
+ filt = SanitizerConfig().get_filter('filter-kind', ['.*fr'])
+
+ assert not filt(kind)
+
+
+@pytest.mark.parametrize('kind', ('lang', 'lang:de', 'langxx'))
def test_create_kind_filter_custom_regex_positive(kind):
- filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+ filt = SanitizerConfig({'filter-kind': 'lang.*'}
+ ).get_filter('filter-kind', ['.*fr'])
assert filt(kind)
@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
def test_create_kind_filter_custom_regex_negative(kind):
- filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+ filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter('filter-kind')
assert not filt(kind)
@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
def test_create_kind_filter_many_positive(kind):
- filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+ filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
+ ).get_filter('filter-kind')
assert filt(kind)
@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
def test_create_kind_filter_many_negative(kind):
- filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+ filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
+ ).get_filter('filter-kind')
assert not filt(kind)