X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/3746befd8859cbf1a1f86459b09db57c21d26c75..28101967ef5b89b85f492304b1002b8a85e6fe21:/nominatim/tokenizer/sanitizers/config.py diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py index 52cb2c04..79396a75 100644 --- a/nominatim/tokenizer/sanitizers/config.py +++ b/nominatim/tokenizer/sanitizers/config.py @@ -7,7 +7,7 @@ """ Configuration for Sanitizers. """ -from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING +from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING from collections import UserDict import re @@ -23,8 +23,8 @@ else: class SanitizerConfig(_BaseUserDict): """ The `SanitizerConfig` class is a read-only dictionary with configuration options for the sanitizer. - In addition to the usual dictionary function, the class provides - accessors to standard sanatizer options that are used by many of the + In addition to the usual dictionary functions, the class provides + accessors to standard sanitizer options that are used by many of the sanitizers. """ @@ -33,18 +33,22 @@ class SanitizerConfig(_BaseUserDict): Arguments: param: Name of the configuration parameter. - default: Value to return, when the parameter is missing. + default: Takes a tuple or list of strings which will + be returned if the parameter is missing in the + sanitizer configuration. + Note that if this default parameter is not + provided then an empty list is returned. Returns: If the parameter value is a simple string, it is returned as a - one-item list. If the parameter value does not exist, the given - default is returned. If the parameter value is a list, it is - checked to contain only strings before being returned. + one-item list. If the parameter value does not exist, the given + default is returned. If the parameter value is a list, it is + checked to contain only strings before being returned. 
""" values = self.data.get(param, None) if values is None: - return None if default is None else list(default) + return list(default) if isinstance(values, str): return [values] if values else [] @@ -74,26 +78,26 @@ class SanitizerConfig(_BaseUserDict): value = self.data.get(param, default) if not isinstance(value, bool): - raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.") + raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').") return value def get_delimiter(self, default: str = ',;') -> Pattern[str]: """ Return the 'delimiters' parameter in the configuration as a - compiled regular expression that can be used to split names on these - delimiters. + compiled regular expression that can be used to split strings on + these delimiters. Arguments: - default: Delimiters to be used, when 'delimiters' parameter + default: Delimiters to be used when 'delimiters' parameter is not explicitly configured. Returns: - A regular expression pattern, which can be used to - split a string. The regular expression makes sure that the - resulting names are stripped and that repeated delimiters - are ignored. It may still create empty fields on occasion. The - code needs to filter those. + A regular expression pattern which can be used to + split a string. The regular expression makes sure that the + resulting names are stripped and that repeated delimiters + are ignored. It may still create empty fields on occasion. The + code needs to filter those. """ delimiter_set = set(self.data.get('delimiters', default)) if not delimiter_set: @@ -102,30 +106,46 @@ class SanitizerConfig(_BaseUserDict): return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set))) - def get_filter_kind(self, *default: str) -> Callable[[str], bool]: - """ Return a filter function for the name kind from the 'filter-kind' - config parameter. 
def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
               ) -> Callable[[str], bool]:
    """ Return a filter function for the given parameter of the sanitizer
        configuration.

        The value configured for the parameter should be a string or a
        list of strings, where each string is a regular expression. The
        returned filter function accepts a target string when it fully
        matches at least one of these regular expressions.

        Arguments:
            param: Name of the configuration parameter from which the
                   filter function is created.
            default: Behaviour of the filter function when the parameter
                     yields no expressions (it is missing or empty in the
                     sanitizer configuration). Either one of the strings
                     'PASS_ALL' (the filter lets all strings pass) or
                     'FAIL_ALL' (the filter lets no string pass), or a
                     non-empty list or tuple of strings, each of which is
                     used as a regular expression.
                     The default is only inspected when it is actually
                     needed; an invalid default goes unnoticed as long as
                     the parameter is configured.

        Returns:
            A function that takes a target string and returns True if the
            string passes the filter, False otherwise.

        Raises:
            ValueError: when the default is needed and is neither
                        'PASS_ALL', 'FAIL_ALL' nor a non-empty list or
                        tuple.
    """
    # An empty list from the configuration is falsy, so the default takes
    # over whenever nothing usable is configured.
    filters = self.get_string_list(param) or default

    if filters == 'PASS_ALL':
        return lambda _: True
    if filters == 'FAIL_ALL':
        return lambda _: False

    if filters and isinstance(filters, (list, tuple)):
        # Compile once here so the returned closure only does matching.
        regexes = [re.compile(regex) for regex in filters]
        return lambda target: any(regex.fullmatch(target) for regex in regexes)

    # Adjacent literals instead of a backslash continuation inside the
    # string: the continuation would embed the source indentation in the
    # message text.
    raise ValueError("Default parameter must be a non-empty list or a string value "
                     "('PASS_ALL' or 'FAIL_ALL').")