X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/610f2cc254cf442c895351907be6405f03026903..2dafc4cf4fc46ea4be293a458d565fbff645ac28:/nominatim/tokenizer/sanitizers/config.py?ds=sidebyside diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py index 3ff681b9..8b9164c6 100644 --- a/nominatim/tokenizer/sanitizers/config.py +++ b/nominatim/tokenizer/sanitizers/config.py @@ -7,25 +7,39 @@ """ Configuration for Sanitizers. """ +from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING from collections import UserDict import re from nominatim.errors import UsageError -class SanitizerConfig(UserDict): - """ Dictionary with configuration options for a sanitizer. - - In addition to the usualy dictionary function, the class provides - accessors to standard sanatizer options that are used by many of the +# working around missing generics in Python < 3.8 +# See https://github.com/python/typing/issues/60#issuecomment-869757075 +if TYPE_CHECKING: + _BaseUserDict = UserDict[str, Any] +else: + _BaseUserDict = UserDict + +class SanitizerConfig(_BaseUserDict): + """ The `SanitizerConfig` class is a read-only dictionary + with configuration options for the sanitizer. + In addition to the usual dictionary functions, the class provides + accessors to standard sanitizer options that are used by many of the sanitizers. """ - def get_string_list(self, param, default=tuple()): + def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]: """ Extract a configuration parameter as a string list. - If the parameter value is a simple string, it is returned as a - one-item list. If the parameter value does not exist, the given - default is returned. If the parameter value is a list, it is checked - to contain only strings before being returned. + + Arguments: + param: Name of the configuration parameter. + default: Value to return, when the parameter is missing. + + Returns: + If the parameter value is a simple string, it is returned as a + one-item list. If the parameter value does not exist, the given + default is returned. If the parameter value is a list, it is + checked to contain only strings before being returned. """ values = self.data.get(param, None) @@ -33,7 +47,7 @@ class SanitizerConfig(UserDict): return None if default is None else list(default) if isinstance(values, str): - return [values] + return [values] if values else [] if not isinstance(values, (list, tuple)): raise UsageError(f"Parameter '{param}' must be string or list of strings.") @@ -44,16 +58,42 @@ class SanitizerConfig(UserDict): return values - def get_delimiter(self, default=',;'): - """ Return the 'delimiter' parameter in the configuration as a - compiled regular expression that can be used to split the names on the - delimiters. The regular expression makes sure that the resulting names - are stripped and that repeated delimiters - are ignored but it will still create empty fields on occasion. The - code needs to filter those. + def get_bool(self, param: str, default: Optional[bool] = None) -> bool: + """ Extract a configuration parameter as a boolean. + + Arguments: + param: Name of the configuration parameter. The parameter must + contain one of the yaml boolean values or an + UsageError will be raised. + default: Value to return, when the parameter is missing. + When set to `None`, the parameter must be defined. - The 'default' parameter defines the delimiter set to be used when - not explicitly configured. + Returns: + Boolean value of the given parameter. + """ + value = self.data.get(param, default) + + if not isinstance(value, bool): + raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.") + + return value + + + def get_delimiter(self, default: str = ',;') -> Pattern[str]: + """ Return the 'delimiters' parameter in the configuration as a + compiled regular expression that can be used to split strings on + these delimiters. + + Arguments: + default: Delimiters to be used when 'delimiters' parameter + is not explicitly configured. + + Returns: + A regular expression pattern which can be used to + split a string. The regular expression makes sure that the + resulting names are stripped and that repeated delimiters + are ignored. It may still create empty fields on occasion. The + code needs to filter those. """ delimiter_set = set(self.data.get('delimiters', default)) if not delimiter_set: @@ -62,15 +102,24 @@ class SanitizerConfig(UserDict): return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set))) - def get_filter_kind(self, *default): + def get_filter_kind(self, *default: str) -> Callable[[str], bool]: """ Return a filter function for the name kind from the 'filter-kind' - config parameter. The filter functions takes a name item and returns - True when the item passes the filter. + config parameter. - If the parameter is empty, the filter lets all items pass. If the - paramter is a string, it is interpreted as a single regular expression - that must match the full kind string. If the parameter is a list then + If the 'filter-kind' parameter is empty, the filter lets all items + pass. If the parameter is a string, it is interpreted as a single + regular expression that must match the full kind string. + If the parameter is a list then any of the regular expressions in the list must match to pass. + + Arguments: + default: Filters to be used, when the 'filter-kind' parameter + is not specified. If omitted then the default is to + let all names pass. + + Returns: + A filter function which takes a name string and returns + True when the item passes the filter. """ filters = self.get_string_list('filter-kind', default) @@ -79,4 +128,4 @@ class SanitizerConfig(UserDict): regexes = [re.compile(regex) for regex in filters] - return lambda name: any(regex.fullmatch(name.kind) for regex in regexes) + return lambda name: any(regex.fullmatch(name) for regex in regexes)