1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Configuration for Sanitizers.
10 from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
11 from collections import UserDict
14 from nominatim.errors import UsageError
# Work around UserDict not being subscriptable at runtime in Python
# versions before 3.9: only static type checkers get to see the
# parametrised form, the interpreter always uses the plain class.
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
    """ Dictionary with configuration options for a sanitizer.

        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
        """ Extract a configuration parameter as a string list.

            If the parameter value is a simple string, it is returned as a
            one-item list (an empty string yields an empty list). If the
            parameter value does not exist, a list copy of the given
            default is returned. If the parameter value is a list or tuple,
            it is checked to contain only strings before being returned
            unchanged.

            Raises a UsageError when the value is neither a string nor a
            list/tuple that consists only of strings.
        """
        values = self.data.get(param, None)

        if values is None:
            # An explicit default of None is passed through unchanged so
            # that a caller can distinguish "not configured" from "empty".
            return None if default is None else list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        if any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        return values


    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.

            The parameter must be one of the yaml boolean values or a
            UsageError will be raised. If `default` is given, then the
            parameter may also be missing. A parameter that is present
            but not a boolean (including an empty value) is always
            rejected.
        """
        value = self.data.get(param, default)

        # Without a boolean default, a missing parameter ends up as None
        # here and is reported as a usage error as well.
        if not isinstance(value, bool):
            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")

        return value


    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split the names
            on the delimiters. The regular expression makes sure that the
            resulting names are stripped and that repeated delimiters
            are ignored but it will still create empty fields on occasion.
            The code needs to filter those.

            The 'default' parameter defines the delimiter set to be used
            when not explicitly configured.

            Raises a UsageError when the resulting delimiter set is empty.
        """
        delimiter_set = set(self.data.get('delimiters', default))
        if not delimiter_set:
            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

        # re.escape() only quotes characters that are special in a regular
        # expression. Blindly prefixing every delimiter with a backslash
        # would turn letters and digits into escape sequences ('d' -> \d).
        # Sorting merely makes the resulting pattern string deterministic.
        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

        return re.compile(f'\\s*[{char_class}]+\\s*')


    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
        """ Return a filter function for the name kind from the 'filter-kind'
            config parameter. The filter function takes the kind string of
            a name item and returns True when the item passes the filter.

            If the parameter is empty, the filter lets all items pass. If
            the parameter is a string, it is interpreted as a single regular
            expression that must match the full kind string. If the
            parameter is a list then any of the regular expressions in the
            list must match to pass.

            The positional `default` arguments are the expressions to use
            when 'filter-kind' is not configured.
        """
        filters = self.get_string_list('filter-kind', default)

        if not filters:
            return lambda _: True

        # Compile once here, the returned closure may be called many times.
        regexes = [re.compile(regex) for regex in filters]

        return lambda name: any(regex.fullmatch(name) for regex in regexes)