1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Configuration for Sanitizers.
10 from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
11 from collections import UserDict
14 from ...errors import UsageError
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
#
# Only the type checker gets to see the parametrised form. At runtime the
# plain class is used, so `UserDict[str, Any]` is never evaluated on
# interpreters where `UserDict` cannot be subscripted.
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
    """ The `SanitizerConfig` class is a read-only dictionary
        with configuration options for the sanitizer.
        In addition to the usual dictionary functions, the class provides
        accessors to standard sanitizer options that are used by many of the
        sanitizers.
    """

    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
        """ Extract a configuration parameter as a string list.

            Arguments:
                param: Name of the configuration parameter.
                default: Takes a tuple or list of strings which will
                         be returned if the parameter is missing in the
                         sanitizer configuration.
                         Note that if this default parameter is not
                         provided then an empty list is returned.

            Returns:
                If the parameter value is a simple string, it is returned as a
                one-item list (an empty string yields an empty list). If the
                parameter value does not exist, the given default is returned
                as a list. If the parameter value is a list, it is checked to
                contain only strings before being returned.

            Raises:
                UsageError: The parameter is neither a string nor a list
                            consisting only of strings.
        """
        values = self.data.get(param, None)

        if values is None:
            # Hand out a fresh list so that the (possibly shared) default
            # can never be modified through the return value.
            return list(default)

        if isinstance(values, str):
            return [values] if values else []

        if not isinstance(values, (list, tuple)) \
           or any(not isinstance(value, str) for value in values):
            raise UsageError(f"Parameter '{param}' must be string or list of strings.")

        return values

    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
        """ Extract a configuration parameter as a boolean.

            Arguments:
                param: Name of the configuration parameter. The parameter must
                       contain one of the yaml boolean values or an
                       UsageError will be raised.
                default: Value to return, when the parameter is missing.
                         When set to `None`, the parameter must be defined.

            Returns:
                Boolean value of the given parameter.

            Raises:
                UsageError: The value is not a boolean. This includes the
                            case of a missing parameter without a default.
        """
        value = self.data.get(param, default)

        # A missing parameter with `default=None` ends up here as `None`
        # and is rejected together with all other non-boolean values.
        if not isinstance(value, bool):
            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")

        return value

    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
        """ Return the 'delimiters' parameter in the configuration as a
            compiled regular expression that can be used to split strings on
            these delimiters.

            Arguments:
                default: Delimiters to be used when 'delimiters' parameter
                         is not explicitly configured.

            Returns:
                A regular expression pattern which can be used to
                split a string. The regular expression makes sure that the
                resulting names are stripped and that repeated delimiters
                are ignored. It may still create empty fields on occasion. The
                code needs to filter those.

            Raises:
                UsageError: The delimiters are not given as a string (or list
                            of strings) or contain no characters at all.
        """
        configured = self.data.get('delimiters', default)

        if isinstance(configured, str):
            delimiter_set = set(configured)
        elif isinstance(configured, (list, tuple)) \
                and all(isinstance(item, str) for item in configured):
            # Every character of every list item counts as a delimiter.
            delimiter_set = set(''.join(configured))
        else:
            raise UsageError("Parameter 'delimiters' must be string or list of strings.")

        if not delimiter_set:
            raise UsageError("Empty 'delimiters' parameter not allowed for sanitizer.")

        # re.escape() only escapes characters that are special in a regular
        # expression. Blindly prefixing a backslash would turn letters into
        # escape sequences (e.g. 'd' into the digit class '\d').
        # Sorting makes the resulting pattern independent of set ordering.
        char_class = ''.join(re.escape(d) for d in sorted(delimiter_set))

        return re.compile(f'\\s*[{char_class}]+\\s*')

    def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
                   ) -> Callable[[str], bool]:
        """ Returns a filter function for the given parameter of the sanitizer
            configuration.

            The value provided for the parameter in sanitizer configuration
            should be a string or list of strings, where each string is a regular
            expression. These regular expressions will later be used by the
            filter function to filter strings.

            Arguments:
                param: The parameter for which the filter function
                       is requested.
                default: Defines the behaviour of filter function if
                         parameter is missing in the sanitizer configuration.
                         Takes a string(PASS_ALL or FAIL_ALL) or a list of strings.
                         Any other value of string or an empty list is not allowed,
                         and will raise a ValueError. If the value is PASS_ALL, the filter
                         function will let all strings to pass, if the value is FAIL_ALL,
                         filter function will let no strings to pass.
                         If value provided is a list of strings each string
                         is treated as a regular expression. In this case these regular
                         expressions will be used by the filter function.
                         By default allow filter function to let all strings pass.

            Returns:
                A filter function that takes a target string as the argument and
                returns True if it fully matches any of the regular expressions
                otherwise returns False.

            Raises:
                UsageError: The configured value is not a string or list of
                            strings, or contains an invalid regular expression.
                ValueError: The parameter is not configured and `default` is
                            neither 'PASS_ALL', 'FAIL_ALL' nor a non-empty
                            list of strings.
        """
        configured = self.get_string_list(param)

        if configured:
            # Patterns from the configuration are user input. Report broken
            # ones as a configuration problem instead of leaking re.error.
            try:
                regexes = [re.compile(regex) for regex in configured]
            except re.error as err:
                raise UsageError(f"Parameter '{param}' contains an invalid "
                                 f"regular expression: {err}") from err
            return lambda target: any(regex.fullmatch(target) for regex in regexes)

        # Nothing (or only an empty value) configured: fall back to the default.
        if default == 'PASS_ALL':
            return lambda _: True
        if default == 'FAIL_ALL':
            return lambda _: False

        if default and isinstance(default, (list, tuple)):
            regexes = [re.compile(regex) for regex in default]
            return lambda target: any(regex.fullmatch(target) for regex in regexes)

        raise ValueError("Default parameter must be a non-empty list or a string value "
                         "('PASS_ALL' or 'FAIL_ALL').")