add type hints for sanitizers

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 12 Jul 2022 21:15:19 +0000 (23:15 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 12 Jul 2022 21:15:19 +0000 (23:15 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py

index 9906596509bc1a97a63cf2b29c9a5f9b441e1928..dad35b7a9965c6a6d4c90149c5879c764ff8e5cc 100644 (file)
--- a/nominatim/data/postcode_format.py
+++ b/nominatim/data/postcode_format.py
@@ -79,7 +79,7 @@ class PostcodeFormatter:
          self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
  
  
          self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
  
  
-    def get_matcher(self, country_code: str) -> Optional[CountryPostcodeMatcher]:
+    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
          """ Return the CountryPostcodeMatcher for the given country.
              Returns None if the country doesn't have a postcode and the
              default matcher if there is no specific matcher configured for
          """ Return the CountryPostcodeMatcher for the given country.
              Returns None if the country doesn't have a postcode and the
              default matcher if there is no specific matcher configured for
@@ -88,10 +88,12 @@ class PostcodeFormatter:
          if country_code in self.country_without_postcode:
              return None
  
          if country_code in self.country_without_postcode:
              return None
  
+        assert country_code is not None
+
          return self.country_matcher.get(country_code, self.default_matcher)
  
  
          return self.country_matcher.get(country_code, self.default_matcher)
  
  
-    def match(self, country_code: str, postcode: str) -> Optional[Match[str]]:
+    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
          """ Match the given postcode against the postcode pattern for this
              matcher. Returns a `re.Match` object if the country has a pattern
              and the match was successful or None if the match failed.
          """ Match the given postcode against the postcode pattern for this
              matcher. Returns a `re.Match` object if the country has a pattern
              and the match was successful or None if the match failed.
@@ -99,6 +101,8 @@ class PostcodeFormatter:
          if country_code in self.country_without_postcode:
              return None
  
          if country_code in self.country_without_postcode:
              return None
  
+        assert country_code is not None
+
          return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
  
  
          return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
  
  
diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py

index 913b363c7f8c8d88e6adc44750706703503943da..3f548e061e4fdc9dc45e5f4711c35b061d66a1cd 100644 (file)
--- a/nominatim/tokenizer/place_sanitizer.py
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -8,100 +8,13 @@
  Handler for cleaning name and address tags in place information before it
  is handed to the token analysis.
  """
  Handler for cleaning name and address tags in place information before it
  is handed to the token analysis.
  """
+from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
  import importlib
  
  from nominatim.errors import UsageError
  from nominatim.tokenizer.sanitizers.config import SanitizerConfig
  import importlib
  
  from nominatim.errors import UsageError
  from nominatim.tokenizer.sanitizers.config import SanitizerConfig
-
-class PlaceName:
-    """ A searchable name for a place together with properties.
-        Every name object saves the name proper and two basic properties:
-        * 'kind' describes the name of the OSM key used without any suffixes
-          (i.e. the part after the colon removed)
-        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
-          is the part of the key after the first colon.
-        In addition to that, the name may have arbitrary additional attributes.
-        Which attributes are used, depends on the token analyser.
-    """
-
-    def __init__(self, name, kind, suffix):
-        self.name = name
-        self.kind = kind
-        self.suffix = suffix
-        self.attr = {}
-
-
-    def __repr__(self):
-        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
-
-
-    def clone(self, name=None, kind=None, suffix=None, attr=None):
-        """ Create a deep copy of the place name, optionally with the
-            given parameters replaced. In the attribute list only the given
-            keys are updated. The list is not replaced completely.
-            In particular, the function cannot to be used to remove an
-            attribute from a place name.
-        """
-        newobj = PlaceName(name or self.name,
-                           kind or self.kind,
-                           suffix or self.suffix)
-
-        newobj.attr.update(self.attr)
-        if attr:
-            newobj.attr.update(attr)
-
-        return newobj
-
-
-    def set_attr(self, key, value):
-        """ Add the given property to the name. If the property was already
-            set, then the value is overwritten.
-        """
-        self.attr[key] = value
-
-
-    def get_attr(self, key, default=None):
-        """ Return the given property or the value of 'default' if it
-            is not set.
-        """
-        return self.attr.get(key, default)
-
-
-    def has_attr(self, key):
-        """ Check if the given attribute is set.
-        """
-        return key in self.attr
-
-
-class _ProcessInfo:
-    """ Container class for information handed into to handler functions.
-        The 'names' and 'address' members are mutable. A handler must change
-        them by either modifying the lists place or replacing the old content
-        with a new list.
-    """
-
-    def __init__(self, place):
-        self.place = place
-        self.names = self._convert_name_dict(place.name)
-        self.address = self._convert_name_dict(place.address)
-
-
-    @staticmethod
-    def _convert_name_dict(names):
-        """ Convert a dictionary of names into a list of PlaceNames.
-            The dictionary key is split into the primary part of the key
-            and the suffix (the part after an optional colon).
-        """
-        out = []
-
-        if names:
-            for key, value in names.items():
-                parts = key.split(':', 1)
-                out.append(PlaceName(value.strip(),
-                                     parts[0].strip(),
-                                     parts[1].strip() if len(parts) > 1 else None))
-
-        return out
+from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
+from nominatim.data.place_info import PlaceInfo
  
  
  class PlaceSanitizer:
  
  
  class PlaceSanitizer:
@@ -109,24 +22,24 @@ class PlaceSanitizer:
          names and address before they are used by the token analysers.
      """
  
          names and address before they are used by the token analysers.
      """
  
-    def __init__(self, rules):
-        self.handlers = []
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
+        self.handlers: List[Callable[[ProcessInfo], None]] = []
  
          if rules:
              for func in rules:
                  if 'step' not in func:
                      raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                  module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
  
          if rules:
              for func in rules:
                  if 'step' not in func:
                      raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                  module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
-                handler_module = importlib.import_module(module_name)
+                handler_module: SanitizerHandler = importlib.import_module(module_name)
                  self.handlers.append(handler_module.create(SanitizerConfig(func)))
  
  
                  self.handlers.append(handler_module.create(SanitizerConfig(func)))
  
  
-    def process_names(self, place):
+    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
          """ Extract a sanitized list of names and address parts from the
              given place. The function returns a tuple
              (list of names, list of address names)
          """
          """ Extract a sanitized list of names and address parts from the
              given place. The function returns a tuple
              (list of names, list of address names)
          """
-        obj = _ProcessInfo(place)
+        obj = ProcessInfo(place)
  
          for func in self.handlers:
              func(obj)
  
          for func in self.handlers:
              func(obj)
diff --git a/nominatim/tokenizer/sanitizers/base.py b/nominatim/tokenizer/sanitizers/base.py

new file mode 100644 (file)

index 0000000..f2e1bc4
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/base.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for sanitizers.
+"""
+from typing import Optional, Dict, List, Mapping, Callable
+
+from typing_extensions import Protocol, Final
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.data.place_info import PlaceInfo
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name: str, kind: str, suffix: Optional[str]):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr: Dict[str, str] = {}
+
+
+    def __repr__(self) -> str:
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name: Optional[str] = None,
+              kind: Optional[str] = None,
+              suffix: Optional[str] = None,
+              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot to be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key: str, value: str) -> None:
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key: str) -> bool:
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class ProcessInfo:
+    """ Container class for information handed into to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place: PlaceInfo):
+        self.place: Final = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class SanitizerHandler(Protocol):
+    """ Protocol for sanitizer modules.
+    """
+
+    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+        """
+        A sanitizer must define a single function `create`. It takes the
+        dictionary with the configuration information for the sanitizer and
+        returns a function that transforms name and address.
+        """
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py

index c229716f8e8353d8d8788670b767ea0c9cebc22f..5df057d0506a7d4950c5e4db7291b7d7f45dc76f 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -24,11 +24,15 @@ Arguments:
                       or a list of strings, where each string is a regular
                       expression that must match the full house number value.
  """
                       or a list of strings, where each string is a regular
                       expression that must match the full house number value.
  """
+from typing import Callable, Iterator, List
  import re
  
  import re
  
+from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
  class _HousenumberSanitizer:
  
  class _HousenumberSanitizer:
  
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
          self.filter_kind = config.get_filter_kind('housenumber')
          self.split_regexp = config.get_delimiter()
  
          self.filter_kind = config.get_filter_kind('housenumber')
          self.split_regexp = config.get_delimiter()
  
@@ -37,13 +41,13 @@ class _HousenumberSanitizer:
  
  
  
  
  
  
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
          if not obj.address:
              return
  
          if not obj.address:
              return
  
-        new_address = []
+        new_address: List[PlaceName] = []
          for item in obj.address:
          for item in obj.address:
-            if self.filter_kind(item):
+            if self.filter_kind(item.kind):
                  if self._treat_as_name(item.name):
                      obj.names.append(item.clone(kind='housenumber'))
                  else:
                  if self._treat_as_name(item.name):
                      obj.names.append(item.clone(kind='housenumber'))
                  else:
@@ -56,7 +60,7 @@ class _HousenumberSanitizer:
          obj.address = new_address
  
  
          obj.address = new_address
  
  
-    def sanitize(self, value):
+    def sanitize(self, value: str) -> Iterator[str]:
          """ Extract housenumbers in a regularized format from an OSM value.
  
              The function works as a generator that yields all valid housenumbers
          """ Extract housenumbers in a regularized format from an OSM value.
  
              The function works as a generator that yields all valid housenumbers
@@ -67,16 +71,15 @@ class _HousenumberSanitizer:
                  yield from self._regularize(hnr)
  
  
                  yield from self._regularize(hnr)
  
  
-    @staticmethod
-    def _regularize(hnr):
+    def _regularize(self, hnr: str) -> Iterator[str]:
          yield hnr
  
  
          yield hnr
  
  
-    def _treat_as_name(self, housenumber):
+    def _treat_as_name(self, housenumber: str) -> bool:
          return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
  
  
          return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
  
  
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a housenumber processing function.
      """
  
      """ Create a housenumber processing function.
      """
  
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py

index 05e90ca122fa71eb4f8eb8f482bd15819fa623c2..cabacff41ee5810f587c3c2cb055c1c995ea81e8 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -20,11 +20,15 @@ Arguments:
                          objects that have no country assigned. These are always
                          assumed to have no postcode.
  """
                          objects that have no country assigned. These are always
                          assumed to have no postcode.
  """
+from typing import Callable, Optional, Tuple
+
  from nominatim.data.postcode_format import PostcodeFormatter
  from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
  
  class _PostcodeSanitizer:
  
  
  class _PostcodeSanitizer:
  
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
          self.convert_to_address = config.get_bool('convert-to-address', True)
          self.matcher = PostcodeFormatter()
  
          self.convert_to_address = config.get_bool('convert-to-address', True)
          self.matcher = PostcodeFormatter()
  
@@ -33,7 +37,7 @@ class _PostcodeSanitizer:
              self.matcher.set_default_pattern(default_pattern)
  
  
              self.matcher.set_default_pattern(default_pattern)
  
  
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
          if not obj.address:
              return
  
          if not obj.address:
              return
  
@@ -52,7 +56,7 @@ class _PostcodeSanitizer:
                  postcode.set_attr('variant', formatted[1])
  
  
                  postcode.set_attr('variant', formatted[1])
  
  
-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
          """ Check the postcode for correct formatting and return the
              normalized version. Returns None if the postcode does not
              correspond to the oficial format of the given country.
          """ Check the postcode for correct formatting and return the
              normalized version. Returns None if the postcode does not
              correspond to the oficial format of the given country.
@@ -61,13 +65,15 @@ class _PostcodeSanitizer:
          if match is None:
              return None
  
          if match is None:
              return None
  
+        assert country is not None
+
          return self.matcher.normalize(country, match),\
                 ' '.join(filter(lambda p: p is not None, match.groups()))
  
  
  
  
          return self.matcher.normalize(country, match),\
                 ' '.join(filter(lambda p: p is not None, match.groups()))
  
  
  
  
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a housenumber processing function.
      """
  
      """ Create a housenumber processing function.
      """
  
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py

index ce5ce1eb8b5606dd702efb2b582facf1a48a0626..fd05848b9c1420a1b1099ceaa209c130dc48333c 100644 (file)
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -7,20 +7,28 @@
  """
  Configuration for Sanitizers.
  """
  """
  Configuration for Sanitizers.
  """
+from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
  from collections import UserDict
  import re
  
  from nominatim.errors import UsageError
  
  from collections import UserDict
  import re
  
  from nominatim.errors import UsageError
  
-class SanitizerConfig(UserDict):
+# working around missing generics in Python < 3.8
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+    _BaseUserDict = UserDict[str, Any]
+else:
+    _BaseUserDict = UserDict
+
+class SanitizerConfig(_BaseUserDict):
      """ Dictionary with configuration options for a sanitizer.
  
      """ Dictionary with configuration options for a sanitizer.
  
-        In addition to the usualy dictionary function, the class provides
+        In addition to the usual dictionary function, the class provides
          accessors to standard sanatizer options that are used by many of the
          sanitizers.
      """
  
          accessors to standard sanatizer options that are used by many of the
          sanitizers.
      """
  
-    def get_string_list(self, param, default=tuple()):
+    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
          """ Extract a configuration parameter as a string list.
              If the parameter value is a simple string, it is returned as a
              one-item list. If the parameter value does not exist, the given
          """ Extract a configuration parameter as a string list.
              If the parameter value is a simple string, it is returned as a
              one-item list. If the parameter value does not exist, the given
@@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
          return values
  
  
          return values
  
  
-    def get_bool(self, param, default=None):
+    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
          """ Extract a configuration parameter as a boolean.
              The parameter must be one of the yaml boolean values or an
              user error will be raised. If `default` is given, then the parameter
          """ Extract a configuration parameter as a boolean.
              The parameter must be one of the yaml boolean values or an
              user error will be raised. If `default` is given, then the parameter
@@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
          return value
  
  
          return value
  
  
-    def get_delimiter(self, default=',;'):
+    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
          """ Return the 'delimiter' parameter in the configuration as a
              compiled regular expression that can be used to split the names on the
              delimiters. The regular expression makes sure that the resulting names
          """ Return the 'delimiter' parameter in the configuration as a
              compiled regular expression that can be used to split the names on the
              delimiters. The regular expression makes sure that the resulting names
@@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
          return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
  
  
          return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
  
  
-    def get_filter_kind(self, *default):
+    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
          """ Return a filter function for the name kind from the 'filter-kind'
              config parameter. The filter functions takes a name item and returns
              True when the item passes the filter.
          """ Return a filter function for the name kind from the 'filter-kind'
              config parameter. The filter functions takes a name item and returns
              True when the item passes the filter.
@@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):
  
          regexes = [re.compile(regex) for regex in filters]
  
  
          regexes = [re.compile(regex) for regex in filters]
  
-        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+        return lambda name: any(regex.fullmatch(name) for regex in regexes)
diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py

index c9db0a9da83b2e7878133dc5e63e4477dfff29e5..7d0667b4e323ce1aa060c3f0ae2738505173a476 100644 (file)
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -11,13 +11,18 @@ Arguments:
      delimiters: Define the set of characters to be used for
                  splitting the list. (default: ',;')
  """
      delimiters: Define the set of characters to be used for
                  splitting the list. (default: ',;')
  """
-def create(config):
+from typing import Callable
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a name processing function that splits name values with
          multiple values into their components.
      """
      regexp = config.get_delimiter()
  
      """ Create a name processing function that splits name values with
          multiple values into their components.
      """
      regexp = config.get_delimiter()
  
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
          if not obj.names:
              return
  
          if not obj.names:
              return
  
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py

index f8cdd035f5ddf483f0f1c318b069b8d817320f66..119d5693a58beee7ebfa2fa852b03cb312a5b441 100644 (file)
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
  addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
  only the main name part with the bracket part removed.
  """
  addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
  only the main name part with the bracket part removed.
  """
+from typing import Callable
  
  
-def create(_):
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a name processing function that creates additional name variants
          for bracket addendums.
      """
      """ Create a name processing function that creates additional name variants
          for bracket addendums.
      """
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
          """ Add variants for names that have a bracket extension.
          """
          if obj.names:
          """ Add variants for names that have a bracket extension.
          """
          if obj.names:
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

index d3413c1ac79d39b5be68a8092ded62b43659e589..6d6430f034e0c10dfae13555e137b40ccae19484 100644 (file)
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -30,13 +30,17 @@ Arguments:
            any analyzer tagged) is retained. (default: replace)
  
  """
            any analyzer tagged) is retained. (default: replace)
  
  """
+from typing import Callable, Dict, Optional, List
+
  from nominatim.data import country_info
  from nominatim.data import country_info
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
  
  class _AnalyzerByLanguage:
      """ Processor for tagging the language of names in a place.
      """
  
  
  class _AnalyzerByLanguage:
      """ Processor for tagging the language of names in a place.
      """
  
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
          self.filter_kind = config.get_filter_kind()
          self.replace = config.get('mode', 'replace') != 'append'
          self.whitelist = config.get('whitelist')
          self.filter_kind = config.get_filter_kind()
          self.replace = config.get('mode', 'replace') != 'append'
          self.whitelist = config.get('whitelist')
@@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
          self._compute_default_languages(config.get('use-defaults', 'no'))
  
  
          self._compute_default_languages(config.get('use-defaults', 'no'))
  
  
-    def _compute_default_languages(self, use_defaults):
-        self.deflangs = {}
+    def _compute_default_languages(self, use_defaults: str) -> None:
+        self.deflangs: Dict[Optional[str], List[str]] = {}
  
          if use_defaults in ('mono', 'all'):
              for ccode, clangs in country_info.iterate('languages'):
  
          if use_defaults in ('mono', 'all'):
              for ccode, clangs in country_info.iterate('languages'):
@@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
                          self.deflangs[ccode] = clangs
  
  
                          self.deflangs[ccode] = clangs
  
  
-    def _suffix_matches(self, suffix):
+    def _suffix_matches(self, suffix: str) -> bool:
          if self.whitelist is None:
              return len(suffix) in (2, 3) and suffix.islower()
  
          return suffix in self.whitelist
  
  
          if self.whitelist is None:
              return len(suffix) in (2, 3) and suffix.islower()
  
          return suffix in self.whitelist
  
  
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
          if not obj.names:
              return
  
          more_names = []
  
          for name in (n for n in obj.names
          if not obj.names:
              return
  
          more_names = []
  
          for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self.filter_kind(n)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
              if name.suffix:
                  langs = [name.suffix] if self._suffix_matches(name.suffix) else None
              else:
              if name.suffix:
                  langs = [name.suffix] if self._suffix_matches(name.suffix) else None
              else:
@@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
          obj.names.extend(more_names)
  
  
          obj.names.extend(more_names)
  
  
-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a function that sets the analyzer property depending on the
          language of the tag.
      """
      """ Create a function that sets the analyzer property depending on the
          language of the tag.
      """
diff --git a/test/python/tokenizer/sanitizers/test_sanitizer_config.py b/test/python/tokenizer/sanitizers/test_sanitizer_config.py

index 02794776cce6e3ed6caffc7262dd343ab5b140a3..0dbbc7a0dfdd7726ec8f1850c8efde23da093e94 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
  def test_create_kind_filter_no_params(inp):
      filt = SanitizerConfig().get_filter_kind()
  
  def test_create_kind_filter_no_params(inp):
      filt = SanitizerConfig().get_filter_kind()
  
-    assert filt(PlaceName('something', inp, ''))
+    assert filt(inp)
  
  
  @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
  def test_create_kind_filter_custom_regex_positive(kind):
      filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
  
  
  
  @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
  def test_create_kind_filter_custom_regex_positive(kind):
      filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
  
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
  def test_create_kind_filter_custom_regex_negative(kind):
      filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
  
  
  
  @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
  def test_create_kind_filter_custom_regex_negative(kind):
      filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
  
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
  def test_create_kind_filter_many_positive(kind):
      filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
  
  
  
  @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
  def test_create_kind_filter_many_positive(kind):
      filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
  
-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
  def test_create_kind_filter_many_negative(kind):
      filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
  
  
  
  @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
  def test_create_kind_filter_many_negative(kind):
      filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
  
-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 12 Jul 2022 21:15:19 +0000 (23:15 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
nominatim/data/postcode_format.py		patch \| blob \| history
nominatim/tokenizer/place_sanitizer.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/base.py	[new file with mode: 0644]	patch \| blob
nominatim/tokenizer/sanitizers/clean_housenumbers.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/clean_postcodes.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/config.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/split_name_list.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/strip_brace_terms.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py		patch \| blob \| history
test/python/tokenizer/sanitizers/test_sanitizer_config.py		patch \| blob \| history