add type annotations for token analysis

author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Jul 2022 15:18:53 +0000 (17:18 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py

new file mode 100644 (file)

index 0000000..b55b4f7
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/base.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for analysers.
+"""
+from typing import TypeVar, Mapping, List, Any
+
+from typing_extensions import Protocol
+
+
+T_config = TypeVar('T_config') # pylint: disable=invalid-name
+
+class Analyser(Protocol):
+    """ Instance of the token analyser.
+    """
+
+    def normalize(self, name: str) -> str:
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+
+class AnalysisModule(Protocol[T_config]):
+    """ Protocol for analysis modules.
+    """
+
+    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> T_config:
+        """ Prepare the configuration of the analysis module.
+            This function should prepare all data that can be shared
+            between instances of this analyser.
+        """
+
+    def create(self, normalizer: Any, transliterator: Any, config: T_config) -> Analyser:
+        """ Create a new instance of the analyser.
+            A separate instance of the analyser is created for each thread
+            when used in multi-threading context.
+        """
diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py

index 067c4b5bd5e38eae5e093d0e894bbb2c85770d42..e0d1579d7fab880a94e40e4f07eb4fc654596e57 100644 (file)
--- a/nominatim/tokenizer/token_analysis/config_variants.py
+++ b/nominatim/tokenizer/token_analysis/config_variants.py
@@ -7,7 +7,8 @@
  """
  Parser for configuration for variants.
  """
-from collections import defaultdict, namedtuple
+from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
+from collections import defaultdict
  import itertools
  import re
  
@@ -16,9 +17,15 @@ from icu import Transliterator
  from nominatim.config import flatten_config_list
  from nominatim.errors import UsageError
  
-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
+class ICUVariant(NamedTuple):
+    """ A single replacement rule for variant creation.
+    """
+    source: str
+    replacement: str
+
  
-def get_variant_config(rules, normalization_rules):
+def get_variant_config(in_rules: Any,
+                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
      """ Convert the variant definition from the configuration into
          replacement sets.
  
@@ -26,11 +33,11 @@ def get_variant_config(rules, normalization_rules):
          used in the replacements.
      """
      immediate = defaultdict(list)
-    chars = set()
+    chars: Set[str] = set()
  
-    if rules:
-        vset = set()
-        rules = flatten_config_list(rules, 'variants')
+    if in_rules:
+        vset: Set[ICUVariant] = set()
+        rules = flatten_config_list(in_rules, 'variants')
  
          vmaker = _VariantMaker(normalization_rules)
  
@@ -56,12 +63,12 @@ class _VariantMaker:
          All text in rules is normalized to make sure the variants match later.
      """
  
-    def __init__(self, norm_rules):
+    def __init__(self, norm_rules: Any) -> None:
          self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                     norm_rules)
  
  
-    def compute(self, rule):
+    def compute(self, rule: Any) -> Iterator[ICUVariant]:
          """ Generator for all ICUVariant tuples from a single variant rule.
          """
          parts = re.split(r'(\|)?([=-])>', rule)
@@ -85,7 +92,7 @@ class _VariantMaker:
                      yield ICUVariant(froms, tos)
  
  
-    def _parse_variant_word(self, name):
+    def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
          name = name.strip()
          match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
          if match is None or (match.group(1) == '~' and match.group(3) == '~'):
@@ -102,7 +109,8 @@ _FLAG_MATCH = {'^': '^ ',
                 '': ' '}
  
  
-def _create_variants(src, preflag, postflag, repl, decompose):
+def _create_variants(src: str, preflag: str, postflag: str,
+                     repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
      if preflag == '~':
          postfix = _FLAG_MATCH[postflag]
          # suffix decomposition
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py

index 3de915ba5254e1859976dd7e9842247df5a58b98..e14f844c5d3ff969502e014d41a67ef35ef0378c 100644 (file)
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -7,6 +7,7 @@
  """
  Generic processor for names that creates abbreviation variants.
  """
+from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  import itertools
  
  import datrie
@@ -17,10 +18,10 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
  
  ### Configuration section
  
-def configure(rules, normalization_rules):
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
      """ Extract and preprocess the configuration for this module.
      """
-    config = {}
+    config: Dict[str, Any] = {}
  
      config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                   normalization_rules)
@@ -47,7 +48,8 @@ def configure(rules, normalization_rules):
  
  ### Analysis section
  
-def create(normalizer, transliterator, config):
+def create(normalizer: Any, transliterator: Any,
+           config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
      """ Create a new token analysis instance for this module.
      """
      return GenericTokenAnalysis(normalizer, transliterator, config)
@@ -58,7 +60,7 @@ class GenericTokenAnalysis:
          and provides the functions to apply the transformations.
      """
  
-    def __init__(self, norm, to_ascii, config):
+    def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
          self.norm = norm
          self.to_ascii = to_ascii
          self.variant_only = config['variant_only']
@@ -75,14 +77,14 @@ class GenericTokenAnalysis:
          self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  
  
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
          """ Return the normalized form of the name. This is the standard form
              from which possible variants for the name can be derived.
          """
-        return self.norm.transliterate(name).strip()
+        return cast(str, self.norm.transliterate(name)).strip()
  
  
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
          """ Compute the spelling variants for the given normalized name
              and transliterate the result.
          """
@@ -94,7 +96,8 @@ class GenericTokenAnalysis:
          return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  
  
-    def _transliterate_unique_list(self, norm_name, iterable):
+    def _transliterate_unique_list(self, norm_name: str,
+                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
          seen = set()
          if self.variant_only:
              seen.add(norm_name)
@@ -105,7 +108,7 @@ class GenericTokenAnalysis:
                  yield self.to_ascii.transliterate(variant).strip()
  
  
-    def _generate_word_variants(self, norm_name):
+    def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
          baseform = '^ ' + norm_name + ' ^'
          baselen = len(baseform)
          partials = ['']
diff --git a/nominatim/tokenizer/token_analysis/generic_mutation.py b/nominatim/tokenizer/token_analysis/generic_mutation.py

index d23d5cd46f7dbf9955588a19995ebbfbbe6c7067..47154537d0928d284aaf3482ea80a53dc028d9f2 100644 (file)
--- a/nominatim/tokenizer/token_analysis/generic_mutation.py
+++ b/nominatim/tokenizer/token_analysis/generic_mutation.py
@@ -7,6 +7,7 @@
  """
  Creator for mutation variants for the generic token analysis.
  """
+from typing import Sequence, Iterable, Iterator, Tuple
  import itertools
  import logging
  import re
@@ -15,7 +16,7 @@ from nominatim.errors import UsageError
  
  LOG = logging.getLogger()
  
-def _zigzag(outer, inner):
+def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
      return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
  
  
@@ -26,7 +27,7 @@ class MutationVariantGenerator:
          patterns.
      """
  
-    def __init__(self, pattern, replacements):
+    def __init__(self, pattern: str, replacements: Sequence[str]):
          self.pattern = re.compile(pattern)
          self.replacements = replacements
  
@@ -36,7 +37,7 @@ class MutationVariantGenerator:
              raise UsageError("Bad mutation pattern in configuration.")
  
  
-    def generate(self, names):
+    def generate(self, names: Iterable[str]) -> Iterator[str]:
          """ Generator function for the name variants. 'names' is an iterable
              over a set of names for which the variants are to be generated.
          """
@@ -49,7 +50,7 @@ class MutationVariantGenerator:
                      yield ''.join(_zigzag(parts, seps))
  
  
-    def _fillers(self, num_parts):
+    def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
          """ Returns a generator for strings to join the given number of string
              parts in all possible combinations.
          """
diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py

index 96e86b28e79982825898757756cf03cff0981475..a0f4214d55fee1b6862541409b7e2f6bab434b26 100644 (file)
--- a/nominatim/tokenizer/token_analysis/housenumbers.py
+++ b/nominatim/tokenizer/token_analysis/housenumbers.py
@@ -8,6 +8,7 @@
  Specialized processor for housenumbers. Analyses common housenumber patterns
  and creates variants for them.
  """
+from typing import Mapping, Any, List, cast
  import re
  
  from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
@@ -19,14 +20,14 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
  
  ### Configuration section
  
-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
      """ All behaviour is currently hard-coded.
      """
      return None
  
  ### Analysis section
  
-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
      """ Create a new token analysis instance for this module.
      """
      return HousenumberTokenAnalysis(normalizer, transliterator)
@@ -35,20 +36,20 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
  class HousenumberTokenAnalysis:
      """ Detects common housenumber patterns and normalizes them.
      """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
          self.norm = norm
          self.trans = trans
  
          self.mutator = MutationVariantGenerator('␣', (' ', ''))
  
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
          """ Return the normalized form of the housenumber.
          """
          # shortcut for number-only numbers, which make up 90% of the data.
          if RE_NON_DIGIT.search(name) is None:
              return name
  
-        norm = self.trans.transliterate(self.norm.transliterate(name))
+        norm = cast(str, self.trans.transliterate(self.norm.transliterate(name)))
          # If there is a significant non-numeric part, use as is.
          if RE_NAMED_PART.search(norm) is None:
              # Otherwise add optional spaces between digits and letters.
@@ -60,7 +61,7 @@ class HousenumberTokenAnalysis:
  
          return norm
  
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
          """ Compute the spelling variants for the given normalized housenumber.
  
              Generates variants for optional spaces (marked with '␣').
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py

index 18fc2a8ded2918b61884639d2ee769617d3ece65..15b20bf915b3f48ba462e55c0441acf18038ceb5 100644 (file)
--- a/nominatim/tokenizer/token_analysis/postcodes.py
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -8,19 +8,20 @@
  Specialized processor for postcodes. Supports a 'lookup' variant of the
  token, which produces variants with optional spaces.
  """
+from typing import Mapping, Any, List
  
  from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
  
  ### Configuration section
  
-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
      """ All behaviour is currently hard-coded.
      """
      return None
  
  ### Analysis section
  
-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
      """ Create a new token analysis instance for this module.
      """
      return PostcodeTokenAnalysis(normalizer, transliterator)
@@ -38,20 +39,20 @@ class PostcodeTokenAnalysis:
          and transliteration, so that postcodes are correctly recognised by
          the search algorithm.
      """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
          self.norm = norm
          self.trans = trans
  
          self.mutator = MutationVariantGenerator(' ', (' ', ''))
  
  
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
          """ Return the standard form of the postcode.
          """
          return name.strip().upper()
  
  
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
          """ Compute the spelling variants for the given normalized postcode.
  
              Takes the canonical form of the postcode, normalizes it using the
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Jul 2022 15:18:53 +0000 (17:18 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
nominatim/tokenizer/token_analysis/base.py	[new file with mode: 0644]	patch | blob
nominatim/tokenizer/token_analysis/config_variants.py		patch | blob | history
nominatim/tokenizer/token_analysis/generic.py		patch | blob | history
nominatim/tokenizer/token_analysis/generic_mutation.py		patch | blob | history
nominatim/tokenizer/token_analysis/housenumbers.py		patch | blob | history
nominatim/tokenizer/token_analysis/postcodes.py		patch | blob | history