From d35e3c25b66d25e4a3dd073c7ba17b0c204ec8e8 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 13 Jul 2022 17:18:53 +0200
Subject: [PATCH 1/1] add type annotations for token analysis

No annotations for ICU types yet.
---
 nominatim/tokenizer/token_analysis/base.py    | 45 +++++++++++++++++++
 .../token_analysis/config_variants.py         | 30 ++++++++-----
 nominatim/tokenizer/token_analysis/generic.py | 21 +++++----
 .../token_analysis/generic_mutation.py        |  9 ++--
 .../tokenizer/token_analysis/housenumbers.py  | 13 +++---
 .../tokenizer/token_analysis/postcodes.py     | 11 ++---
 6 files changed, 94 insertions(+), 35 deletions(-)
 create mode 100644 nominatim/tokenizer/token_analysis/base.py

diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py
new file mode 100644
index 00000000..b55b4f7c
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/base.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for analysers.
+"""
+from typing import TypeVar, Mapping, List, Any
+
+from typing_extensions import Protocol
+
+
+T_config = TypeVar('T_config') # pylint: disable=invalid-name
+
+class Analyser(Protocol):
+    """ Instance of the token analyser.
+    """
+
+    def normalize(self, name: str) -> str:
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+
+class AnalysisModule(Protocol[T_config]):
+    """ Protocol for analysis modules.
+    """
+
+    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> T_config:
+        """ Prepare the configuration of the analysis module.
+            This function should prepare all data that can be shared
+            between instances of this analyser.
+        """
+
+    def create(self, normalizer: Any, transliterator: Any, config: T_config) -> Analyser:
+        """ Create a new instance of the analyser.
+            A separate instance of the analyser is created for each thread
+            when used in multi-threading context.
+        """
diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py
index 067c4b5b..e0d1579d 100644
--- a/nominatim/tokenizer/token_analysis/config_variants.py
+++ b/nominatim/tokenizer/token_analysis/config_variants.py
@@ -7,7 +7,8 @@
 """
 Parser for configuration for variants.
 """
-from collections import defaultdict, namedtuple
+from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
+from collections import defaultdict
 import itertools
 import re
 
@@ -16,9 +17,15 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 
-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
+class ICUVariant(NamedTuple):
+    """ A single replacement rule for variant creation.
+    """
+    source: str
+    replacement: str
+
 
-def get_variant_config(rules, normalization_rules):
+def get_variant_config(in_rules: Any,
+                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
     """ Convert the variant definition from the configuration into
         replacement sets.
 
@@ -26,11 +33,11 @@ def get_variant_config(rules, normalization_rules):
         used in the replacements.
""" immediate = defaultdict(list) - chars = set() + chars: Set[str] = set() - if rules: - vset = set() - rules = flatten_config_list(rules, 'variants') + if in_rules: + vset: Set[ICUVariant] = set() + rules = flatten_config_list(in_rules, 'variants') vmaker = _VariantMaker(normalization_rules) @@ -56,12 +63,12 @@ class _VariantMaker: All text in rules is normalized to make sure the variants match later. """ - def __init__(self, norm_rules): + def __init__(self, norm_rules: Any) -> None: self.norm = Transliterator.createFromRules("rule_loader_normalization", norm_rules) - def compute(self, rule): + def compute(self, rule: Any) -> Iterator[ICUVariant]: """ Generator for all ICUVariant tuples from a single variant rule. """ parts = re.split(r'(\|)?([=-])>', rule) @@ -85,7 +92,7 @@ class _VariantMaker: yield ICUVariant(froms, tos) - def _parse_variant_word(self, name): + def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]: name = name.strip() match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) if match is None or (match.group(1) == '~' and match.group(3) == '~'): @@ -102,7 +109,8 @@ _FLAG_MATCH = {'^': '^ ', '': ' '} -def _create_variants(src, preflag, postflag, repl, decompose): +def _create_variants(src: str, preflag: str, postflag: str, + repl: str, decompose: bool) -> Iterator[Tuple[str, str]]: if preflag == '~': postfix = _FLAG_MATCH[postflag] # suffix decomposition diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index 3de915ba..e14f844c 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -7,6 +7,7 @@ """ Generic processor for names that creates abbreviation variants. """ +from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast import itertools import datrie @@ -17,10 +18,10 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG ### Configuration section -def configure(rules, normalization_rules): +def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]: """ Extract and preprocess the configuration for this module. """ - config = {} + config: Dict[str, Any] = {} config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), normalization_rules) @@ -47,7 +48,8 @@ def configure(rules, normalization_rules): ### Analysis section -def create(normalizer, transliterator, config): +def create(normalizer: Any, transliterator: Any, + config: Mapping[str, Any]) -> 'GenericTokenAnalysis': """ Create a new token analysis instance for this module. """ return GenericTokenAnalysis(normalizer, transliterator, config) @@ -58,7 +60,7 @@ class GenericTokenAnalysis: and provides the functions to apply the transformations. """ - def __init__(self, norm, to_ascii, config): + def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None: self.norm = norm self.to_ascii = to_ascii self.variant_only = config['variant_only'] @@ -75,14 +77,14 @@ class GenericTokenAnalysis: self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']] - def normalize(self, name): + def normalize(self, name: str) -> str: """ Return the normalized form of the name. This is the standard form from which possible variants for the name can be derived. 
""" - return self.norm.transliterate(name).strip() + return cast(str, self.norm.transliterate(name)).strip() - def get_variants_ascii(self, norm_name): + def get_variants_ascii(self, norm_name: str) -> List[str]: """ Compute the spelling variants for the given normalized name and transliterate the result. """ @@ -94,7 +96,8 @@ class GenericTokenAnalysis: return [name for name in self._transliterate_unique_list(norm_name, variants) if name] - def _transliterate_unique_list(self, norm_name, iterable): + def _transliterate_unique_list(self, norm_name: str, + iterable: Iterable[str]) -> Iterator[Optional[str]]: seen = set() if self.variant_only: seen.add(norm_name) @@ -105,7 +108,7 @@ class GenericTokenAnalysis: yield self.to_ascii.transliterate(variant).strip() - def _generate_word_variants(self, norm_name): + def _generate_word_variants(self, norm_name: str) -> Iterable[str]: baseform = '^ ' + norm_name + ' ^' baselen = len(baseform) partials = [''] diff --git a/nominatim/tokenizer/token_analysis/generic_mutation.py b/nominatim/tokenizer/token_analysis/generic_mutation.py index d23d5cd4..47154537 100644 --- a/nominatim/tokenizer/token_analysis/generic_mutation.py +++ b/nominatim/tokenizer/token_analysis/generic_mutation.py @@ -7,6 +7,7 @@ """ Creator for mutation variants for the generic token analysis. """ +from typing import Sequence, Iterable, Iterator, Tuple import itertools import logging import re @@ -15,7 +16,7 @@ from nominatim.errors import UsageError LOG = logging.getLogger() -def _zigzag(outer, inner): +def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]: return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue='')) @@ -26,7 +27,7 @@ class MutationVariantGenerator: patterns. """ - def __init__(self, pattern, replacements): + def __init__(self, pattern: str, replacements: Sequence[str]): self.pattern = re.compile(pattern) self.replacements = replacements @@ -36,7 +37,7 @@ class MutationVariantGenerator: raise UsageError("Bad mutation pattern in configuration.") - def generate(self, names): + def generate(self, names: Iterable[str]) -> Iterator[str]: """ Generator function for the name variants. 'names' is an iterable over a set of names for which the variants are to be generated. """ @@ -49,7 +50,7 @@ class MutationVariantGenerator: yield ''.join(_zigzag(parts, seps)) - def _fillers(self, num_parts): + def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]: """ Returns a generator for strings to join the given number of string parts in all possible combinations. """ diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py index 96e86b28..a0f4214d 100644 --- a/nominatim/tokenizer/token_analysis/housenumbers.py +++ b/nominatim/tokenizer/token_analysis/housenumbers.py @@ -8,6 +8,7 @@ Specialized processor for housenumbers. Analyses common housenumber patterns and creates variants for them. """ +from typing import Mapping, Any, List, cast import re from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator @@ -19,14 +20,14 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}') ### Configuration section -def configure(rules, normalization_rules): # pylint: disable=W0613 +def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613 """ All behaviour is currently hard-coded. 
""" return None ### Analysis section -def create(normalizer, transliterator, config): # pylint: disable=W0613 +def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613 """ Create a new token analysis instance for this module. """ return HousenumberTokenAnalysis(normalizer, transliterator) @@ -35,20 +36,20 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613 class HousenumberTokenAnalysis: """ Detects common housenumber patterns and normalizes them. """ - def __init__(self, norm, trans): + def __init__(self, norm: Any, trans: Any) -> None: self.norm = norm self.trans = trans self.mutator = MutationVariantGenerator('␣', (' ', '')) - def normalize(self, name): + def normalize(self, name: str) -> str: """ Return the normalized form of the housenumber. """ # shortcut for number-only numbers, which make up 90% of the data. if RE_NON_DIGIT.search(name) is None: return name - norm = self.trans.transliterate(self.norm.transliterate(name)) + norm = cast(str, self.trans.transliterate(self.norm.transliterate(name))) # If there is a significant non-numeric part, use as is. if RE_NAMED_PART.search(norm) is None: # Otherwise add optional spaces between digits and letters. @@ -60,7 +61,7 @@ class HousenumberTokenAnalysis: return norm - def get_variants_ascii(self, norm_name): + def get_variants_ascii(self, norm_name: str) -> List[str]: """ Compute the spelling variants for the given normalized housenumber. Generates variants for optional spaces (marked with '␣'). diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py index 18fc2a8d..15b20bf9 100644 --- a/nominatim/tokenizer/token_analysis/postcodes.py +++ b/nominatim/tokenizer/token_analysis/postcodes.py @@ -8,19 +8,20 @@ Specialized processor for postcodes. Supports a 'lookup' variant of the token, which produces variants with optional spaces. """ +from typing import Mapping, Any, List from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator ### Configuration section -def configure(rules, normalization_rules): # pylint: disable=W0613 +def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613 """ All behaviour is currently hard-coded. """ return None ### Analysis section -def create(normalizer, transliterator, config): # pylint: disable=W0613 +def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613 """ Create a new token analysis instance for this module. """ return PostcodeTokenAnalysis(normalizer, transliterator) @@ -38,20 +39,20 @@ class PostcodeTokenAnalysis: and transliteration, so that postcodes are correctly recognised by the search algorithm. """ - def __init__(self, norm, trans): + def __init__(self, norm: Any, trans: Any) -> None: self.norm = norm self.trans = trans self.mutator = MutationVariantGenerator(' ', (' ', '')) - def normalize(self, name): + def normalize(self, name: str) -> str: """ Return the standard form of the postcode. """ return name.strip().upper() - def get_variants_ascii(self, norm_name): + def get_variants_ascii(self, norm_name: str) -> List[str]: """ Compute the spelling variants for the given normalized postcode. Takes the canonical form of the postcode, normalizes it using the -- 2.39.5