X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a7e048484b0f9066d1c6ba422087c2fd2c635b16..26a5b59c287225515e679941d5fe48d0cc9fce79:/nominatim/tokenizer/token_analysis/config_variants.py?ds=inline diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py index 59ceeb22..1258373e 100644 --- a/nominatim/tokenizer/token_analysis/config_variants.py +++ b/nominatim/tokenizer/token_analysis/config_variants.py @@ -7,18 +7,23 @@ """ Parser for configuration for variants. """ -from collections import defaultdict, namedtuple +from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple +from collections import defaultdict import itertools import re -from icu import Transliterator - from nominatim.config import flatten_config_list from nominatim.errors import UsageError -ICUVariant = namedtuple('ICUVariant', ['source', 'replacement']) +class ICUVariant(NamedTuple): + """ A single replacement rule for variant creation. + """ + source: str + replacement: str + -def get_variant_config(rules, normalization_rules): +def get_variant_config(in_rules: Any, + normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]: """ Convert the variant definition from the configuration into replacement sets. @@ -26,13 +31,13 @@ def get_variant_config(rules, normalization_rules): used in the replacements. """ immediate = defaultdict(list) - chars = set() + chars: Set[str] = set() - if rules: - vset = set() - rules = flatten_config_list(rules, 'variants') + if in_rules: + vset: Set[ICUVariant] = set() + rules = flatten_config_list(in_rules, 'variants') - vmaker = _VariantMaker(normalization_rules) + vmaker = _VariantMaker(normalizer) for section in rules: for rule in (section.get('words') or []): @@ -51,22 +56,21 @@ def get_variant_config(rules, normalization_rules): class _VariantMaker: - """ Generater for all necessary ICUVariants from a single variant rule. + """ Generator for all necessary ICUVariants from a single variant rule. All text in rules is normalized to make sure the variants match later. """ - def __init__(self, norm_rules): - self.norm = Transliterator.createFromRules("rule_loader_normalization", - norm_rules) + def __init__(self, normalizer: Any) -> None: + self.norm = normalizer - def compute(self, rule): + def compute(self, rule: Any) -> Iterator[ICUVariant]: """ Generator for all ICUVariant tuples from a single variant rule. """ parts = re.split(r'(\|)?([=-])>', rule) if len(parts) != 4: - raise UsageError("Syntax error in variant rule: " + rule) + raise UsageError(f"Syntax error in variant rule: {rule}") decompose = parts[1] is None src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')] @@ -85,11 +89,11 @@ class _VariantMaker: yield ICUVariant(froms, tos) - def _parse_variant_word(self, name): + def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]: name = name.strip() match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name) if match is None or (match.group(1) == '~' and match.group(3) == '~'): - raise UsageError("Invalid variant word descriptor '{}'".format(name)) + raise UsageError(f"Invalid variant word descriptor '{name}'") norm_name = self.norm.transliterate(match.group(2)).strip() if not norm_name: return None @@ -102,7 +106,8 @@ _FLAG_MATCH = {'^': '^ ', '': ' '} -def _create_variants(src, preflag, postflag, repl, decompose): +def _create_variants(src: str, preflag: str, postflag: str, + repl: str, decompose: bool) -> Iterator[Tuple[str, str]]: if preflag == '~': postfix = _FLAG_MATCH[postflag] # suffix decomposition