X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/c314a3092c5b51c7782015f6fa9ac093b46fa174..bfd1c83cb0db3166a4e2f00b2677477dfcc40bbc:/src/nominatim_db/tokenizer/token_analysis/generic.py?ds=sidebyside

diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py
index 30f1944e..b01cebf7 100644
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -2,30 +2,29 @@
 #
 # This file is part of Nominatim. (https://nominatim.org)
 #
-# Copyright (C) 2024 by the Nominatim developer community.
+# Copyright (C) 2025 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 Generic processor for names that creates abbreviation variants.
 """
-from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
+from typing import Mapping, Dict, Any, Iterable, Optional, List, cast, Tuple
 import itertools
 
-import datrie
-
 from ...errors import UsageError
 from ...data.place_name import PlaceName
 from .config_variants import get_variant_config
 from .generic_mutation import MutationVariantGenerator
+from .simple_trie import SimpleTrie
+
+# Configuration section
 
-### Configuration section
 
 def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
     """ Extract and preprocess the configuration for this module.
     """
     config: Dict[str, Any] = {}
 
-    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalizer)
+    config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
 
     # parse mutation rules
@@ -47,7 +46,7 @@ def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, An
     return config
 
 
-### Analysis section
+# Analysis section
 
 def create(normalizer: Any, transliterator: Any,
            config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
@@ -67,25 +66,19 @@ class GenericTokenAnalysis:
         self.variant_only = config['variant_only']
 
         # Set up datrie
-        if config['replacements']:
-            self.replacements = datrie.Trie(config['chars'])
-            for src, repllist in config['replacements']:
-                self.replacements[src] = repllist
-        else:
-            self.replacements = None
+        self.replacements: Optional[SimpleTrie[List[str]]] = \
+            SimpleTrie(config['replacements']) if config['replacements'] else None
 
         # set up mutation rules
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
 
-
     def get_canonical_id(self, name: PlaceName) -> str:
         """ Return the normalized form of the name. This is the standard form
            from which possible variants for the name can be derived.
         """
         return cast(str, self.norm.transliterate(name.name)).strip()
 
-
-    def compute_variants(self, norm_name: str) -> List[str]:
+    def compute_variants(self, norm_name: str) -> Tuple[List[str], List[str]]:
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
""" @@ -94,20 +87,20 @@ class GenericTokenAnalysis: for mutation in self.mutations: variants = mutation.generate(variants) - return [name for name in self._transliterate_unique_list(norm_name, variants) if name] - - - def _transliterate_unique_list(self, norm_name: str, - iterable: Iterable[str]) -> Iterator[Optional[str]]: - seen = set() + varset = set(map(str.strip, variants)) if self.variant_only: - seen.add(norm_name) + varset.discard(norm_name) + + trans = [] + norm = [] - for variant in map(str.strip, iterable): - if variant not in seen: - seen.add(variant) - yield self.to_ascii.transliterate(variant).strip() + for var in varset: + t = self.to_ascii.transliterate(var).strip() + if t: + trans.append(t) + norm.append(var) + return trans, norm def _generate_word_variants(self, norm_name: str) -> Iterable[str]: baseform = '^ ' + norm_name + ' ^' @@ -119,10 +112,10 @@ class GenericTokenAnalysis: pos = 0 force_space = False while pos < baselen: - full, repl = self.replacements.longest_prefix_item(baseform[pos:], - (None, None)) - if full is not None: - done = baseform[startpos:pos] + frm = pos + repl, pos = self.replacements.longest_prefix(baseform, pos) + if repl is not None: + done = baseform[startpos:frm] partials = [v + done + r for v, r in itertools.product(partials, repl) if not force_space or r.startswith(' ')] @@ -131,11 +124,10 @@ class GenericTokenAnalysis: # to be helpful. Only use the original term. startpos = 0 break - startpos = pos + len(full) - if full[-1] == ' ': - startpos -= 1 + if baseform[pos - 1] == ' ': + pos -= 1 force_space = True - pos = startpos + startpos = pos else: pos += 1 force_space = False