src/nominatim_db/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2025 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
  11 import itertools
  12
  13 from ...errors import UsageError
  14 from ...data.place_name import PlaceName
  15 from .config_variants import get_variant_config
  16 from .generic_mutation import MutationVariantGenerator
  17 from .simple_trie import SimpleTrie
  18
  19 # Configuration section
  20
  21
  22 def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
  23     """ Extract and preprocess the configuration for this module.
  24     """
  25     config: Dict[str, Any] = {}
  26
  27     config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
  28     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  29
  30     # parse mutation rules
  31     config['mutations'] = []
  32     for rule in rules.get('mutations', []):
  33         if 'pattern' not in rule:
  34             raise UsageError("Missing field 'pattern' in mutation configuration.")
  35         if not isinstance(rule['pattern'], str):
  36             raise UsageError("Field 'pattern' in mutation configuration "
  37                              "must be a simple text field.")
  38         if 'replacements' not in rule:
  39             raise UsageError("Missing field 'replacements' in mutation configuration.")
  40         if not isinstance(rule['replacements'], list):
  41             raise UsageError("Field 'replacements' in mutation configuration "
  42                              "must be a list of texts.")
  43
  44         config['mutations'].append((rule['pattern'], rule['replacements']))
  45
  46     return config
  47
  48
  49 # Analysis section
  50
  51 def create(normalizer: Any, transliterator: Any,
  52            config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
  53     """ Create a new token analysis instance for this module.
  54     """
  55     return GenericTokenAnalysis(normalizer, transliterator, config)
  56
  57
  58 class GenericTokenAnalysis:
  59     """ Collects the different transformation rules for normalisation of names
  60         and provides the functions to apply the transformations.
  61     """
  62
  63     def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
  64         self.norm = norm
  65         self.to_ascii = to_ascii
  66         self.variant_only = config['variant_only']
  67
  68         # Set up datrie
  69         self.replacements: Optional[SimpleTrie[List[str]]] = \
  70             SimpleTrie(config['replacements']) if config['replacements'] else None
  71
  72         # set up mutation rules
  73         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  74
  75     def get_canonical_id(self, name: PlaceName) -> str:
  76         """ Return the normalized form of the name. This is the standard form
  77             from which possible variants for the name can be derived.
  78         """
  79         return cast(str, self.norm.transliterate(name.name)).strip()
  80
  81     def compute_variants(self, norm_name: str) -> List[str]:
  82         """ Compute the spelling variants for the given normalized name
  83             and transliterate the result.
  84         """
  85         variants = self._generate_word_variants(norm_name)
  86
  87         for mutation in self.mutations:
  88             variants = mutation.generate(variants)
  89
  90         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  91
  92     def _transliterate_unique_list(self, norm_name: str,
  93                                    iterable: Iterable[str]) -> Iterator[Optional[str]]:
  94         seen = set()
  95         if self.variant_only:
  96             seen.add(norm_name)
  97
  98         for variant in map(str.strip, iterable):
  99             if variant not in seen:
 100                 seen.add(variant)
 101                 yield self.to_ascii.transliterate(variant).strip()
 102
 103     def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
 104         baseform = '^ ' + norm_name + ' ^'
 105         baselen = len(baseform)
 106         partials = ['']
 107
 108         startpos = 0
 109         if self.replacements is not None:
 110             pos = 0
 111             force_space = False
 112             while pos < baselen:
 113                 frm = pos
 114                 repl, pos = self.replacements.longest_prefix(baseform, pos)
 115                 if repl is not None:
 116                     done = baseform[startpos:frm]
 117                     partials = [v + done + r
 118                                 for v, r in itertools.product(partials, repl)
 119                                 if not force_space or r.startswith(' ')]
 120                     if len(partials) > 128:
 121                         # If too many variants are produced, they are unlikely
 122                         # to be helpful. Only use the original term.
 123                         startpos = 0
 124                         break
 125                     if baseform[pos - 1] == ' ':
 126                         pos -= 1
 127                         force_space = True
 128                     startpos = pos
 129                 else:
 130                     pos += 1
 131                     force_space = False
 132
 133         # No variants detected? Fast return.
 134         if startpos == 0:
 135             return (norm_name, )
 136
 137         if startpos < baselen:
 138             return (part[1:] + baseform[startpos:-1] for part in partials)
 139
 140         return (part[1:-1] for part in partials)