src/nominatim_db/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2025 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 from typing import Mapping, Dict, Any, Iterable, Optional, List, cast, Tuple
  11 import itertools
  12
  13 from ...errors import UsageError
  14 from ...data.place_name import PlaceName
  15 from .config_variants import get_variant_config
  16 from .generic_mutation import MutationVariantGenerator
  17 from .simple_trie import SimpleTrie
  18
  19 # Configuration section
  20
  21
  22 def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
  23     """ Extract and preprocess the configuration for this module.
  24     """
  25     config: Dict[str, Any] = {}
  26
  27     config['replacements'], _ = get_variant_config(rules.get('variants'), normalizer)
  28     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  29
  30     # parse mutation rules
  31     config['mutations'] = []
  32     for rule in rules.get('mutations', []):
  33         if 'pattern' not in rule:
  34             raise UsageError("Missing field 'pattern' in mutation configuration.")
  35         if not isinstance(rule['pattern'], str):
  36             raise UsageError("Field 'pattern' in mutation configuration "
  37                              "must be a simple text field.")
  38         if 'replacements' not in rule:
  39             raise UsageError("Missing field 'replacements' in mutation configuration.")
  40         if not isinstance(rule['replacements'], list):
  41             raise UsageError("Field 'replacements' in mutation configuration "
  42                              "must be a list of texts.")
  43
  44         config['mutations'].append((rule['pattern'], rule['replacements']))
  45
  46     return config
  47
  48
  49 # Analysis section
  50
  51 def create(normalizer: Any, transliterator: Any,
  52            config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
  53     """ Create a new token analysis instance for this module.
  54     """
  55     return GenericTokenAnalysis(normalizer, transliterator, config)
  56
  57
  58 class GenericTokenAnalysis:
  59     """ Collects the different transformation rules for normalisation of names
  60         and provides the functions to apply the transformations.
  61     """
  62
  63     def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
  64         self.norm = norm
  65         self.to_ascii = to_ascii
  66         self.variant_only = config['variant_only']
  67
  68         # Set up datrie
  69         self.replacements: Optional[SimpleTrie[List[str]]] = \
  70             SimpleTrie(config['replacements']) if config['replacements'] else None
  71
  72         # set up mutation rules
  73         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  74
  75     def get_canonical_id(self, name: PlaceName) -> str:
  76         """ Return the normalized form of the name. This is the standard form
  77             from which possible variants for the name can be derived.
  78         """
  79         return cast(str, self.norm.transliterate(name.name)).strip()
  80
  81     def compute_variants(self, norm_name: str) -> Tuple[List[str], List[str]]:
  82         """ Compute the spelling variants for the given normalized name
  83             and transliterate the result.
  84         """
  85         variants = self._generate_word_variants(norm_name)
  86
  87         for mutation in self.mutations:
  88             variants = mutation.generate(variants)
  89
  90         varset = set(map(str.strip, variants))
  91         if self.variant_only:
  92             varset.discard(norm_name)
  93
  94         trans = []
  95         norm = []
  96
  97         for var in varset:
  98             t = self.to_ascii.transliterate(var).strip()
  99             if t:
 100                 trans.append(t)
 101                 norm.append(var)
 102
 103         return trans, norm
 104
 105     def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
 106         baseform = '^ ' + norm_name + ' ^'
 107         baselen = len(baseform)
 108         partials = ['']
 109
 110         startpos = 0
 111         if self.replacements is not None:
 112             pos = 0
 113             force_space = False
 114             while pos < baselen:
 115                 frm = pos
 116                 repl, pos = self.replacements.longest_prefix(baseform, pos)
 117                 if repl is not None:
 118                     done = baseform[startpos:frm]
 119                     partials = [v + done + r
 120                                 for v, r in itertools.product(partials, repl)
 121                                 if not force_space or r.startswith(' ')]
 122                     if len(partials) > 128:
 123                         # If too many variants are produced, they are unlikely
 124                         # to be helpful. Only use the original term.
 125                         startpos = 0
 126                         break
 127                     if baseform[pos - 1] == ' ':
 128                         pos -= 1
 129                         force_space = True
 130                     startpos = pos
 131                 else:
 132                     pos += 1
 133                     force_space = False
 134
 135         # No variants detected? Fast return.
 136         if startpos == 0:
 137             return (norm_name, )
 138
 139         if startpos < baselen:
 140             return (part[1:] + baseform[startpos:-1] for part in partials)
 141
 142         return (part[1:-1] for part in partials)