nominatim/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 import itertools
  11
  12 import datrie
  13
  14 from nominatim.errors import UsageError
  15 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
  16 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
  17
  18 ### Configuration section
  19
  20 def configure(rules, normalization_rules):
  21     """ Extract and preprocess the configuration for this module.
  22     """
  23     config = {}
  24
  25     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
  26                                                                  normalization_rules)
  27     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  28     config['mutations'] = rules.get('mutations', [])
  29
  30     return config
  31
  32
  33 ### Analysis section
  34
  35 def create(transliterator, config):
  36     """ Create a new token analysis instance for this module.
  37     """
  38     return GenericTokenAnalysis(transliterator, config)
  39
  40
  41 class GenericTokenAnalysis:
  42     """ Collects the different transformation rules for normalisation of names
  43         and provides the functions to apply the transformations.
  44     """
  45
  46     def __init__(self, to_ascii, config):
  47         self.to_ascii = to_ascii
  48         self.variant_only = config['variant_only']
  49
  50         # Set up datrie
  51         if config['replacements']:
  52             self.replacements = datrie.Trie(config['chars'])
  53             for src, repllist in config['replacements']:
  54                 self.replacements[src] = repllist
  55         else:
  56             self.replacements = None
  57
  58         # set up mutation rules
  59         self.mutations = []
  60         for cfg in config['mutations']:
  61             if 'pattern' not in cfg:
  62                 raise UsageError("Missing field 'pattern' in mutation configuration.")
  63             if not isinstance(cfg['pattern'], str):
  64                 raise UsageError("Field 'pattern' in mutation configuration "
  65                                  "must be a simple text field.")
  66             if 'replacements' not in cfg:
  67                 raise UsageError("Missing field 'replacements' in mutation configuration.")
  68             if not isinstance(cfg['replacements'], list):
  69                 raise UsageError("Field 'replacements' in mutation configuration "
  70                                  "must be a list of texts.")
  71
  72             self.mutations.append(MutationVariantGenerator(cfg['pattern'],
  73                                                            cfg['replacements']))
  74
  75
  76     def get_variants_ascii(self, norm_name):
  77         """ Compute the spelling variants for the given normalized name
  78             and transliterate the result.
  79         """
  80         variants = self._generate_word_variants(norm_name)
  81
  82         for mutation in self.mutations:
  83             variants = mutation.generate(variants)
  84
  85         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  86
  87
  88     def _transliterate_unique_list(self, norm_name, iterable):
  89         seen = set()
  90         if self.variant_only:
  91             seen.add(norm_name)
  92
  93         for variant in map(str.strip, iterable):
  94             if variant not in seen:
  95                 seen.add(variant)
  96                 yield self.to_ascii.transliterate(variant).strip()
  97
  98
  99     def _generate_word_variants(self, norm_name):
 100         baseform = '^ ' + norm_name + ' ^'
 101         baselen = len(baseform)
 102         partials = ['']
 103
 104         startpos = 0
 105         if self.replacements is not None:
 106             pos = 0
 107             force_space = False
 108             while pos < baselen:
 109                 full, repl = self.replacements.longest_prefix_item(baseform[pos:],
 110                                                                    (None, None))
 111                 if full is not None:
 112                     done = baseform[startpos:pos]
 113                     partials = [v + done + r
 114                                 for v, r in itertools.product(partials, repl)
 115                                 if not force_space or r.startswith(' ')]
 116                     if len(partials) > 128:
 117                         # If too many variants are produced, they are unlikely
 118                         # to be helpful. Only use the original term.
 119                         startpos = 0
 120                         break
 121                     startpos = pos + len(full)
 122                     if full[-1] == ' ':
 123                         startpos -= 1
 124                         force_space = True
 125                     pos = startpos
 126                 else:
 127                     pos += 1
 128                     force_space = False
 129
 130         # No variants detected? Fast return.
 131         if startpos == 0:
 132             return (norm_name, )
 133
 134         if startpos < baselen:
 135             return (part[1:] + baseform[startpos:-1] for part in partials)
 136
 137         return (part[1:-1] for part in partials)