nominatim/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 import itertools
  11
  12 import datrie
  13
  14 from nominatim.errors import UsageError
  15 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
  16 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
  17
  18 ### Configuration section
  19
  20 def configure(rules, normalization_rules):
  21     """ Extract and preprocess the configuration for this module.
  22     """
  23     config = {}
  24
  25     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
  26                                                                  normalization_rules)
  27     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  28
  29     # parse mutation rules
  30     config['mutations'] = []
  31     for rule in rules.get('mutations', []):
  32         if 'pattern' not in rule:
  33             raise UsageError("Missing field 'pattern' in mutation configuration.")
  34         if not isinstance(rule['pattern'], str):
  35             raise UsageError("Field 'pattern' in mutation configuration "
  36                              "must be a simple text field.")
  37         if 'replacements' not in rule:
  38             raise UsageError("Missing field 'replacements' in mutation configuration.")
  39         if not isinstance(rule['replacements'], list):
  40             raise UsageError("Field 'replacements' in mutation configuration "
  41                              "must be a list of texts.")
  42
  43         config['mutations'].append((rule['pattern'], rule['replacements']))
  44
  45     return config
  46
  47
  48 ### Analysis section
  49
  50 def create(normalizer, transliterator, config):
  51     """ Create a new token analysis instance for this module.
  52     """
  53     return GenericTokenAnalysis(normalizer, transliterator, config)
  54
  55
  56 class GenericTokenAnalysis:
  57     """ Collects the different transformation rules for normalisation of names
  58         and provides the functions to apply the transformations.
  59     """
  60
  61     def __init__(self, norm, to_ascii, config):
  62         self.norm = norm
  63         self.to_ascii = to_ascii
  64         self.variant_only = config['variant_only']
  65
  66         # Set up datrie
  67         if config['replacements']:
  68             self.replacements = datrie.Trie(config['chars'])
  69             for src, repllist in config['replacements']:
  70                 self.replacements[src] = repllist
  71         else:
  72             self.replacements = None
  73
  74         # set up mutation rules
  75         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  76
  77
  78     def normalize(self, name):
  79         """ Return the normalized form of the name. This is the standard form
  80             from which possible variants for the name can be derived.
  81         """
  82         return self.norm.transliterate(name).strip()
  83
  84
  85     def get_variants_ascii(self, norm_name):
  86         """ Compute the spelling variants for the given normalized name
  87             and transliterate the result.
  88         """
  89         variants = self._generate_word_variants(norm_name)
  90
  91         for mutation in self.mutations:
  92             variants = mutation.generate(variants)
  93
  94         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  95
  96
  97     def _transliterate_unique_list(self, norm_name, iterable):
  98         seen = set()
  99         if self.variant_only:
 100             seen.add(norm_name)
 101
 102         for variant in map(str.strip, iterable):
 103             if variant not in seen:
 104                 seen.add(variant)
 105                 yield self.to_ascii.transliterate(variant).strip()
 106
 107
 108     def _generate_word_variants(self, norm_name):
 109         baseform = '^ ' + norm_name + ' ^'
 110         baselen = len(baseform)
 111         partials = ['']
 112
 113         startpos = 0
 114         if self.replacements is not None:
 115             pos = 0
 116             force_space = False
 117             while pos < baselen:
 118                 full, repl = self.replacements.longest_prefix_item(baseform[pos:],
 119                                                                    (None, None))
 120                 if full is not None:
 121                     done = baseform[startpos:pos]
 122                     partials = [v + done + r
 123                                 for v, r in itertools.product(partials, repl)
 124                                 if not force_space or r.startswith(' ')]
 125                     if len(partials) > 128:
 126                         # If too many variants are produced, they are unlikely
 127                         # to be helpful. Only use the original term.
 128                         startpos = 0
 129                         break
 130                     startpos = pos + len(full)
 131                     if full[-1] == ' ':
 132                         startpos -= 1
 133                         force_space = True
 134                     pos = startpos
 135                 else:
 136                     pos += 1
 137                     force_space = False
 138
 139         # No variants detected? Fast return.
 140         if startpos == 0:
 141             return (norm_name, )
 142
 143         if startpos < baselen:
 144             return (part[1:] + baseform[startpos:-1] for part in partials)
 145
 146         return (part[1:-1] for part in partials)