nominatim/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 import itertools
  11
  12 import datrie
  13
  14 from nominatim.errors import UsageError
  15 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
  16 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
  17
  18 ### Configuration section
  19
  20 def configure(rules, normalization_rules):
  21     """ Extract and preprocess the configuration for this module.
  22     """
  23     config = {}
  24
  25     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
  26                                                                  normalization_rules)
  27     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  28
  29     # parse mutation rules
  30     config['mutations'] = []
  31     for rule in rules.get('mutations', []):
  32         if 'pattern' not in rule:
  33             raise UsageError("Missing field 'pattern' in mutation configuration.")
  34         if not isinstance(rule['pattern'], str):
  35             raise UsageError("Field 'pattern' in mutation configuration "
  36                              "must be a simple text field.")
  37         if 'replacements' not in rule:
  38             raise UsageError("Missing field 'replacements' in mutation configuration.")
  39         if not isinstance(rule['replacements'], list):
  40             raise UsageError("Field 'replacements' in mutation configuration "
  41                              "must be a list of texts.")
  42
  43         config['mutations'].append((rule['pattern'], rule['replacements']))
  44
  45     return config
  46
  47
  48 ### Analysis section
  49
  50 def create(transliterator, config):
  51     """ Create a new token analysis instance for this module.
  52     """
  53     return GenericTokenAnalysis(transliterator, config)
  54
  55
  56 class GenericTokenAnalysis:
  57     """ Collects the different transformation rules for normalisation of names
  58         and provides the functions to apply the transformations.
  59     """
  60
  61     def __init__(self, to_ascii, config):
  62         self.to_ascii = to_ascii
  63         self.variant_only = config['variant_only']
  64
  65         # Set up datrie
  66         if config['replacements']:
  67             self.replacements = datrie.Trie(config['chars'])
  68             for src, repllist in config['replacements']:
  69                 self.replacements[src] = repllist
  70         else:
  71             self.replacements = None
  72
  73         # set up mutation rules
  74         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
  75
  76
  77     def get_variants_ascii(self, norm_name):
  78         """ Compute the spelling variants for the given normalized name
  79             and transliterate the result.
  80         """
  81         variants = self._generate_word_variants(norm_name)
  82
  83         for mutation in self.mutations:
  84             variants = mutation.generate(variants)
  85
  86         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
  87
  88
  89     def _transliterate_unique_list(self, norm_name, iterable):
  90         seen = set()
  91         if self.variant_only:
  92             seen.add(norm_name)
  93
  94         for variant in map(str.strip, iterable):
  95             if variant not in seen:
  96                 seen.add(variant)
  97                 yield self.to_ascii.transliterate(variant).strip()
  98
  99
 100     def _generate_word_variants(self, norm_name):
 101         baseform = '^ ' + norm_name + ' ^'
 102         baselen = len(baseform)
 103         partials = ['']
 104
 105         startpos = 0
 106         if self.replacements is not None:
 107             pos = 0
 108             force_space = False
 109             while pos < baselen:
 110                 full, repl = self.replacements.longest_prefix_item(baseform[pos:],
 111                                                                    (None, None))
 112                 if full is not None:
 113                     done = baseform[startpos:pos]
 114                     partials = [v + done + r
 115                                 for v, r in itertools.product(partials, repl)
 116                                 if not force_space or r.startswith(' ')]
 117                     if len(partials) > 128:
 118                         # If too many variants are produced, they are unlikely
 119                         # to be helpful. Only use the original term.
 120                         startpos = 0
 121                         break
 122                     startpos = pos + len(full)
 123                     if full[-1] == ' ':
 124                         startpos -= 1
 125                         force_space = True
 126                     pos = startpos
 127                 else:
 128                     pos += 1
 129                     force_space = False
 130
 131         # No variants detected? Fast return.
 132         if startpos == 0:
 133             return (norm_name, )
 134
 135         if startpos < baselen:
 136             return (part[1:] + baseform[startpos:-1] for part in partials)
 137
 138         return (part[1:-1] for part in partials)