nominatim/tokenizer/token_analysis/generic.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Generic processor for names that creates abbreviation variants.
   9 """
  10 from collections import defaultdict, namedtuple
  11 import itertools
  12 import re
  13
  14 from icu import Transliterator
  15 import datrie
  16
  17 from nominatim.config import flatten_config_list
  18 from nominatim.errors import UsageError
  19
  20 ### Configuration section
  21
  22 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
  23
  24 def configure(rules, normalization_rules):
  25     """ Extract and preprocess the configuration for this module.
  26     """
  27     config = {}
  28
  29     config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
  30                                                                   normalization_rules)
  31     config['variant_only'] = rules.get('mode', '') == 'variant-only'
  32
  33     return config
  34
  35
  36 def _get_variant_config(rules, normalization_rules):
  37     """ Convert the variant definition from the configuration into
  38         replacement sets.
  39     """
  40     immediate = defaultdict(list)
  41     chars = set()
  42
  43     if rules:
  44         vset = set()
  45         rules = flatten_config_list(rules, 'variants')
  46
  47         vmaker = _VariantMaker(normalization_rules)
  48
  49         for section in rules:
  50             for rule in (section.get('words') or []):
  51                 vset.update(vmaker.compute(rule))
  52
  53         # Intermediate reorder by source. Also compute required character set.
  54         for variant in vset:
  55             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
  56                 replstr = variant.replacement[:-1]
  57             else:
  58                 replstr = variant.replacement
  59             immediate[variant.source].append(replstr)
  60             chars.update(variant.source)
  61
  62     return list(immediate.items()), ''.join(chars)
  63
  64
  65 class _VariantMaker:
  66     """ Generater for all necessary ICUVariants from a single variant rule.
  67
  68         All text in rules is normalized to make sure the variants match later.
  69     """
  70
  71     def __init__(self, norm_rules):
  72         self.norm = Transliterator.createFromRules("rule_loader_normalization",
  73                                                    norm_rules)
  74
  75
  76     def compute(self, rule):
  77         """ Generator for all ICUVariant tuples from a single variant rule.
  78         """
  79         parts = re.split(r'(\|)?([=-])>', rule)
  80         if len(parts) != 4:
  81             raise UsageError("Syntax error in variant rule: " + rule)
  82
  83         decompose = parts[1] is None
  84         src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
  85         repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
  86
  87         # If the source should be kept, add a 1:1 replacement
  88         if parts[2] == '-':
  89             for src in src_terms:
  90                 if src:
  91                     for froms, tos in _create_variants(*src, src[0], decompose):
  92                         yield ICUVariant(froms, tos)
  93
  94         for src, repl in itertools.product(src_terms, repl_terms):
  95             if src and repl:
  96                 for froms, tos in _create_variants(*src, repl, decompose):
  97                     yield ICUVariant(froms, tos)
  98
  99
 100     def _parse_variant_word(self, name):
 101         name = name.strip()
 102         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
 103         if match is None or (match.group(1) == '~' and match.group(3) == '~'):
 104             raise UsageError("Invalid variant word descriptor '{}'".format(name))
 105         norm_name = self.norm.transliterate(match.group(2)).strip()
 106         if not norm_name:
 107             return None
 108
 109         return norm_name, match.group(1), match.group(3)
 110
 111
 112 _FLAG_MATCH = {'^': '^ ',
 113                '$': ' ^',
 114                '': ' '}
 115
 116
 117 def _create_variants(src, preflag, postflag, repl, decompose):
 118     if preflag == '~':
 119         postfix = _FLAG_MATCH[postflag]
 120         # suffix decomposition
 121         src = src + postfix
 122         repl = repl + postfix
 123
 124         yield src, repl
 125         yield ' ' + src, ' ' + repl
 126
 127         if decompose:
 128             yield src, ' ' + repl
 129             yield ' ' + src, repl
 130     elif postflag == '~':
 131         # prefix decomposition
 132         prefix = _FLAG_MATCH[preflag]
 133         src = prefix + src
 134         repl = prefix + repl
 135
 136         yield src, repl
 137         yield src + ' ', repl + ' '
 138
 139         if decompose:
 140             yield src, repl + ' '
 141             yield src + ' ', repl
 142     else:
 143         prefix = _FLAG_MATCH[preflag]
 144         postfix = _FLAG_MATCH[postflag]
 145
 146         yield prefix + src + postfix, prefix + repl + postfix
 147
 148
 149 ### Analysis section
 150
 151 def create(transliterator, config):
 152     """ Create a new token analysis instance for this module.
 153     """
 154     return GenericTokenAnalysis(transliterator, config)
 155
 156
 157 class GenericTokenAnalysis:
 158     """ Collects the different transformation rules for normalisation of names
 159         and provides the functions to apply the transformations.
 160     """
 161
 162     def __init__(self, to_ascii, config):
 163         self.to_ascii = to_ascii
 164         self.variant_only = config['variant_only']
 165
 166         # Set up datrie
 167         if config['replacements']:
 168             self.replacements = datrie.Trie(config['chars'])
 169             for src, repllist in config['replacements']:
 170                 self.replacements[src] = repllist
 171         else:
 172             self.replacements = None
 173
 174
 175     def get_variants_ascii(self, norm_name):
 176         """ Compute the spelling variants for the given normalized name
 177             and transliterate the result.
 178         """
 179         results = set()
 180         for variant in self._generate_word_variants(norm_name):
 181             if not self.variant_only or variant.strip() != norm_name:
 182                 trans_name = self.to_ascii.transliterate(variant).strip()
 183                 if trans_name:
 184                     results.add(trans_name)
 185
 186         return list(results)
 187
 188
 189     def _generate_word_variants(self, norm_name):
 190         baseform = '^ ' + norm_name + ' ^'
 191         baselen = len(baseform)
 192         partials = ['']
 193
 194         startpos = 0
 195         if self.replacements is not None:
 196             pos = 0
 197             force_space = False
 198             while pos < baselen:
 199                 full, repl = self.replacements.longest_prefix_item(baseform[pos:],
 200                                                                    (None, None))
 201                 if full is not None:
 202                     done = baseform[startpos:pos]
 203                     partials = [v + done + r
 204                                 for v, r in itertools.product(partials, repl)
 205                                 if not force_space or r.startswith(' ')]
 206                     if len(partials) > 128:
 207                         # If too many variants are produced, they are unlikely
 208                         # to be helpful. Only use the original term.
 209                         startpos = 0
 210                         break
 211                     startpos = pos + len(full)
 212                     if full[-1] == ' ':
 213                         startpos -= 1
 214                         force_space = True
 215                     pos = startpos
 216                 else:
 217                     pos += 1
 218                     force_space = False
 219
 220         # No variants detected? Fast return.
 221         if startpos == 0:
 222             return (norm_name, )
 223
 224         if startpos < baselen:
 225             return (part[1:] + baseform[startpos:-1] for part in partials)
 226
 227         return (part[1:-1] for part in partials)