1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
"""
14 from nominatim.errors import UsageError
15 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
16 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
18 ### Configuration section
def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.

        Returns a dict with the precomputed replacement variants, the
        character set for the trie, the variant-only flag and the parsed
        mutation rules. Raises UsageError when a mutation rule is
        malformed.
    """
    config = {}

    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                 normalization_rules)
    config['variant_only'] = rules.get('mode', '') == 'variant-only'

    # parse mutation rules
    config['mutations'] = []
    for rule in rules.get('mutations', []):
        if 'pattern' not in rule:
            raise UsageError("Missing field 'pattern' in mutation configuration.")
        if not isinstance(rule['pattern'], str):
            raise UsageError("Field 'pattern' in mutation configuration "
                             "must be a simple text field.")
        if 'replacements' not in rule:
            raise UsageError("Missing field 'replacements' in mutation configuration.")
        if not isinstance(rule['replacements'], list):
            raise UsageError("Field 'replacements' in mutation configuration "
                             "must be a list of texts.")

        config['mutations'].append((rule['pattern'], rule['replacements']))

    return config
def create(transliterator, config):
    """ Instantiate a token analysis object from the preprocessed
        configuration produced by configure().
    """
    analysis = GenericTokenAnalysis(transliterator, config)
    return analysis
class GenericTokenAnalysis:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, to_ascii, config):
        self.to_ascii = to_ascii
        self.variant_only = config['variant_only']

        # Set up the search trie for spelling variants. When no replacement
        # rules are configured, the trie is left as None and variant
        # generation falls back to returning the name unchanged.
        if config['replacements']:
            self.replacements = datrie.Trie(config['chars'])
            for src, repllist in config['replacements']:
                self.replacements[src] = repllist
        else:
            self.replacements = None

        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        variants = self._generate_word_variants(norm_name)

        for mutation in self.mutations:
            variants = mutation.generate(variants)

        # Drop variants that transliterate to the empty string.
        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]


    def _transliterate_unique_list(self, norm_name, iterable):
        # Transliterate the given variants, yielding each distinct
        # variant only once.
        seen = set()
        if self.variant_only:
            # In variant-only mode the unmodified name must not be returned.
            seen.add(norm_name)

        for variant in map(str.strip, iterable):
            if variant not in seen:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()


    def _generate_word_variants(self, norm_name):
        # Apply the replacement rules from the trie to produce spelling
        # variants. '^' markers delimit the term so rules can anchor on
        # word boundaries.
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
        partials = ['']

        startpos = 0
        if self.replacements is not None:
            pos = 0
            force_space = False
            while pos < baselen:
                # Find the longest replacement rule matching at this position;
                # (None, None) is returned when nothing matches.
                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                   (None, None))
                if full is not None:
                    done = baseform[startpos:pos]
                    partials = [v + done + r
                                for v, r in itertools.product(partials, repl)
                                if not force_space or r.startswith(' ')]
                    if len(partials) > 128:
                        # If too many variants are produced, they are unlikely
                        # to be helpful. Only use the original term.
                        startpos = 0
                        break
                    startpos = pos + len(full)
                    # When the matched text ends in a space, keep the space in
                    # the unprocessed tail and require the next replacement to
                    # start with a space, so word boundaries stay intact.
                    if full[-1] == ' ':
                        startpos -= 1
                        force_space = True
                    else:
                        force_space = False
                    pos = startpos
                else:
                    pos += 1
                    force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            return (norm_name, )

        if startpos < baselen:
            return (part[1:] + baseform[startpos:-1] for part in partials)

        return (part[1:-1] for part in partials)