# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
"""
import itertools

import datrie

from nominatim.errors import UsageError
from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

### Configuration section

def configure(rules, normalization_rules):
    """ Extract and preprocess the configuration for this module.
    """
    config = {}

    config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                 normalization_rules)
    config['variant_only'] = rules.get('mode', '') == 'variant-only'

    # parse mutation rules
    config['mutations'] = []
    for rule in rules.get('mutations', []):
        if 'pattern' not in rule:
            raise UsageError("Missing field 'pattern' in mutation configuration.")
        if not isinstance(rule['pattern'], str):
            raise UsageError("Field 'pattern' in mutation configuration "
                             "must be a simple text field.")
        if 'replacements' not in rule:
            raise UsageError("Missing field 'replacements' in mutation configuration.")
        if not isinstance(rule['replacements'], list):
            raise UsageError("Field 'replacements' in mutation configuration "
                             "must be a list of texts.")

        config['mutations'].append((rule['pattern'], rule['replacements']))

    return config

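# Illustrative sketch only: the rule dictionary shape below is inferred from
# the checks in configure() above; the concrete values are made-up examples,
# not canonical Nominatim configuration.
#
#   rules = {'mode': 'variant-only',
#            'variants': ...,   # handed through to get_variant_config()
#            'mutations': [{'pattern': 'ä', 'replacements': ['ä', 'ae']}]}
#   config = configure(rules, normalization_rules)
#   assert config['variant_only']
#   assert config['mutations'] == [('ä', ['ä', 'ae'])]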

### Analysis section

def create(normalizer, transliterator, config):
    """ Create a new token analysis instance for this module.
    """
    return GenericTokenAnalysis(normalizer, transliterator, config)


class GenericTokenAnalysis:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, norm, to_ascii, config):
        self.norm = norm
        self.to_ascii = to_ascii
        self.variant_only = config['variant_only']

        # Set up datrie
        if config['replacements']:
            self.replacements = datrie.Trie(config['chars'])
            for src, repllist in config['replacements']:
                self.replacements[src] = repllist
        else:
            self.replacements = None

        # set up mutation rules
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]


    def normalize(self, name):
        """ Return the normalized form of the name. This is the standard form
            from which possible variants for the name can be derived.
        """
        return self.norm.transliterate(name).strip()


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        variants = self._generate_word_variants(norm_name)

        for mutation in self.mutations:
            variants = mutation.generate(variants)

        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]

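    # Usage sketch (hypothetical objects and values: 'norm' and 'to_ascii'
    # only need a transliterate() method, as ICU transliterators provide;
    # the name and the resulting variants are made-up examples):
    #
    #   analysis = create(norm, to_ascii, config)
    #   norm_name = analysis.normalize('Bahnhofstraße')
    #   variants = analysis.get_variants_ascii(norm_name)
    #   # e.g. ['bahnhofstrasse', 'bahnhofstr'], depending on the variant rules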

    def _transliterate_unique_list(self, norm_name, iterable):
        seen = set()
        if self.variant_only:
            # In variant-only mode the unmodified name must not be returned.
            seen.add(norm_name)

        for variant in map(str.strip, iterable):
            if variant not in seen:
                seen.add(variant)
                yield self.to_ascii.transliterate(variant).strip()


    def _generate_word_variants(self, norm_name):
        # Pad the name with boundary markers, so that replacement patterns
        # can anchor at the beginning and end of the name.
        baseform = '^ ' + norm_name + ' ^'
        baselen = len(baseform)
        partials = ['']

        startpos = 0
        if self.replacements is not None:
            pos = 0
            force_space = False
            while pos < baselen:
                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                                   (None, None))
                if full is not None:
                    # A replacement pattern matches at 'pos'. Copy over the
                    # unmatched text since the last match and fan out each
                    # partial variant over all replacement strings.
                    done = baseform[startpos:pos]
                    partials = [v + done + r
                                for v, r in itertools.product(partials, repl)
                                if not force_space or r.startswith(' ')]
                    if len(partials) > 128:
                        # If too many variants are produced, they are unlikely
                        # to be helpful. Only use the original term.
                        startpos = 0
                        break
                    startpos = pos + len(full)
                    if full[-1] == ' ':
                        # The match consumed a trailing space. Hand it back
                        # for the next match but then only accept replacements
                        # that start with a space.
                        startpos -= 1
                        force_space = True
                    pos = startpos
                else:
                    pos += 1
                    force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            return (norm_name, )

        if startpos < baselen:
            # Append the unmatched tail, dropping the boundary markers.
            return (part[1:] + baseform[startpos:-1] for part in partials)

        return (part[1:-1] for part in partials)
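    # Worked example (illustrative, assuming a hypothetical variant rule that
    # expands 'street' to 'street' and 'st', stored in the trie with its
    # surrounding spaces as ' street '): for norm_name = 'main street',
    # baseform becomes '^ main street ^', the trie matches ' street ' once,
    # and after the final strip() in _transliterate_unique_list the two
    # variants 'main street' and 'main st' are produced.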