]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tokenizer/token_analysis/generic.py
clean_housenumbers: make kinds and delimiters configurable
[nominatim.git] / nominatim / tokenizer / token_analysis / generic.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Generic processor for names that creates abbreviation variants.
9 """
10 import itertools
11
12 import datrie
13
14 from nominatim.errors import UsageError
15 from nominatim.tokenizer.token_analysis.config_variants import get_variant_config
16 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
17
18 ### Configuration section
19
def configure(rules, normalization_rules):
    """ Build the configuration dictionary for this analysis module.

        The variant rules found under 'variants' are handed to
        get_variant_config() together with the normalization rules. The
        'mode' entry decides if only variants are to be produced and each
        entry under 'mutations' is checked and stored as a tuple of
        (pattern, replacements).

        Raises a UsageError when a mutation entry lacks the field 'pattern'
        or 'replacements' or when one of the fields has the wrong type.
    """
    replacements, chars = get_variant_config(rules.get('variants'),
                                             normalization_rules)
    variant_only = rules.get('mode', '') == 'variant-only'

    # Validate the mutation rules one by one, pattern first.
    mutations = []
    for entry in rules.get('mutations', []):
        if 'pattern' not in entry:
            raise UsageError("Missing field 'pattern' in mutation configuration.")
        pattern = entry['pattern']
        if not isinstance(pattern, str):
            raise UsageError("Field 'pattern' in mutation configuration "
                             "must be a simple text field.")

        if 'replacements' not in entry:
            raise UsageError("Missing field 'replacements' in mutation configuration.")
        substitutes = entry['replacements']
        if not isinstance(substitutes, list):
            raise UsageError("Field 'replacements' in mutation configuration "
                             "must be a list of texts.")

        mutations.append((pattern, substitutes))

    return {
        'replacements': replacements,
        'chars': chars,
        'variant_only': variant_only,
        'mutations': mutations,
    }
46
47
48 ### Analysis section
49
def create(transliterator, config):
    """ Return a GenericTokenAnalysis object that uses the given
        transliterator and the given preprocessed configuration.
    """
    analysis = GenericTokenAnalysis(transliterator, config)
    return analysis
54
55
class GenericTokenAnalysis:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, to_ascii, config):
        """ Set up the analysis.

            `to_ascii` must provide a function `transliterate()` which
            takes a string and returns a string. `config` is a dictionary
            with the keys 'variant_only', 'replacements', 'chars' and
            'mutations'.
        """
        self.to_ascii = to_ascii
        self.variant_only = config['variant_only']

        # Set up datrie. The lookup structure is only created when there
        # is at least one replacement rule. None disables replacements.
        self.replacements = None
        if config['replacements']:
            trie = datrie.Trie(config['chars'])
            for src, repllist in config['replacements']:
                trie[src] = repllist
            self.replacements = trie

        # set up mutation rules, each entry holds the constructor arguments
        self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.

            Returns a list of transliterated variants. Variants that are
            empty after transliteration are left out.
        """
        variants = self._generate_word_variants(norm_name)

        # Each mutation works on the output of the previous one.
        for mutation in self.mutations:
            variants = mutation.generate(variants)

        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]


    def _transliterate_unique_list(self, norm_name, iterable):
        """ Yield the stripped transliteration of each distinct variant
            in `iterable`.

            Duplicates are detected on the stripped variant before it is
            transliterated. In variant-only mode the normalized name itself
            is skipped as well.
        """
        seen = {norm_name} if self.variant_only else set()

        for variant in map(str.strip, iterable):
            if variant in seen:
                continue
            seen.add(variant)
            yield self.to_ascii.transliterate(variant).strip()


    def _generate_word_variants(self, norm_name):
        """ Apply the replacement rules to the normalized name and return
            an iterable with all resulting spellings.

            The name is enclosed in '^ ' and ' ^' for matching. At each
            position the longest key of the replacement trie that is a prefix
            of the remaining text is looked up and all its replacements are
            combined with the variants collected so far. If no rule matches
            or more than 128 variants are produced, only the unchanged name
            is returned.
        """
        baseform = '^ ' + norm_name + ' ^'

        # No replacement rules? Fast return.
        if self.replacements is None:
            return (norm_name, )

        baselen = len(baseform)
        partials = ['']
        startpos = 0    # end of the part of baseform already in partials
        pos = 0
        force_space = False
        while pos < baselen:
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is None:
                pos += 1
                force_space = False
                continue

            done = baseform[startpos:pos]
            partials = [v + done + r
                        for v, r in itertools.product(partials, repl)
                        if not force_space or r.startswith(' ')]
            if len(partials) > 128:
                # If too many variants are produced, they are unlikely
                # to be helpful. Only use the original term.
                return (norm_name, )

            startpos = pos + len(full)
            # A key ending in a blank gives the blank back, so that it can
            # start the next match. A replacement that directly follows
            # must then begin with a blank. The flag is recomputed on every
            # match: it must not carry over from an earlier match to a
            # replacement that follows a key without trailing blank.
            force_space = full[-1] == ' '
            if force_space:
                startpos -= 1
            pos = startpos

        # No variants detected? Fast return.
        if startpos == 0:
            return (norm_name, )

        # Drop the leading '^' and, depending on how far the replacements
        # reached, the trailing '^' of the baseform or of the replacement.
        if startpos < baselen:
            return (part[1:] + baseform[startpos:-1] for part in partials)

        return (part[1:-1] for part in partials)