"""
Processor for names that are imported into the database based on the
ICU library.
"""
from collections import defaultdict
import itertools

from icu import Transliterator
import datrie

from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants
# Names of the entries in the database property table under which the
# different rule sets of the name processor are stored.
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
    """ Data object that saves the rules needed for the name processor.

        The rules can either be initialised through an ICURuleLoader or
        be loaded from a database when a connection is given.
    """
    def __init__(self, loader=None, conn=None):
        """ Fill the rule attributes from `loader` or, when no loader is
            given, from the property table reachable through `conn`.

            When both parameters are given, the loader takes precedence.
            Raises an AssertionError when neither is given.
        """
        if loader is not None:
            self.norm_rules = loader.get_normalization_rules()
            self.trans_rules = loader.get_transliteration_rules()
            self.replacements = loader.get_replacement_pairs()
            self.search_rules = loader.get_search_rules()
        elif conn is not None:
            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
            # The replacements are saved in serialised form by save_rules().
            self.replacements = \
                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
        else:
            # Raised explicitly instead of using `assert False`, which is
            # removed when Python runs with -O and would leave the object
            # without any of its attributes.
            raise AssertionError("Parameter loader or conn required.")


    def save_rules(self, conn):
        """ Save the rules in the property table of the given database.
            The rules can be loaded again by handing in a connection into
            the constructor of the class.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
                     variants.pickle_variant_set(self.replacements))
        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, rules):
        """ Create the ICU transliterators and the replacement lookup
            from the given rules object.

            `rules` has to provide the attributes `norm_rules`, `trans_rules`,
            `search_rules` and `replacements`, where each replacement has
            a `source` and a `replacement` string.
        """
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         rules.norm_rules)
        # NOTE(review): the appended rule squeezes runs of whitespace into
        # a single blank after transliteration - confirm the exact suffix
        # against the project history.
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       rules.trans_rules +
                                                       ";[:Space:]+ > ' '")
        self.search = Transliterator.createFromRules("icu_search",
                                                     rules.search_rules)

        # Intermediate reorder by source. Also compute required character set.
        immediate, chars = self._group_replacements(rules.replacements)
        # Then copy to the trie, which needs to know its alphabet up front.
        self.replacements = datrie.Trie(''.join(chars))
        for src, repllist in immediate.items():
            self.replacements[src] = repllist


    @staticmethod
    def _group_replacements(replacements):
        """ Group the replacement strings of the given variants by source.

            Returns a tuple of a dictionary, which maps each source string
            to the list of its replacement strings, and the set of all
            characters that appear in the source strings.
        """
        grouped = defaultdict(list)
        chars = set()
        for variant in replacements:
            # When source and replacement both end in a blank, the blank is
            # dropped from the replacement. get_variants_ascii() steps back
            # onto the blank of the base form in that case. endswith() is
            # used so that empty strings do not raise an IndexError.
            if variant.source.endswith(' ') and variant.replacement.endswith(' '):
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            grouped[variant.source].append(replstr)
            chars.update(variant.source)

        return grouped, chars


    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name).strip()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.

            Returns a list of transliterated variants without empty
            strings. The list is empty when nothing is left after
            transliteration.
        """
        # The name is framed with '^' markers and blanks, so that replacement
        # sources may refer to the beginning and end of the name.
        baseform = '^ ' + norm_name + ' ^'
        partials = ['']

        startpos = 0
        pos = 0
        force_space = False
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            # An empty match is handled like no match at all, so that the
            # scan position always moves forward.
            if full:
                done = baseform[startpos:pos]
                partials = [v + done + r
                            for v, r in itertools.product(partials, repl)
                            if not force_space or r.startswith(' ')]
                if len(partials) > 128:
                    # If too many variants are produced, they are unlikely
                    # to be helpful. Only use the original term.
                    startpos = 0
                    break
                startpos = pos + len(full)
                if full.endswith(' '):
                    # Step back onto the trailing blank, so that it may
                    # start the next match. Replacements that follow
                    # directly have to begin with a blank then.
                    startpos -= 1
                    force_space = True
                pos = startpos
            else:
                pos += 1
                force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            trans_name = self.to_ascii.transliterate(norm_name).strip()
            return [trans_name] if trans_name else []

        return self._compute_result_set(partials, baseform[startpos:])


    def _compute_result_set(self, partials, prefix):
        """ Append `prefix`, the unprocessed rest of the base form, to each
            of the partial variants and transliterate the result.

            Returns the list of distinct non-empty results in no
            particular order.
        """
        results = set()

        for variant in partials:
            vname = variant + prefix
            # Cut off the '^' markers at both ends before transliterating.
            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
            if trans_name:
                results.add(trans_name)

        return list(results)


    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(' ' + name + ' ').strip()