"""
Processor for names that are imported into the database based on the
ICU library.
"""
from collections import defaultdict
import itertools

from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants

DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"


class ICUNameProcessorRules:
    """ Data object that saves the rules needed for the name processor.

        The rules can either be initialised through an ICURuleLoader or
        be loaded from a database when a connection is given.
    """
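    # Construction sketch (illustrative only; the ICURuleLoader arguments and
    # the database connection are assumed to be supplied by the caller):
    #
    #   rules = ICUNameProcessorRules(loader=ICURuleLoader(...))  # at import time
    #   rules = ICUNameProcessorRules(conn=db_connection)         # when reloading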
    def __init__(self, loader=None, conn=None):
        if loader is not None:
            self.norm_rules = loader.get_normalization_rules()
            self.trans_rules = loader.get_transliteration_rules()
            self.replacements = loader.get_replacement_pairs()
            self.search_rules = loader.get_search_rules()
        elif conn is not None:
            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
            self.replacements = \
                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
        else:
            assert False, "Parameter loader or conn required."


    def save_rules(self, conn):
        """ Save the rules in the property table of the given database.
            The rules can be loaded again by handing in a connection into
            the constructor of the class.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
                     variants.pickle_variant_set(self.replacements))
        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)


class ICUNameProcessor:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, rules):
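        # Three ICU transliterators are compiled from the rule strings:
        # one for import-time normalisation, one for transliteration to
        # ASCII, and one for search-time normalisation.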
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         rules.norm_rules)
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       rules.trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     rules.search_rules)

        # Intermediate reordering by source. Also compute the required character set.
        immediate = defaultdict(list)
        chars = set()
        for variant in rules.replacements:
            immediate[variant.source].append(variant)
            chars.update(variant.source)
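        # Then copy the grouped variants into a datrie, which provides the
        # longest-prefix lookups used during variant expansion below.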
        self.replacements = datrie.Trie(''.join(chars))
        for src, repllist in immediate.items():
            self.replacements[src] = repllist


    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name).strip()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
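        # The name is wrapped in '^ ... ^' markers so that replacement rules
        # can anchor on word boundaries. A longest-prefix scan over the trie
        # then expands every matching source string into all of its variants,
        # multiplying out the combinations in 'partials'.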
        baseform = '^ ' + norm_name + ' ^'
        partials = ['']

        startpos = 0
        pos = 0
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full:
                done = baseform[startpos:pos]
                partials = [v + done + r.replacement
                            for v, r in itertools.product(partials, repl)]
                startpos = pos + len(full)
                pos = startpos
            else:
                pos += 1

        results = []

        if startpos == 0:
            # No replacement matched, transliterate the name as is.
            trans_name = self.to_ascii.transliterate(norm_name).strip()
            if trans_name:
                results.append(trans_name)
        else:
            for variant in partials:
                name = variant[1:] + baseform[startpos:-1]
                trans_name = self.to_ascii.transliterate(name).strip()
                if trans_name:
                    results.append(trans_name)

        return results


    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(' ' + name + ' ').strip()
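

# Usage sketch (illustrative only; how the rules are obtained and the exact
# rule contents are assumptions, not part of this module):
#
#   rules = ICUNameProcessorRules(loader=ICURuleLoader(...))
#   proc = ICUNameProcessor(rules)
#   norm = proc.get_normalized(name)            # import-time normalisation
#   variants = proc.get_variants_ascii(norm)    # expanded, transliterated variants
#   term = proc.get_search_normalized(query)    # search-time normalisation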