X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2e3c5d4c5b39e29af57a9398f20fdf5cad0e9045..7b0f6b7905cc16d8e8dfb7619fe09de05e828f1d:/nominatim/tokenizer/icu_name_processor.py diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index a0f22974..1888a716 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -2,13 +2,14 @@ Processor for names that are imported into the database based on the ICU library. """ -import json +from collections import defaultdict import itertools from icu import Transliterator import datrie from nominatim.db.properties import set_property, get_property +from nominatim.tokenizer import icu_variants as variants DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" @@ -31,20 +32,12 @@ class ICUNameProcessorRules: elif conn is not None: self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) + self.replacements = \ + variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) else: assert False, "Parameter loader or conn required." - # Compute the set of characters used in the replacement list. - # We need this later when computing the tree. - chars = set() - for full, repl in self.replacements: - chars.update(full) - for word in repl: - chars.update(word) - self.replacement_charset = ''.join(chars) - def save_rules(self, conn): """ Save the rules in the property table of the given database. 
@@ -53,11 +46,15 @@ class ICUNameProcessorRules: """ set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements)) + set_property(conn, DBCFG_IMPORT_REPLACEMENTS, + variants.pickle_variant_set(self.replacements)) set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) class ICUNameProcessor: + """ Collects the different transformation rules for normalisation of names + and provides the functions to apply the transformations. + """ def __init__(self, rules): self.normalizer = Transliterator.createFromRules("icu_normalization", @@ -67,9 +64,16 @@ class ICUNameProcessor: self.search = Transliterator.createFromRules("icu_search", rules.search_rules) - self.replacements = datrie.Trie(rules.replacement_charset) - for full, repl in rules.replacements: - self.replacements[full] = repl + # Intermediate reorder by source. Also compute required character set. + immediate = defaultdict(list) + chars = set() + for variant in rules.replacements: + immediate[variant.source].append(variant) + chars.update(variant.source) + # Then copy to datrie + self.replacements = datrie.Trie(''.join(chars)) + for src, repllist in immediate.items(): + self.replacements[src] = repllist def get_normalized(self, name): @@ -82,8 +86,8 @@ class ICUNameProcessor: """ Compute the spelling variants for the given normalized name and transliterate the result. 
""" - baseform = ' ' + norm_name + ' ' - variants = [''] + baseform = '^ ' + norm_name + ' ^' + partials = [''] startpos = 0 pos = 0 @@ -92,16 +96,27 @@ class ICUNameProcessor: (None, None)) if full is not None: done = baseform[startpos:pos] - variants = [v + done + r for v, r in itertools.product(variants, repl)] + partials = [v + done + r.replacement + for v, r in itertools.product(partials, repl)] startpos = pos + len(full) pos = startpos else: pos += 1 + results = [] + if startpos == 0: - return [self.to_ascii.transliterate(norm_name)] + trans_name = self.to_ascii.transliterate(norm_name).strip() + if trans_name: + results.append(trans_name) + else: + for variant in partials: + name = variant[1:] + baseform[startpos:-1] + trans_name = self.to_ascii.transliterate(name).strip() + if trans_name: + results.append(trans_name) - return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants] + return results def get_search_normalized(self, name):