X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2e3c5d4c5b39e29af57a9398f20fdf5cad0e9045..1e9f37ab82db1758235bedf83c659693f4ca6c3e:/nominatim/tokenizer/icu_name_processor.py?ds=sidebyside diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index a0f22974..93d2b0ff 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -2,13 +2,14 @@ Processor for names that are imported into the database based on the ICU library. """ -import json +from collections import defaultdict import itertools from icu import Transliterator import datrie from nominatim.db.properties import set_property, get_property +from nominatim.tokenizer import icu_variants as variants DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" @@ -31,20 +32,12 @@ class ICUNameProcessorRules: elif conn is not None: self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) + self.replacements = \ + variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) else: assert False, "Parameter loader or conn required." - # Compute the set of characters used in the replacement list. - # We need this later when computing the tree. - chars = set() - for full, repl in self.replacements: - chars.update(full) - for word in repl: - chars.update(word) - self.replacement_charset = ''.join(chars) - def save_rules(self, conn): """ Save the rules in the property table of the given database. 
@@ -53,23 +46,39 @@ class ICUNameProcessorRules: """ set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements)) + set_property(conn, DBCFG_IMPORT_REPLACEMENTS, + variants.pickle_variant_set(self.replacements)) set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) class ICUNameProcessor: + """ Collects the different transformation rules for normalisation of names + and provides the functions to apply the transformations. + """ def __init__(self, rules): self.normalizer = Transliterator.createFromRules("icu_normalization", rules.norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules) + rules.trans_rules + + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", rules.search_rules) - self.replacements = datrie.Trie(rules.replacement_charset) - for full, repl in rules.replacements: - self.replacements[full] = repl + # Intermediate reorder by source. Also compute required character set. + immediate = defaultdict(list) + chars = set() + for variant in rules.replacements: + if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': + replstr = variant.replacement[:-1] + else: + replstr = variant.replacement + immediate[variant.source].append(replstr) + chars.update(variant.source) + # Then copy to datrie + self.replacements = datrie.Trie(''.join(chars)) + for src, repllist in immediate.items(): + self.replacements[src] = repllist def get_normalized(self, name): @@ -82,26 +91,52 @@ class ICUNameProcessor: """ Compute the spelling variants for the given normalized name and transliterate the result. 
""" - baseform = ' ' + norm_name + ' ' - variants = [''] + baseform = '^ ' + norm_name + ' ^' + partials = [''] startpos = 0 pos = 0 + force_space = False while pos < len(baseform): full, repl = self.replacements.longest_prefix_item(baseform[pos:], (None, None)) if full is not None: done = baseform[startpos:pos] - variants = [v + done + r for v, r in itertools.product(variants, repl)] + partials = [v + done + r + for v, r in itertools.product(partials, repl) + if not force_space or r.startswith(' ')] + if len(partials) > 128: + # If too many variants are produced, they are unlikely + # to be helpful. Only use the original term. + startpos = 0 + break startpos = pos + len(full) + if full[-1] == ' ': + startpos -= 1 + force_space = True pos = startpos else: pos += 1 + force_space = False + # No variants detected? Fast return. if startpos == 0: - return [self.to_ascii.transliterate(norm_name)] + trans_name = self.to_ascii.transliterate(norm_name).strip() + return [trans_name] if trans_name else [] + + return self._compute_result_set(partials, baseform[startpos:]) + + + def _compute_result_set(self, partials, prefix): + results = set() + + for variant in partials: + vname = variant + prefix + trans_name = self.to_ascii.transliterate(vname[1:-1]).strip() + if trans_name: + results.add(trans_name) - return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants] + return list(results) def get_search_normalized(self, name):