From: Sarah Hoffmann Date: Thu, 1 Jul 2021 15:56:23 +0000 (+0200) Subject: fix subsequent replacements X-Git-Tag: v4.0.0~58^2~3 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/e85f7e7aa9b9c297b6b5f266d811c935af8cbb9e fix subsequent replacements Two replacement words directly following each other did not work as expected because each expects a space at the beginning/end while there was only one space available. Also forbid composing a word after a space was added at the end by a previous replacement. --- diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index 1888a716..6ead712e 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -60,7 +60,8 @@ class ICUNameProcessor: self.normalizer = Transliterator.createFromRules("icu_normalization", rules.norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules) + rules.trans_rules + + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", rules.search_rules) @@ -68,7 +69,11 @@ class ICUNameProcessor: immediate = defaultdict(list) chars = set() for variant in rules.replacements: - immediate[variant.source].append(variant) + if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': + replstr = variant.replacement[:-1] + else: + replstr = variant.replacement + immediate[variant.source].append(replstr) chars.update(variant.source) # Then copy to datrie self.replacements = datrie.Trie(''.join(chars)) @@ -91,32 +96,38 @@ class ICUNameProcessor: startpos = 0 pos = 0 + force_space = False while pos < len(baseform): full, repl = self.replacements.longest_prefix_item(baseform[pos:], (None, None)) if full is not None: done = baseform[startpos:pos] - partials = [v + done + r.replacement - for v, r in itertools.product(partials, repl)] + partials = [v + done + r + for v, r in itertools.product(partials, repl) + if not force_space or r.startswith(' 
')] startpos = pos + len(full) + if full[-1] == ' ': + startpos -= 1 + force_space = True pos = startpos else: pos += 1 + force_space = False - results = [] + results = set() if startpos == 0: trans_name = self.to_ascii.transliterate(norm_name).strip() if trans_name: - results.append(trans_name) + results.add(trans_name) else: for variant in partials: - name = variant[1:] + baseform[startpos:-1] - trans_name = self.to_ascii.transliterate(name).strip() + name = variant + baseform[startpos:] + trans_name = self.to_ascii.transliterate(name[1:-1]).strip() if trans_name: - results.append(trans_name) + results.add(trans_name) - return results + return list(results) def get_search_normalized(self, name): diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml index 192117ed..5fd30bd8 100644 --- a/settings/legacy_icu_tokenizer.yaml +++ b/settings/legacy_icu_tokenizer.yaml @@ -23,7 +23,6 @@ transliteration: - "[^[:Ascii:]] >" - ":: lower ()" - ":: NFC ()" - - "[:Space:]+ > ' '" variants: - !include icu-rules/variants-bg.yaml - !include icu-rules/variants-ca.yaml diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py index c1ad7675..553d25c5 100644 --- a/test/python/test_tokenizer_icu_name_processor.py +++ b/test/python/test_tokenizer_icu_name_processor.py @@ -39,23 +39,6 @@ def cfgfile(tmp_path, suffix='.yaml'): def get_normalized_variants(proc, name): return proc.get_variants_ascii(proc.get_normalized(name)) -def test_simple_variants(cfgfile): - fpath = cfgfile('~strasse,~straße -> str', - '~weg => weg', - 'prospekt -> pr') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) - - assert set(get_normalized_variants(proc, "Bauwegstraße")) \ - == {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'} - assert get_normalized_variants(proc, "Bauwegstr") == ['bauwegstr'] - assert set(get_normalized_variants(proc, "holzweg")) \ - == {'holz weg', 
'holzweg'} - assert set(get_normalized_variants(proc, "Meier Weg")) \ - == {'meier weg', 'meierweg'} - assert get_normalized_variants(proc, "hallo") == ['hallo'] - def test_variants_empty(cfgfile): fpath = cfgfile('saint -> 🜵', 'street -> st') @@ -68,15 +51,44 @@ def test_variants_empty(cfgfile): assert get_normalized_variants(proc, 'saint') == ['saint'] -def test_multiple_replacements(cfgfile): - fpath = cfgfile('saint -> s,st', 'street -> st') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) - - assert set(get_normalized_variants(proc, "Saint Johns Street")) == \ - {'saint johns street', 's johns street', 'st johns street', - 'saint johns st', 's johns st', 'st johns st'} +VARIANT_TESTS = [ +(('~strasse,~straße -> str', '~weg => weg'), "hallo", {'hallo'}), +(('weg => wg',), "holzweg", {'holzweg'}), +(('weg -> wg',), "holzweg", {'holzweg'}), +(('~weg => weg',), "holzweg", {'holz weg', 'holzweg'}), +(('~weg -> weg',), "holzweg", {'holz weg', 'holzweg'}), +(('~weg => w',), "holzweg", {'holz w', 'holzw'}), +(('~weg -> w',), "holzweg", {'holz weg', 'holzweg', 'holz w', 'holzw'}), +(('~weg => weg',), "Meier Weg", {'meier weg', 'meierweg'}), +(('~weg -> weg',), "Meier Weg", {'meier weg', 'meierweg'}), +(('~weg => w',), "Meier Weg", {'meier w', 'meierw'}), +(('~weg -> w',), "Meier Weg", {'meier weg', 'meierweg', 'meier w', 'meierw'}), +(('weg => wg',), "Meier Weg", {'meier wg'}), +(('weg -> wg',), "Meier Weg", {'meier weg', 'meier wg'}), +(('~strasse,~straße -> str', '~weg => weg'), "Bauwegstraße", + {'bauweg straße', 'bauweg str', 'bauwegstraße', 'bauwegstr'}), +(('am => a', 'bach => b'), "am bach", {'a b'}), +(('am => a', '~bach => b'), "am bach", {'a b'}), +(('am -> a', '~bach -> b'), "am bach", {'am bach', 'a bach', 'am b', 'a b'}), +(('am -> a', '~bach -> b'), "ambach", {'ambach', 'am bach', 'amb', 'am b'}), +(('saint -> s,st', 'street -> st'), "Saint Johns Street", + {'saint johns street', 's johns street', 'st johns 
street', + 'saint johns st', 's johns st', 'st johns st'}), +(('river$ -> r',), "River Bend Road", {'river bend road'}), +(('river$ -> r',), "Bent River", {'bent river', 'bent r'}), +(('^north => n',), "North 2nd Street", {'n 2nd street'}), +(('^north => n',), "Airport North", {'airport north'}), +] + +@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS) +def test_variants(cfgfile, rules, name, variants): + fpath = cfgfile(*rules) + proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath))) + + result = get_normalized_variants(proc, name) + + assert len(result) == len(set(result)) + assert set(get_normalized_variants(proc, name)) == variants def test_search_normalized(cfgfile):