From: Sarah Hoffmann Date: Wed, 12 Jan 2022 15:25:47 +0000 (+0100) Subject: introduce mutation variants to generic token analyser X-Git-Tag: v4.1.0~95^2~3 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/b453b0ea95e7b1244912b7bc9fc26f58acb8ec80 introduce mutation variants to generic token analyser Mutations are regular-expression-based replacements that are applied after variants have been computed. They are meant to be used for variations on character level. Add spelling variations for German umlauts. --- diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py index b9bd9bdf..1e7b75a1 100644 --- a/nominatim/tokenizer/token_analysis/generic.py +++ b/nominatim/tokenizer/token_analysis/generic.py @@ -11,7 +11,9 @@ import itertools import datrie +from nominatim.errors import UsageError from nominatim.tokenizer.token_analysis.config_variants import get_variant_config +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator ### Configuration section @@ -23,6 +25,7 @@ def configure(rules, normalization_rules): config['replacements'], config['chars'] = get_variant_config(rules.get('variants'), normalization_rules) config['variant_only'] = rules.get('mode', '') == 'variant-only' + config['mutations'] = rules.get('mutations', []) return config @@ -52,19 +55,45 @@ class GenericTokenAnalysis: else: self.replacements = None + # set up mutation rules + self.mutations = [] + for cfg in config['mutations']: + if 'pattern' not in cfg: + raise UsageError("Missing field 'pattern' in mutation configuration.") + if not isinstance(cfg['pattern'], str): + raise UsageError("Field 'pattern' in mutation configuration " + "must be a simple text field.") + if 'replacements' not in cfg: + raise UsageError("Missing field 'replacements' in mutation configuration.") + if not isinstance(cfg['replacements'], list): + raise UsageError("Field 'replacements' in mutation configuration " + "must 
be a list of texts.") + + self.mutations.append(MutationVariantGenerator(cfg['pattern'], + cfg['replacements'])) + def get_variants_ascii(self, norm_name): """ Compute the spelling variants for the given normalized name and transliterate the result. """ - results = set() - for variant in self._generate_word_variants(norm_name): - if not self.variant_only or variant.strip() != norm_name: - trans_name = self.to_ascii.transliterate(variant).strip() - if trans_name: - results.add(trans_name) - - return list(results) + variants = self._generate_word_variants(norm_name) + + for mutation in self.mutations: + variants = mutation.generate(variants) + + return [name for name in self._transliterate_unique_list(norm_name, variants) if name] + + + def _transliterate_unique_list(self, norm_name, iterable): + seen = set() + if self.variant_only: + seen.add(norm_name) + + for variant in map(str.strip, iterable): + if variant not in seen: + seen.add(variant) + yield self.to_ascii.transliterate(variant).strip() def _generate_word_variants(self, norm_name): diff --git a/nominatim/tokenizer/token_analysis/generic_mutation.py b/nominatim/tokenizer/token_analysis/generic_mutation.py new file mode 100644 index 00000000..d23d5cd4 --- /dev/null +++ b/nominatim/tokenizer/token_analysis/generic_mutation.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Creator for mutation variants for the generic token analysis. +""" +import itertools +import logging +import re + +from nominatim.errors import UsageError + +LOG = logging.getLogger() + +def _zigzag(outer, inner): + return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue='')) + + +class MutationVariantGenerator: + """ Generates name variants by applying a regular expression to the name + and replacing it with one or more variants. 
When the regular expression + matches more than once, each occurrence is replaced with all replacement + patterns. + """ + + def __init__(self, pattern, replacements): + self.pattern = re.compile(pattern) + self.replacements = replacements + + if self.pattern.groups > 0: + LOG.fatal("The mutation pattern %s contains a capturing group. " + "This is not allowed.", pattern) + raise UsageError("Bad mutation pattern in configuration.") + + + def generate(self, names): + """ Generator function for the name variants. 'names' is an iterable + over a set of names for which the variants are to be generated. + """ + for name in names: + parts = self.pattern.split(name) + if len(parts) == 1: + yield name + else: + for seps in self._fillers(len(parts)): + yield ''.join(_zigzag(parts, seps)) + + + def _fillers(self, num_parts): + """ Returns a generator for strings to join the given number of string + parts in all possible combinations. + """ + return itertools.product(self.replacements, repeat=num_parts - 1) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index a3c62e67..c6601faf 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -59,6 +59,13 @@ token-analysis: mode: variant-only variants: - !include icu-rules/variants-de.yaml + mutations: + - pattern: ä + replacements: ["ä", "ae"] + - pattern: ö + replacements: ["ö", "oe"] + - pattern: ü + replacements: ["ü", "ue"] - id: el analyzer: generic mode: variant-only diff --git a/test/bdd/db/import/naming.feature b/test/bdd/db/import/naming.feature index bb29d2a3..b739cbae 100644 --- a/test/bdd/db/import/naming.feature +++ b/test/bdd/db/import/naming.feature @@ -58,3 +58,48 @@ Feature: Import and search of names | រាជធានីភ្នំពេញ | | 東京都 | | ပုဗ္ဗသီရိ | + + + Scenario: German umlauts can be found when expanded + Given the places + | osm | class | type | name+name:de | + | N1 | place | city | Münster | + | N2 | place | city | Köln | + | N3 | place | city | Gräfenroda | + When importing + 
When sending search query "münster" + Then results contain + | osm | + | N1 | + When sending search query "muenster" + Then results contain + | osm | + | N1 | + When sending search query "munster" + Then results contain + | osm | + | N1 | + When sending search query "Köln" + Then results contain + | osm | + | N2 | + When sending search query "Koeln" + Then results contain + | osm | + | N2 | + When sending search query "Koln" + Then results contain + | osm | + | N2 | + When sending search query "gräfenroda" + Then results contain + | osm | + | N3 | + When sending search query "graefenroda" + Then results contain + | osm | + | N3 | + When sending search query "grafenroda" + Then results contain + | osm | + | N3 | diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py new file mode 100644 index 00000000..757f0311 --- /dev/null +++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for generic token analysis, mutation part. 
+""" +import pytest + +from icu import Transliterator + +import nominatim.tokenizer.token_analysis.generic as module +from nominatim.errors import UsageError + +DEFAULT_NORMALIZATION = """ '🜳' > ' '; + [[:Nonspacing Mark:] [:Cf:]] >; + :: lower (); + [[:Punctuation:][:Space:]]+ > ' ' + """ + +DEFAULT_TRANSLITERATION = """ :: Latin (); + '🜵' > ' '; + """ + +class TestMutationNoVariants: + + def make_analyser(self, *mutations): + rules = { 'analyzer': 'generic', + 'mutations': [ {'pattern': m[0], 'replacements': m[1]} + for m in mutations] + } + config = module.configure(rules, DEFAULT_NORMALIZATION) + trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) + + self.analysis = module.create(trans, config) + + + def variants(self, name): + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + return set(self.analysis.get_variants_ascii(norm.transliterate(name).strip())) + + + @pytest.mark.parametrize('pattern', ('(capture)', ['a list'])) + def test_bad_pattern(self, pattern): + with pytest.raises(UsageError): + self.make_analyser((pattern, ['b'])) + + + @pytest.mark.parametrize('replacements', (None, 'a string')) + def test_bad_replacement(self, replacements): + with pytest.raises(UsageError): + self.make_analyser(('a', replacements)) + + + def test_simple_replacement(self): + self.make_analyser(('a', ['b'])) + + assert self.variants('none') == {'none'} + assert self.variants('abba') == {'bbbb'} + assert self.variants('2 aar') == {'2 bbr'} + + + def test_multichar_replacement(self): + self.make_analyser(('1 1', ['1 1 1'])) + + assert self.variants('1 1456') == {'1 1 1456'} + assert self.variants('1 1 1') == {'1 1 1 1'} + + + def test_removement_replacement(self): + self.make_analyser((' ', [' ', ''])) + + assert self.variants('A 345') == {'a 345', 'a345'} + assert self.variants('a g b') == {'a g b', 'ag b', 'a gb', 'agb'} + + + def test_regex_pattern(self): + self.make_analyser(('[^a-z]+', ['XXX', ' '])) + + assert 
self.variants('a-34n12') == {'aXXXnXXX', 'aXXXn', 'a nXXX', 'a n'} + + + def test_multiple_mutations(self): + self.make_analyser(('ä', ['ä', 'ae']), ('ö', ['ö', 'oe'])) + + assert self.variants('Längenöhr') == {'längenöhr', 'laengenöhr', + 'längenoehr', 'laengenoehr'}