From 52847b61a3e1bc0791dd23809dc3c50fe6810df2 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 4 Oct 2021 16:40:28 +0200
Subject: [PATCH] extend ICU config to accommodate multiple analysers

Adds parsing of multiple variant lists from the configuration. Every
entry except one must have a unique 'id' parameter to distinguish the
entries. The entry without an id is considered the default.

Currently only the list without an id is used for analysis.
---
 nominatim/tokenizer/icu_rule_loader.py        | 62 ++++++++++++------
 settings/icu_tokenizer.yaml                   | 63 ++++++++++---------
 test/python/test_tokenizer_icu.py             |  8 +--
 .../test_tokenizer_icu_name_processor.py      |  6 +-
 test/python/test_tokenizer_icu_rule_loader.py | 17 ++---
 5 files changed, 92 insertions(+), 64 deletions(-)

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 7719f211..cf725209 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -43,12 +43,10 @@ class ICURuleLoader:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
-        self.variants = set()
-
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'variants')
-        self._parse_variant_list()
+        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self._setup_analysis()
 
         # Load optional sanitizer rule set.
         self.sanitizer_rules = rules.get('sanitizers', [])
@@ -61,7 +59,7 @@ class ICURuleLoader:
         self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
         self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
         self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
-        self._parse_variant_list()
+        self._setup_analysis()
 
 
     def save_config_to_db(self, conn):
@@ -82,9 +80,8 @@ class ICURuleLoader:
     def make_token_analysis(self):
         """ Create a token analyser from the previously loaded rules.
         """
-        return ICUNameProcessor(self.normalization_rules,
-                                self.transliteration_rules,
-                                self.variants)
+        return self.analysis[None].create(self.normalization_rules,
+                                          self.transliteration_rules)
 
 
     def get_search_rules(self):
@@ -99,23 +96,37 @@ class ICURuleLoader:
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
+
     def get_normalization_rules(self):
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
+
     def get_transliteration_rules(self):
         """ Return the rules for converting a string into its ascii
             representation.
         """
         return self.transliteration_rules
 
-    def get_replacement_pairs(self):
-        """ Return the list of possible compound decompositions with
-            application of abbreviations included.
-            The result is a list of pairs: the first item is the sequence to
-            replace, the second is a list of replacements.
+
+    def _setup_analysis(self):
+        """ Process the rules used for creating the various token analyzers.
""" - return self.variants + self.analysis = {} + + if not isinstance(self.analysis_rules, list): + raise UsageError("Configuration section 'token-analysis' must be a list.") + + for section in self.analysis_rules: + name = section.get('id', None) + if name in self.analysis: + if name is None: + LOG.fatal("ICU tokenizer configuration has two default token analyzers.") + else: + LOG.fatal("ICU tokenizer configuration has two token " + "analyzers with id '%s'.", name) + UsageError("Syntax error in ICU tokenizer config.") + self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules) @staticmethod @@ -145,17 +156,32 @@ class ICURuleLoader: return ';'.join(flatten_config_list(content, section)) + ';' - def _parse_variant_list(self): - rules = self.analysis_rules +class TokenAnalyzerRule: + """ Factory for a single analysis module. The class saves the configuration + and creates a new token analyzer on request. + """ + + def __init__(self, rules, normalization_rules): + self._parse_variant_list(rules.get('variants'), normalization_rules) + + + def create(self, normalization_rules, transliteration_rules): + """ Create an analyzer from the given rules. + """ + return ICUNameProcessor(normalization_rules, + transliteration_rules, + self.variants) - self.variants.clear() + + def _parse_variant_list(self, rules, normalization_rules): + self.variants = set() if not rules: return rules = flatten_config_list(rules, 'variants') - vmaker = _VariantMaker(self.normalization_rules) + vmaker = _VariantMaker(normalization_rules) properties = [] for section in rules: diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 08b7a7ff..f85c33ff 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -27,34 +27,35 @@ transliteration: sanitizers: - step: split-name-list - step: strip-brace-terms -variants: - - !include icu-rules/variants-bg.yaml - - !include icu-rules/variants-ca.yaml - - !include icu-rules/variants-cs.yaml - - !include icu-rules/variants-da.yaml - - !include icu-rules/variants-de.yaml - - !include icu-rules/variants-el.yaml - - !include icu-rules/variants-en.yaml - - !include icu-rules/variants-es.yaml - - !include icu-rules/variants-et.yaml - - !include icu-rules/variants-eu.yaml - - !include icu-rules/variants-fi.yaml - - !include icu-rules/variants-fr.yaml - - !include icu-rules/variants-gl.yaml - - !include icu-rules/variants-hu.yaml - - !include icu-rules/variants-it.yaml - - !include icu-rules/variants-ja.yaml - - !include icu-rules/variants-mg.yaml - - !include icu-rules/variants-ms.yaml - - !include icu-rules/variants-nl.yaml - - !include icu-rules/variants-no.yaml - - !include icu-rules/variants-pl.yaml - - !include icu-rules/variants-pt.yaml - - !include icu-rules/variants-ro.yaml - - !include icu-rules/variants-ru.yaml - - !include icu-rules/variants-sk.yaml - - !include icu-rules/variants-sl.yaml - - !include icu-rules/variants-sv.yaml - - !include icu-rules/variants-tr.yaml - - !include icu-rules/variants-uk.yaml - - !include icu-rules/variants-vi.yaml +token-analysis: + - variants: + - !include icu-rules/variants-bg.yaml + - !include icu-rules/variants-ca.yaml + - !include icu-rules/variants-cs.yaml + - !include icu-rules/variants-da.yaml + - !include icu-rules/variants-de.yaml + - !include icu-rules/variants-el.yaml + - !include icu-rules/variants-en.yaml + - !include icu-rules/variants-es.yaml + - !include icu-rules/variants-et.yaml + - !include icu-rules/variants-eu.yaml + - !include icu-rules/variants-fi.yaml + - !include 
+        - !include icu-rules/variants-gl.yaml
+        - !include icu-rules/variants-hu.yaml
+        - !include icu-rules/variants-it.yaml
+        - !include icu-rules/variants-ja.yaml
+        - !include icu-rules/variants-mg.yaml
+        - !include icu-rules/variants-ms.yaml
+        - !include icu-rules/variants-nl.yaml
+        - !include icu-rules/variants-no.yaml
+        - !include icu-rules/variants-pl.yaml
+        - !include icu-rules/variants-pt.yaml
+        - !include icu-rules/variants-ro.yaml
+        - !include icu-rules/variants-ru.yaml
+        - !include icu-rules/variants-sk.yaml
+        - !include icu-rules/variants-sl.yaml
+        - !include icu-rules/variants-sv.yaml
+        - !include icu-rules/variants-tr.yaml
+        - !include icu-rules/variants-uk.yaml
+        - !include icu-rules/variants-vi.yaml
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 9a6f5a94..16caf3ed 100644
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -69,10 +69,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                      variants=('~gasse -> gasse', 'street => st', ),
                      sanitizers=[]):
-        cfgstr = {'normalization' : list(norm),
-                   'sanitizers' : sanitizers,
-                   'transliteration' : list(trans),
-                   'variants' : [ {'words': list(variants)}]}
+        cfgstr = {'normalization': list(norm),
+                  'sanitizers': sanitizers,
+                  'transliteration': list(trans),
+                  'token-analysis': [{'variants': [{'words': list(variants)}]}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
 
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
index d0ed21ec..366d2aee 100644
--- a/test/python/test_tokenizer_icu_name_processor.py
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -28,10 +28,10 @@ def cfgfile(def_config, tmp_path):
             - ":: Latin ()"
             - "'🜵' > ' '"
         """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
-            content += "    {}: {}\n".format(k, v)
+            content += "        {}: {}\n".format(k, v)
         (project_dir / 'icu_tokenizer.yaml').write_text(content)
         return def_config
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
index 6ec53edc..5d931043 100644
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -34,8 +34,8 @@ def cfgrules(test_config):
            - ":: Latin ()"
            - "[[:Punctuation:][:Space:]]+ > ' '"
         """)
-        content += "variants:\n  - words:\n"
-        content += '\n'.join(("      - " + s for s in variants)) + '\n'
+        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
@@ -49,20 +49,20 @@ def test_empty_rule_set(test_config):
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
-        variants:
+        token-analysis:
+          - variants:
         """))
 
     rules = ICURuleLoader(test_config)
 
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
-    assert list(rules.get_replacement_pairs()) == []
 
 
-CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
+CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
 def test_missing_section(section, test_config):
-    rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    rule_cfg = { s: [] for s in CONFIG_SECTIONS if s != section}
 
     (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
@@ -107,7 +107,8 @@ def test_transliteration_rules_from_file(test_config):
         transliteration:
             - "'ax' > 'b'"
             - !include transliteration.yaml
-        variants:
+        token-analysis:
+          - variants:
         """))
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
@@ -127,7 +128,7 @@ class TestGetReplacements:
 
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.get_replacement_pairs()
+        rules = loader.analysis[None].variants
 
         return set((v.source, v.replacement) for v in rules)
-- 
2.39.5
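
Illustration (not part of the patch): the test fixtures above build the new
'token-analysis' section as a Python dict and dump it to YAML, so a minimal
sketch of a configuration with more than one analyser can be written the same
way. The 'de' id and its variant rule below are made-up examples; as the
commit message states, only the entry without an id is currently used for
analysis.

    import yaml

    # One default entry (no 'id') plus one entry with a unique 'id'
    # ('de' is a hypothetical analyser id, not defined by this patch).
    cfg = {'normalization': ["[[:Punctuation:][:Space:]]+ > ' '"],
           'transliteration': [':: upper()'],
           'sanitizers': [],
           'token-analysis': [{'variants': [{'words': ['street => st']}]},
                              {'id': 'de',
                               'variants': [{'words': ['~strasse -> str']}]}]}

    # Written to <project_dir>/icu_tokenizer.yaml, this would make
    # ICURuleLoader._setup_analysis() register the entries under the keys
    # None and 'de'; a second entry without an id, or a duplicate id,
    # raises a UsageError. make_token_analysis() still always picks
    # self.analysis[None].
    print(yaml.dump(cfg))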