From: Sarah Hoffmann
Date: Fri, 29 Jul 2022 08:43:07 +0000 (+0200)
Subject: harmonize interface of token analysis module
X-Git-Tag: v4.1.0~4^2~5
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/c8873d34af083a7cb117214c6d852ef78e05acb7

harmonize interface of token analysis module

The configure() function now receives a Transliterator object instead
of the raw ICU rule strings. This harmonizes its parameters with those
of the create() function.
---

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index f461a1f1..aeb8a323 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -12,6 +12,8 @@ import io
 import json
 import logging
 
+from icu import Transliterator
+
 from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.connection import Connection
@@ -135,6 +137,11 @@ class ICURuleLoader:
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+        trans = Transliterator.createFromRules("rule_loader_transliteration",
+                                               self.transliteration_rules)
+
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
@@ -144,8 +151,7 @@ class ICURuleLoader:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
                 raise UsageError("Syntax error in ICU tokenizer config.")
-            self.analysis[name] = TokenAnalyzerRule(section,
-                                                    self.normalization_rules,
+            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
                                                     self.config)
 
 
@@ -170,7 +176,8 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules: Mapping[str, Any], normalization_rules: str,
+    def __init__(self, rules: Mapping[str, Any],
+                 normalizer: Any, transliterator: Any,
                  config: Configuration) -> None:
         analyzer_name = _get_section(rules, 'analyzer')
         if not analyzer_name or not isinstance(analyzer_name, str):
@@ -179,7 +186,8 @@ class TokenAnalyzerRule:
         self._analysis_mod: AnalysisModule = \
             config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
 
-        self.config = self._analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalizer,
+                                                   transliterator)
 
 
     def create(self, normalizer: Any, transliterator: Any) -> Analyser:
diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py
index 53264b94..d17a626c 100644
--- a/nominatim/tokenizer/token_analysis/base.py
+++ b/nominatim/tokenizer/token_analysis/base.py
@@ -30,7 +30,8 @@ class AnalysisModule(Protocol):
     """ Protocol for analysis modules.
     """
-    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any:
+    def configure(self, rules: Mapping[str, Any],
+                  normalizer: Any, transliterator: Any) -> Any:
         """ Prepare the configuration of the analysis module.
 
             This function should prepare all data that can be shared
             between instances of this analyser.
@@ -38,8 +39,10 @@ class AnalysisModule(Protocol):
             Arguments:
                 rules: A dictionary with the additional configuration options
                        as specified in the tokenizer configuration.
-                normalization_rules: ICU rules for normalization as a string
-                                     that can be used with createFromRules().
+                normalizer: an ICU Transliterator with the compiled normalization
+                            rules.
+                transliterator: an ICU Transliterator with the compiled
+                                transliteration rules.
 
             Returns:
                 A data object with the configuration that was set up. May be
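
To make the new protocol concrete, here is a minimal sketch of what a
plugin module looks like after this change. Everything below except the
configure()/create() signatures is illustrative: the 'words' option and
the MyAnalyser class are made up, not part of Nominatim's API:

    from typing import Any, List, Mapping


    def configure(rules: Mapping[str, Any],
                  normalizer: Any, transliterator: Any) -> Any:
        # configure() now receives compiled ICU Transliterator objects
        # and can apply them directly, instead of compiling the rule
        # strings itself with Transliterator.createFromRules().
        return [normalizer.transliterate(word).strip()
                for word in rules.get('words', [])]


    class MyAnalyser:
        def __init__(self, normalizer: Any, transliterator: Any,
                     config: Any) -> None:
            self.norm = normalizer
            self.trans = transliterator
            self.config = config


    def create(normalizer: Any, transliterator: Any, config: Any) -> MyAnalyser:
        # create() already took the compiled objects before this commit;
        # configure() now matches its parameters.
        return MyAnalyser(normalizer, transliterator, config)
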
diff --git a/nominatim/tokenizer/token_analysis/config_variants.py b/nominatim/tokenizer/token_analysis/config_variants.py
index d86d8072..1258373e 100644
--- a/nominatim/tokenizer/token_analysis/config_variants.py
+++ b/nominatim/tokenizer/token_analysis/config_variants.py
@@ -12,8 +12,6 @@ from collections import defaultdict
 import itertools
 import re
 
-from icu import Transliterator
-
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 
@@ -25,7 +23,7 @@ class ICUVariant(NamedTuple):
 
 
 def get_variant_config(in_rules: Any,
-                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
+                       normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
     """ Convert the variant definition from the configuration into
         replacement sets.
 
@@ -39,7 +37,7 @@ def get_variant_config(in_rules: Any,
     vset: Set[ICUVariant] = set()
     rules = flatten_config_list(in_rules, 'variants')
 
-    vmaker = _VariantMaker(normalization_rules)
+    vmaker = _VariantMaker(normalizer)
 
     for section in rules:
         for rule in (section.get('words') or []):
@@ -63,9 +61,8 @@ class _VariantMaker:
         All text in rules is normalized to make sure the variants match later.
     """
 
-    def __init__(self, norm_rules: Any) -> None:
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
+    def __init__(self, normalizer: Any) -> None:
+        self.norm = normalizer
 
 
     def compute(self, rule: Any) -> Iterator[ICUVariant]:
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index e14f844c..28cd0d94 100644
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -18,13 +18,13 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
 
 ### Configuration section
 
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
+def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
     """ Extract and preprocess the configuration for this module.
     """
     config: Dict[str, Any] = {}
 
     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
-                                                                 normalization_rules)
+                                                                 normalizer)
     config['variant_only'] = rules.get('mode', '') == 'variant-only'
 
     # parse mutation rules
diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py
index a0f4214d..e3048a09 100644
--- a/nominatim/tokenizer/token_analysis/housenumbers.py
+++ b/nominatim/tokenizer/token_analysis/housenumbers.py
@@ -8,7 +8,7 @@
 Specialized processor for housenumbers. Analyses common housenumber patterns
 and creates variants for them.
 """
-from typing import Mapping, Any, List, cast
+from typing import Any, List, cast
 import re
 
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
@@ -20,7 +20,7 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
 
 ### Configuration section
 
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
+def configure(*_: Any) -> None:
     """ All behaviour is currently hard-coded.
     """
     return None
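
_VariantMaker and get_variant_config() thus operate on a ready-made
normalizer. For a stand-alone impression of the objects now passed
around, here is a small sketch; the rule strings are shortened stand-ins,
not Nominatim's real normalization and transliteration configuration:

    from icu import Transliterator

    norm = Transliterator.createFromRules("rule_loader_normalization",
                                          ":: lower (); :: NFC ();")
    trans = Transliterator.createFromRules("rule_loader_transliteration",
                                           ":: Latin (); :: Latin-ASCII ();")

    print(norm.transliterate("Große Straße"))   # große straße
    print(trans.transliterate("Große Straße"))  # Grosse Strasse
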
""" return None diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py index 15b20bf9..f5b5b9c4 100644 --- a/nominatim/tokenizer/token_analysis/postcodes.py +++ b/nominatim/tokenizer/token_analysis/postcodes.py @@ -8,13 +8,13 @@ Specialized processor for postcodes. Supports a 'lookup' variant of the token, which produces variants with optional spaces. """ -from typing import Mapping, Any, List +from typing import Any, List from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator ### Configuration section -def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613 +def configure(*_: Any) -> None: """ All behaviour is currently hard-coded. """ return None diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py index afbd5e9b..18ed109b 100644 --- a/test/python/tokenizer/token_analysis/test_generic.py +++ b/test/python/tokenizer/token_analysis/test_generic.py @@ -30,9 +30,9 @@ def make_analyser(*variants, variant_only=False): rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]} if variant_only: rules['mode'] = 'variant-only' - config = module.configure(rules, DEFAULT_NORMALIZATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + config = module.configure(rules, norm, trans) return module.create(norm, trans, config) @@ -44,9 +44,9 @@ def get_normalized_variants(proc, name): def test_no_variants(): rules = { 'analyzer': 'generic' } - config = module.configure(rules, DEFAULT_NORMALIZATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + config = module.configure(rules, norm, trans) proc = module.create(norm, trans, config) @@ -123,7 +123,9 @@ class TestGetReplacements: @staticmethod def configure_rules(*variants): rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]} - return module.configure(rules, DEFAULT_NORMALIZATION) + trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) + norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + return module.configure(rules, norm, trans) def get_replacements(self, *variants): diff --git a/test/python/tokenizer/token_analysis/test_generic_mutation.py b/test/python/tokenizer/token_analysis/test_generic_mutation.py index abe31f6d..ee842355 100644 --- a/test/python/tokenizer/token_analysis/test_generic_mutation.py +++ b/test/python/tokenizer/token_analysis/test_generic_mutation.py @@ -31,9 +31,9 @@ class TestMutationNoVariants: 'mutations': [ {'pattern': m[0], 'replacements': m[1]} for m in mutations] } - config = module.configure(rules, DEFAULT_NORMALIZATION) trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION) norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION) + config = module.configure(rules, norm, trans) self.analysis = module.create(norm, trans, config)