import json
import logging
+from icu import Transliterator
+
from nominatim.config import flatten_config_list, Configuration
from nominatim.db.properties import set_property, get_property
from nominatim.db.connection import Connection
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
+ norm = Transliterator.createFromRules("rule_loader_normalization",
+ self.normalization_rules)
+ trans = Transliterator.createFromRules("rule_loader_transliteration",
+ self.transliteration_rules)
+
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
- self.analysis[name] = TokenAnalyzerRule(section,
- self.normalization_rules,
+ self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.config)
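# A minimal sketch of the compile-once pattern introduced above: the
# loader builds each ICU Transliterator exactly once and shares it with
# every analyzer. The rule string here is illustrative, not Nominatim's
# default normalization.
from icu import Transliterator

example_norm = Transliterator.createFromRules("example-norm",
                                              ":: Lower; :: NFC;")
assert example_norm.transliterate("Königstraße") == "königstraße"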
and creates a new token analyzer on request.
"""
- def __init__(self, rules: Mapping[str, Any], normalization_rules: str,
+ def __init__(self, rules: Mapping[str, Any],
+ normalizer: Any, transliterator: Any,
config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str):
self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
- self.config = self._analysis_mod.configure(rules, normalization_rules)
+ self.config = self._analysis_mod.configure(rules, normalizer,
+ transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyser:
""" Protocol for analysis modules.
"""
- def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any:
+ def configure(self, rules: Mapping[str, Any],
+ normalizer: Any, transliterator: Any) -> Any:
""" Prepare the configuration of the analysis module.
This function should prepare all data that can be shared
between instances of this analyser.
Arguments:
rules: A dictionary with the additional configuration options
as specified in the tokenizer configuration.
- normalization_rules: ICU rules for normalization as a string
- that can be used with createFromRules().
+ normalizer: an ICU Transliterator with the compiled normalization
+ rules.
+ transliterator: an ICU Transliterator with the compiled
+ transliteration rules.
Returns:
A data object with the configuration that was set up. May be
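# A minimal sketch of an analysis module written against this protocol;
# the word-list handling is hypothetical, only the signatures mirror the
# protocol above.
from typing import Any, Mapping

def configure(rules: Mapping[str, Any], normalizer: Any,
              transliterator: Any) -> Any:
    # Pre-normalize configured words once; every analyser instance
    # created later can share the result without recompiling ICU rules.
    return [normalizer.transliterate(w) for w in rules.get('words', [])]

def create(normalizer: Any, transliterator: Any, config: Any) -> Any:
    # Build a per-thread analyser from the shared config (omitted here).
    ...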
import itertools
import re
-from icu import Transliterator
-
from nominatim.config import flatten_config_list
from nominatim.errors import UsageError
def get_variant_config(in_rules: Any,
- normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
+ normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
""" Convert the variant definition from the configuration into
replacement sets.
vset: Set[ICUVariant] = set()
rules = flatten_config_list(in_rules, 'variants')
- vmaker = _VariantMaker(normalization_rules)
+ vmaker = _VariantMaker(normalizer)
for section in rules:
for rule in (section.get('words') or []):
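# A minimal sketch of the new call, assuming the "word => abbreviation"
# variant syntax from the tokenizer documentation, an illustrative
# normalizer rule string, and the module path used in this patch.
from icu import Transliterator
from nominatim.tokenizer.token_analysis.generic import get_variant_config

norm = Transliterator.createFromRules("example-norm", ":: Lower; :: NFC;")
replacements, chars = get_variant_config([{'words': ['bridge => bdge,br']}],
                                         norm)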
All text in rules is normalized to make sure the variants match later.
"""
- def __init__(self, norm_rules: Any) -> None:
- self.norm = Transliterator.createFromRules("rule_loader_normalization",
- norm_rules)
+ def __init__(self, normalizer: Any) -> None:
+ self.norm = normalizer
def compute(self, rule: Any) -> Iterator[ICUVariant]:
### Configuration section
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
+def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
""" Extract and preprocess the configuration for this module.
"""
config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
- normalization_rules)
+ normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules
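# An illustrative rules dict this configure() accepts; the variant and
# mutation values are examples, not Nominatim defaults.
rules = {
    'analyzer': 'generic',
    'mode': 'variant-only',                    # optional
    'variants': [{'words': ['bridge => bdge,br']}],
    'mutations': [{'pattern': 'ä', 'replacements': ['ä', 'ae']}],
}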
Specialized processor for housenumbers. Analyses common housenumber patterns
and creates variants for them.
"""
-from typing import Mapping, Any, List, cast
+from typing import Any, List, cast
import re
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
+def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
Specialized processor for postcodes. Supports a 'lookup' variant of the
token, which produces variants with optional spaces.
"""
-from typing import Mapping, Any, List
+from typing import Any, List
from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
### Configuration section
-def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
+def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
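# Sketch: the variadic signature lets the loader call every module the
# same way; these hard-coded modules simply ignore all arguments. The
# import path is assumed from this patch's package layout.
from icu import Transliterator
from nominatim.tokenizer.token_analysis import postcodes

norm = Transliterator.createFromRules("n", ":: Lower;")
trans = Transliterator.createFromRules("t", ":: Any-Latin;")
assert postcodes.configure({'analyzer': 'postcodes'}, norm, trans) is None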
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
if variant_only:
rules['mode'] = 'variant-only'
- config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+ config = module.configure(rules, norm, trans)
return module.create(norm, trans, config)
def test_no_variants():
rules = { 'analyzer': 'generic' }
- config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+ config = module.configure(rules, norm, trans)
proc = module.create(norm, trans, config)
@staticmethod
def configure_rules(*variants):
rules = { 'analyzer': 'generic', 'variants': [{'words': variants}]}
- return module.configure(rules, DEFAULT_NORMALIZATION)
+ trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
+ norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+ return module.configure(rules, norm, trans)
def get_replacements(self, *variants):
'mutations': [ {'pattern': m[0], 'replacements': m[1]}
for m in mutations]
}
- config = module.configure(rules, DEFAULT_NORMALIZATION)
trans = Transliterator.createFromRules("test_trans", DEFAULT_TRANSLITERATION)
norm = Transliterator.createFromRules("test_norm", DEFAULT_NORMALIZATION)
+ config = module.configure(rules, norm, trans)
self.analysis = module.create(norm, trans, config)