X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4c52777ef03738803845f9ee58d269d93bbb9c3d..16daa57e4757e4daeffec1e61630f989727dc563:/nominatim/tokenizer/icu_rule_loader.py diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index d3141bf7..bd0739f2 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -2,30 +2,36 @@ Helper class to create ICU rules from a configuration file. """ import io +import json import logging import itertools -from pathlib import Path import re -import yaml from icu import Transliterator +from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() -def _flatten_yaml_list(content): +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules" + + +def _flatten_config_list(content): if not content: return [] if not isinstance(content, list): - raise UsageError("List expected in ICU yaml configuration.") + raise UsageError("List expected in ICU configuration.") output = [] for ele in content: if isinstance(ele, list): - output.extend(_flatten_yaml_list(ele)) + output.extend(_flatten_config_list(ele)) else: output.append(ele) @@ -48,14 +54,43 @@ class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. 
""" - def __init__(self, configfile): - self.configfile = configfile + def __init__(self, config): + rules = config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG') + self.variants = set() - if configfile.suffix == '.yaml': - self._load_from_yaml() - else: - raise UsageError("Unknown format of tokenizer configuration.") + self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') + self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') + self.analysis_rules = self._get_section(rules, 'variants') + self._parse_variant_list() + + + def load_config_from_db(self, conn): + """ Get previously saved parts of the configuration from the + database. + """ + self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) + self._parse_variant_list() + + + def save_config_to_db(self, conn): + """ Save the part of the configuration that cannot be changed into + the database. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules) + set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + + + def make_token_analysis(self): + """ Create a token analyser from the previously loaded rules. 
+ """ + return ICUNameProcessor(self.normalization_rules, + self.transliteration_rules, + self.variants) def get_search_rules(self): @@ -88,34 +123,14 @@ class ICURuleLoader: """ return self.variants - def _yaml_include_representer(self, loader, node): - value = loader.construct_scalar(node) - - if Path(value).is_absolute(): - content = Path(value).read_text() - else: - content = (self.configfile.parent / value).read_text() - - return yaml.safe_load(content) - - def _load_from_yaml(self): - yaml.add_constructor('!include', self._yaml_include_representer, - Loader=yaml.SafeLoader) - rules = yaml.safe_load(self.configfile.read_text()) - - self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') - self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_variant_list(self._get_section(rules, 'variants')) - - - def _get_section(self, rules, section): + @staticmethod + def _get_section(rules, section): """ Get the section named 'section' from the rules. If the section does not exist, raise a usage error with a meaningful message. """ if section not in rules: - LOG.fatal("Section '%s' not found in tokenizer config '%s'.", - section, str(self.configfile)) + LOG.fatal("Section '%s' not found in tokenizer config.", section) raise UsageError("Syntax error in tokenizer configuration file.") return rules[section] @@ -133,16 +148,18 @@ class ICURuleLoader: if content is None: return '' - return ';'.join(_flatten_yaml_list(content)) + ';' + return ';'.join(_flatten_config_list(content)) + ';' + + def _parse_variant_list(self): + rules = self.analysis_rules - def _parse_variant_list(self, rules): self.variants.clear() if not rules: return - rules = _flatten_yaml_list(rules) + rules = _flatten_config_list(rules) vmaker = _VariantMaker(self.normalization_rules)