From 16daa57e4757e4daeffec1e61630f989727dc563 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 17:37:04 +0200 Subject: [PATCH 1/1] unify ICUNameProcessorRules and ICURuleLoader There is no need for the additional layer of indirection that the ICUNameProcessorRules class adds. The ICURuleLoader can fill the database properties directly. --- nominatim/tokenizer/base.py | 10 +++- nominatim/tokenizer/factory.py | 2 +- nominatim/tokenizer/icu_name_processor.py | 54 +++---------------- nominatim/tokenizer/icu_rule_loader.py | 47 ++++++++++++++-- nominatim/tokenizer/icu_tokenizer.py | 29 +++++----- nominatim/tokenizer/icu_variants.py | 32 ----------- nominatim/tokenizer/legacy_tokenizer.py | 4 +- nominatim/tools/check_database.py | 2 +- test/python/dummy_tokenizer.py | 4 +- test/python/test_tokenizer_icu.py | 8 +-- .../test_tokenizer_icu_name_processor.py | 27 +++++----- test/python/test_tokenizer_icu_rule_loader.py | 35 +++++++----- test/python/test_tokenizer_legacy.py | 4 +- test/python/test_tools_check_database.py | 2 +- 14 files changed, 123 insertions(+), 137 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index e126507b..53289c78 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -149,11 +149,14 @@ class AbstractTokenizer(ABC): @abstractmethod - def init_from_project(self) -> None: + def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from an existing database setup. The function should load all previously saved configuration from the project directory and/or the property table. + + Arguments: + config: Read-only object with configuration options. """ pass @@ -187,7 +190,7 @@ class AbstractTokenizer(ABC): @abstractmethod - def check_database(self) -> str: + def check_database(self, config: Configuration) -> str: """ Check that the database is set up correctly and ready for being queried. @@ -196,6 +199,9 @@ class AbstractTokenizer(ABC): description of the issue as well as hints for the user on how to resolve the issue. + Arguments: + config: Read-only object with configuration options. + Return `None`, if no issue was found. """ pass diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py index 069672d4..dc3e7411 100644 --- a/nominatim/tokenizer/factory.py +++ b/nominatim/tokenizer/factory.py @@ -85,6 +85,6 @@ def get_tokenizer_for_db(config): tokenizer_module = _import_tokenizer(name) tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir) - tokenizer.init_from_project() + tokenizer.init_from_project(config) return tokenizer diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index 93d2b0ff..544f5ebc 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -8,67 +8,25 @@ import itertools from icu import Transliterator import datrie -from nominatim.db.properties import set_property, get_property -from nominatim.tokenizer import icu_variants as variants - -DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" -DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" -DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements" -DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization" - - -class ICUNameProcessorRules: - """ Data object that saves the rules needed for the name processor. - - The rules can either be initialised through an ICURuleLoader or - be loaded from a database when a connection is given. 
- """ - def __init__(self, loader=None, conn=None): - if loader is not None: - self.norm_rules = loader.get_normalization_rules() - self.trans_rules = loader.get_transliteration_rules() - self.replacements = loader.get_replacement_pairs() - self.search_rules = loader.get_search_rules() - elif conn is not None: - self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) - self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = \ - variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) - self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) - else: - assert False, "Parameter loader or conn required." - - - def save_rules(self, conn): - """ Save the rules in the property table of the given database. - the rules can be loaded again by handing in a connection into - the constructor of the class. - """ - set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) - set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, - variants.pickle_variant_set(self.replacements)) - set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) - class ICUNameProcessor: """ Collects the different transformation rules for normalisation of names - and provides the functions to aply the transformations. + and provides the functions to apply the transformations. """ - def __init__(self, rules): + def __init__(self, norm_rules, trans_rules, replacements): self.normalizer = Transliterator.createFromRules("icu_normalization", - rules.norm_rules) + norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules + + trans_rules + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", - rules.search_rules) + norm_rules + trans_rules) # Intermediate reorder by source. Also compute required character set. immediate = defaultdict(list) chars = set() - for variant in rules.replacements: + for variant in replacements: if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': replstr = variant.replacement[:-1] else: diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index 0e6e40b4..bd0739f2 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -2,17 +2,25 @@ Helper class to create ICU rules from a configuration file. """ import io +import json import logging import itertools import re from icu import Transliterator +from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules" + + def _flatten_config_list(content): if not content: return [] @@ -46,12 +54,43 @@ class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. 
""" - def __init__(self, rules): + def __init__(self, config): + rules = config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG') + self.variants = set() self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_variant_list(self._get_section(rules, 'variants')) + self.analysis_rules = self._get_section(rules, 'variants') + self._parse_variant_list() + + + def load_config_from_db(self, conn): + """ Get previously saved parts of the configuration from the + database. + """ + self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) + self._parse_variant_list() + + + def save_config_to_db(self, conn): + """ Save the part of the configuration that cannot be changed into + the database. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules) + set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + + + def make_token_analysis(self): + """ Create a token analyser from the reviouly loaded rules. + """ + return ICUNameProcessor(self.normalization_rules, + self.transliteration_rules, + self.variants) def get_search_rules(self): @@ -112,7 +151,9 @@ class ICURuleLoader: return ';'.join(_flatten_config_list(content)) + ';' - def _parse_variant_list(self, rules): + def _parse_variant_list(self): + rules = self.analysis_rules + self.variants.clear() if not rules: diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index fbaa2596..87906d71 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.tokenizer.icu_rule_loader import ICURuleLoader -from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" @@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer): def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.naming_rules = None + self.loader = None self.term_normalization = None @@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ - loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG')) - self.naming_rules = ICUNameProcessorRules(loader=loader) + self.loader = ICURuleLoader(config) + self.term_normalization = config.TERM_NORMALIZATION self._install_php(config.lib_dir.php) @@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer): self._init_db_tables(config) - def init_from_project(self): + def init_from_project(self, config): """ Initialise the tokenizer from the project directory. 
""" + self.loader = ICURuleLoader(config) + with connect(self.dsn) as conn: - self.naming_rules = ICUNameProcessorRules(conn=conn) + self.loader.load_config_from_db(conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) @@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer): sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') - def check_database(self): + def check_database(self, config): """ Check that the tokenizer is set up correctly. """ - self.init_from_project() + self.init_from_project(config) - if self.naming_rules is None: + if self.term_normalization is None: return "Configuration for tokenizer 'icu' are missing." return None @@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer): 🜵', 'street -> st') + config = cfgfile('saint -> 🜵', 'street -> st') - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + proc = ICURuleLoader(config).make_token_analysis() assert get_normalized_variants(proc, '🜵') == [] assert get_normalized_variants(proc, '🜳') == [] @@ -83,8 +86,8 @@ VARIANT_TESTS = [ @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS) def test_variants(cfgfile, rules, name, variants): - fpath = cfgfile(*rules) - proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath))) + config = cfgfile(*rules) + proc = ICURuleLoader(config).make_token_analysis() result = get_normalized_variants(proc, name) @@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants): def test_search_normalized(cfgfile): - fpath = cfgfile('~street => s,st', 'master => mstr') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + config = cfgfile('~street => s,st', 'master => mstr') + proc = ICURuleLoader(config).make_token_analysis() assert proc.get_search_normalized('Master Street') == 'master street' assert proc.get_search_normalized('Earnes St') == 'earnes st' diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py index c3480de8..6ec53edc 100644 --- a/test/python/test_tokenizer_icu_rule_loader.py +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -12,7 +12,16 @@ from nominatim.errors import UsageError from icu import Transliterator @pytest.fixture -def cfgrules(): +def test_config(def_config, tmp_path): + project_dir = tmp_path / 'project_dir' + project_dir.mkdir() + def_config.project_dir = project_dir + + return def_config + + +@pytest.fixture +def cfgrules(test_config): def _create_config(*variants, **kwargs): content = dedent("""\ normalization: @@ -29,19 +38,21 @@ def cfgrules(): content += '\n'.join((" - " + s for s in variants)) + '\n' for k, v in kwargs: content += " {}: {}\n".format(k, v) - return yaml.safe_load(content) + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content) + + return test_config return _create_config -def test_empty_rule_set(): - rule_cfg = yaml.safe_load(dedent("""\ +def test_empty_rule_set(test_config): + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\ normalization: transliteration: variants: """)) - rules = ICURuleLoader(rule_cfg) + rules = ICURuleLoader(test_config) assert rules.get_search_rules() == '' 
assert rules.get_normalization_rules() == '' assert rules.get_transliteration_rules() == '' @@ -50,11 +61,12 @@ def test_empty_rule_set(): CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants') @pytest.mark.parametrize("section", CONFIG_SECTIONS) -def test_missing_section(section): +def test_missing_section(section, test_config): rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section} + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg)) with pytest.raises(UsageError): - ICURuleLoader(rule_cfg) + ICURuleLoader(test_config) def test_get_search_rules(cfgrules): @@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules): assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " -def test_transliteration_rules_from_file(def_config, tmp_path): - def_config.project_dir = tmp_path - cfgpath = tmp_path / ('test_config.yaml') +def test_transliteration_rules_from_file(test_config): + cfgpath = test_config.project_dir / ('icu_tokenizer.yaml') cfgpath.write_text(dedent("""\ normalization: transliteration: @@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path): - !include transliteration.yaml variants: """)) - transpath = tmp_path / ('transliteration.yaml') + transpath = test_config.project_dir / ('transliteration.yaml') transpath.write_text('- "x > y"') - loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml')) + loader = ICURuleLoader(test_config) rules = loader.get_transliteration_rules() trans = Transliterator.createFromRules("test", rules) diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 2545c2db..53d45c1c 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config, assert not (test_config.project_dir / 'module').exists() -def test_init_from_project(tokenizer_setup, tokenizer_factory): +def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config): tok = tokenizer_factory() - tok.init_from_project() + tok.init_from_project(test_config) assert tok.normalization is not None diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index aed5cb7e..edba3236 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch, check_result, state): class _TestTokenizer: @staticmethod - def check_database(): + def check_database(_): return check_result monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db', -- 2.39.5
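
Note for reviewers (not part of the patch): a minimal sketch of how the unified loader is meant to be used after this change, assuming a nominatim Configuration whose project directory contains an icu_tokenizer.yaml and a libpq DSN; the connect() helper import mirrors what the tokenizer code already uses, and the function names here are only illustrative wrappers around the calls added in this patch.

from nominatim.db.connection import connect
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader


def import_phase(config, dsn):
    # The loader now reads icu_tokenizer.yaml itself via
    # config.load_sub_configuration() instead of being handed parsed rules.
    loader = ICURuleLoader(config)
    with connect(dsn) as conn:
        # Persists the normalisation/transliteration rules and the variant
        # (analysis) rules as JSON in the property table.
        loader.save_config_to_db(conn)
    # The ICUNameProcessor is built directly from the loader; the removed
    # ICUNameProcessorRules indirection is no longer needed.
    return loader.make_token_analysis()


def query_phase(config, dsn):
    # Later runs restore the saved rules instead of re-parsing the YAML file.
    loader = ICURuleLoader(config)
    with connect(dsn) as conn:
        loader.load_config_from_db(conn)
    analyzer = loader.make_token_analysis()
    # Returns the lower-cased, transliterated form used for search terms.
    return analyzer.get_search_normalized('Master Street')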