@abstractmethod
- def init_from_project(self) -> None:
+ def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
+
+ Arguments:
+ config: Read-only object with configuration options.
"""
pass
@abstractmethod
- def check_database(self) -> str:
+ def check_database(self, config: Configuration) -> str:
""" Check that the database is set up correctly and ready for being
queried.
If an issue was found, return an error message with a
description of the issue as well as hints for the user on
how to resolve the issue.
+ Arguments:
+ config: Read-only object with configuration options.
+
Return `None`, if no issue was found.
"""
pass
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
- tokenizer.init_from_project()
+ tokenizer.init_from_project(config)
return tokenizer
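# Illustrative sketch, not taken from the patch: a hypothetical FooTokenizer
# showing how the widened interface is meant to be used. init_from_project()
# and check_database() now receive the read-only Configuration, so a tokenizer
# can reload its rule files from the project directory instead of
# reconstructing all state from database properties.

from nominatim.config import Configuration
from nominatim.tokenizer.base import AbstractTokenizer

class FooTokenizer(AbstractTokenizer):
    """ Hypothetical tokenizer, shown only to illustrate the new signatures
        (the remaining abstract methods are omitted).
    """

    def init_from_project(self, config: Configuration) -> None:
        # Re-read the frozen rules from the project directory.
        self.rules = config.load_sub_configuration('foo_tokenizer.yaml',
                                                   config='TOKENIZER_CONFIG')

    def check_database(self, config: Configuration) -> str:
        self.init_from_project(config)
        if not self.rules:
            return "Configuration for tokenizer 'foo' is missing."
        return None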
from icu import Transliterator
import datrie
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
- """ Data object that saves the rules needed for the name processor.
-
- The rules can either be initialised through an ICURuleLoader or
- be loaded from a database when a connection is given.
- """
- def __init__(self, loader=None, conn=None):
- if loader is not None:
- self.norm_rules = loader.get_normalization_rules()
- self.trans_rules = loader.get_transliteration_rules()
- self.replacements = loader.get_replacement_pairs()
- self.search_rules = loader.get_search_rules()
- elif conn is not None:
- self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
- self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
- self.replacements = \
- variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
- self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
- else:
- assert False, "Parameter loader or conn required."
-
-
- def save_rules(self, conn):
- """ Save the rules in the property table of the given database.
- the rules can be loaded again by handing in a connection into
- the constructor of the class.
- """
- set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
- set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
- set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
- variants.pickle_variant_set(self.replacements))
- set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
class ICUNameProcessor:
""" Collects the different transformation rules for normalisation of names
- and provides the functions to aply the transformations.
+ and provides the functions to apply the transformations.
"""
- def __init__(self, rules):
+ def __init__(self, norm_rules, trans_rules, replacements):
self.normalizer = Transliterator.createFromRules("icu_normalization",
- rules.norm_rules)
+ norm_rules)
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
- rules.trans_rules +
+ trans_rules +
";[:Space:]+ > ' '")
self.search = Transliterator.createFromRules("icu_search",
- rules.search_rules)
+ norm_rules + trans_rules)
# Intermediate reorder by source. Also compute required character set.
immediate = defaultdict(list)
chars = set()
- for variant in rules.replacements:
+ for variant in replacements:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:
Helper class to create ICU rules from a configuration file.
"""
import io
+import json
import logging
import itertools
import re
from icu import Transliterator
+from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
import nominatim.tokenizer.icu_variants as variants
LOG = logging.getLogger()
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
+
def _flatten_config_list(content):
if not content:
return []
""" Compiler for ICU rules from a tokenizer configuration file.
"""
- def __init__(self, rules):
+ def __init__(self, config):
+ rules = config.load_sub_configuration('icu_tokenizer.yaml',
+ config='TOKENIZER_CONFIG')
+
self.variants = set()
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
- self._parse_variant_list(self._get_section(rules, 'variants'))
+ self.analysis_rules = self._get_section(rules, 'variants')
+ self._parse_variant_list()
+
+
+ def load_config_from_db(self, conn):
+ """ Get previously saved parts of the configuration from the
+ database.
+ """
+ self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+ self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+ self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+ self._parse_variant_list()
+
+
+ def save_config_to_db(self, conn):
+ """ Save the part of the configuration that cannot be changed into
+ the database.
+ """
+ set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+ set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+ set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+ def make_token_analysis(self):
+ """ Create a token analyser from the reviouly loaded rules.
+ """
+ return ICUNameProcessor(self.normalization_rules,
+ self.transliteration_rules,
+ self.variants)
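# Minimal lifecycle sketch, assuming `config` is a Configuration and `dsn` a
# libpq connection string: the rules are compiled from icu_tokenizer.yaml and
# frozen in the property table during import; later invocations restore them
# from the database and build the name processor from the same data.

from nominatim.db.connection import connect
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

loader = ICURuleLoader(config)            # compiles rules from icu_tokenizer.yaml

with connect(dsn) as conn:
    loader.save_config_to_db(conn)        # at import time: freeze the rules

with connect(dsn) as conn:
    loader.load_config_from_db(conn)      # on later runs: restore the frozen rules

analyzer = loader.make_token_analysis()   # ICUNameProcessor(norm, trans, variants)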
def get_search_rules(self):
return ';'.join(_flatten_config_list(content)) + ';'
- def _parse_variant_list(self, rules):
+ def _parse_variant_list(self):
+ rules = self.analysis_rules
+
self.variants.clear()
if not rules:
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
- self.naming_rules = None
+ self.loader = None
self.term_normalization = None
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
- loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
- config='TOKENIZER_CONFIG'))
- self.naming_rules = ICUNameProcessorRules(loader=loader)
+ self.loader = ICURuleLoader(config)
+
self.term_normalization = config.TERM_NORMALIZATION
self._install_php(config.lib_dir.php)
self._init_db_tables(config)
- def init_from_project(self):
+ def init_from_project(self, config):
""" Initialise the tokenizer from the project directory.
"""
+ self.loader = ICURuleLoader(config)
+
with connect(self.dsn) as conn:
- self.naming_rules = ICUNameProcessorRules(conn=conn)
+ self.loader.load_config_from_db(conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
- def check_database(self):
+ def check_database(self, config):
""" Check that the tokenizer is set up correctly.
"""
- self.init_from_project()
+ self.init_from_project(config)
- if self.naming_rules is None:
+ if self.term_normalization is None:
return "Configuration for tokenizer 'icu' are missing."
return None
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
- return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+ return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
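# Usage sketch for the thread-safety note above, assuming the context-manager
# support of AbstractAnalyzer and its process_place() entry point: every worker
# thread requests its own analyzer from the shared tokenizer and closes it
# when done.

def index_worker(tokenizer, places):
    with tokenizer.name_analyzer() as analyzer:   # one analyzer per thread
        for place in places:
            analyzer.process_place(place)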
def _install_php(self, phpdir):
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
- @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+ @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
Save the tokenizer configuration that needs to remain stable into the
database as database properties.
"""
with connect(self.dsn) as conn:
- self.naming_rules.save_rules(conn)
-
+ self.loader.save_config_to_db(conn)
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
""" Count the partial terms from the names in the place table.
"""
words = Counter()
- name_proc = ICUNameProcessor(self.naming_rules)
+ name_proc = self.loader.make_token_analysis()
with conn.cursor(name="words") as cur:
cur.execute(""" SELECT v, count(*) FROM
Data structures for saving variant expansions for ICU tokenizer.
"""
from collections import namedtuple
-import json
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
- """ Serializes an iterable of variant rules to a string.
- """
- # Create a list of property sets. So they don't need to be duplicated
- properties = {}
- pid = 1
- for variant in variants:
- if variant.properties not in properties:
- properties[variant.properties] = pid
- pid += 1
-
- # Convert the variants into a simple list.
- variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
- # Convert everythin to json.
- return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
- 'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
- """ Deserializes a variant string that was previously created with
- pickle_variant_set() into a set of ICUVariants.
- """
- data = json.loads(variant_string)
-
- properties = {int(k): ICUVariantProperties.from_rules(v)
- for k, v in data['properties'].items()}
-
- return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
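# Design note: the removed pickle_variant_set()/unpickle_variant_set() helpers
# serialised the fully expanded variant set. The loader now stores the raw
# 'variants' section of the configuration as JSON and re-expands it when it is
# loaded back. A stored property value might look roughly like this
# (hypothetical data):

import json

analysis_rules = [{'words': ['saint -> st', 'street -> str']}]
stored = json.dumps(analysis_rules)           # written by save_config_to_db()
assert json.loads(stored) == analysis_rules   # read back by load_config_from_db()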
self._init_db_tables(config)
- def init_from_project(self):
+ def init_from_project(self, _):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
modulepath=modulepath)
- def check_database(self):
+ def check_database(self, _):
""" Check that the tokenizer is set up correctly.
"""
hint = """\
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
- result = tokenizer.check_database()
+ result = tokenizer.check_database(config)
if result is None:
return CheckState.OK
Tokenizer for testing.
"""
from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
def create(dsn, data_dir):
""" Create a new instance of the tokenizer provided by this module.
self.init_state = "new"
- def init_from_project(self):
+ def init_from_project(self, config):
+ assert isinstance(config, Configuration)
assert self.init_state is None
self.init_state = "loaded"
import pytest
from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.db import properties
from nominatim.db.sql_preprocessor import SQLPreprocessor
cfgstr = {'normalization' : list(norm),
'transliteration' : list(trans),
'variants' : [ {'words': list(variants)}]}
- tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+ (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+ tok.loader = ICURuleLoader(test_config)
return tok.name_analyzer()
monkeypatch.undo()
tok = tokenizer_factory()
- tok.init_from_project()
+ tok.init_from_project(test_config)
- assert tok.naming_rules is not None
+ assert tok.loader is not None
assert tok.term_normalization == ':: lower();'
from textwrap import dedent
import pytest
-import yaml
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+ project_dir = tmp_path / 'project_dir'
+ project_dir.mkdir()
+ def_config.project_dir = project_dir
+
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs.items():
content += " {}: {}\n".format(k, v)
- return yaml.safe_load(content)
+ (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+ return def_config
return _create_config
def test_variants_empty(cfgfile):
- fpath = cfgfile('saint -> 🜵', 'street -> st')
+ config = cfgfile('saint -> 🜵', 'street -> st')
- rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
- proc = ICUNameProcessor(rules)
+ proc = ICURuleLoader(config).make_token_analysis()
assert get_normalized_variants(proc, '🜵') == []
assert get_normalized_variants(proc, '🜳') == []
@pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
def test_variants(cfgfile, rules, name, variants):
- fpath = cfgfile(*rules)
- proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+ config = cfgfile(*rules)
+ proc = ICURuleLoader(config).make_token_analysis()
result = get_normalized_variants(proc, name)
def test_search_normalized(cfgfile):
- fpath = cfgfile('~street => s,st', 'master => mstr')
-
- rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
- proc = ICUNameProcessor(rules)
+ config = cfgfile('~street => s,st', 'master => mstr')
+ proc = ICURuleLoader(config).make_token_analysis()
assert proc.get_search_normalized('Master Street') == 'master street'
assert proc.get_search_normalized('Earnes St') == 'earnes st'
from icu import Transliterator
@pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+ project_dir = tmp_path / 'project_dir'
+ project_dir.mkdir()
+ def_config.project_dir = project_dir
+
+ return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
def _create_config(*variants, **kwargs):
content = dedent("""\
normalization:
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs.items():
content += " {}: {}\n".format(k, v)
- return yaml.safe_load(content)
+ (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+ return test_config
return _create_config
-def test_empty_rule_set():
- rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+ (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
normalization:
transliteration:
variants:
"""))
- rules = ICURuleLoader(rule_cfg)
+ rules = ICURuleLoader(test_config)
assert rules.get_search_rules() == ''
assert rules.get_normalization_rules() == ''
assert rules.get_transliteration_rules() == ''
CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
@pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+ (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
with pytest.raises(UsageError):
- ICURuleLoader(rule_cfg)
+ ICURuleLoader(test_config)
def test_get_search_rules(cfgrules):
assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
-def test_transliteration_rules_from_file(def_config, tmp_path):
- def_config.project_dir = tmp_path
- cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+ cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
cfgpath.write_text(dedent("""\
normalization:
transliteration:
- !include transliteration.yaml
variants:
"""))
- transpath = tmp_path / ('transliteration.yaml')
+ transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
- loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+ loader = ICURuleLoader(test_config)
rules = loader.get_transliteration_rules()
trans = Transliterator.createFromRules("test", rules)
assert not (test_config.project_dir / 'module').exists()
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
tok = tokenizer_factory()
- tok.init_from_project()
+ tok.init_from_project(test_config)
assert tok.normalization is not None
check_result, state):
class _TestTokenizer:
@staticmethod
- def check_database():
+ def check_database(_):
return check_result
monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',