git.openstreetmap.org Git - nominatim.git/commitdiff
unify ICUNameProcessorRules and ICURuleLoader
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 29 Sep 2021 15:37:04 +0000 (17:37 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 1 Oct 2021 10:27:24 +0000 (12:27 +0200)
There is no need for the additional layer of indirection that
the ICUNameProcessorRules class adds. The ICURuleLoader can
fill the database properties directly.
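
For callers the change collapses two objects into one. A rough sketch of the resulting flow, assuming `config` is a Nominatim Configuration object and `conn` an open database connection (method names as they appear in the diff below, error handling omitted):

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    # Previously the rules went through an extra wrapper:
    #   loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
    #                                                        config='TOKENIZER_CONFIG'))
    #   rules = ICUNameProcessorRules(loader=loader)  # or ICUNameProcessorRules(conn=conn)
    #   rules.save_rules(conn)

    # Now the loader reads the configuration and talks to the database itself.
    loader = ICURuleLoader(config)
    loader.save_config_to_db(conn)       # at import time
    loader.load_config_from_db(conn)     # when opening an existing database
    name_processor = loader.make_token_analysis()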

14 files changed:
nominatim/tokenizer/base.py
nominatim/tokenizer/factory.py
nominatim/tokenizer/icu_name_processor.py
nominatim/tokenizer/icu_rule_loader.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/icu_variants.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/check_database.py
test/python/dummy_tokenizer.py
test/python/test_tokenizer_icu.py
test/python/test_tokenizer_icu_name_processor.py
test/python/test_tokenizer_icu_rule_loader.py
test/python/test_tokenizer_legacy.py
test/python/test_tools_check_database.py

diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py
index e126507bde1c07be7653072e05349d0a7959fc55..53289c7816e9a32ca671c2205d35f19b217d5d68 100644 (file)
@@ -149,11 +149,14 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from an existing database setup.
 
             The function should load all previously saved configuration from
             the project directory and/or the property table.
+
+            Arguments:
+              config: Read-only object with configuration options.
         """
         pass
 
@@ -187,7 +190,7 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> str:
         """ Check that the database is set up correctly and ready for being
             queried.
 
@@ -196,6 +199,9 @@ class AbstractTokenizer(ABC):
               description of the issue as well as hints for the user on
               how to resolve the issue.
 
+            Arguments:
+              config: Read-only object with configuration options.
+
               Return `None`, if no issue was found.
         """
         pass
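
Implementations of the abstract tokenizer now receive the configuration in both entry points. A purely hypothetical subclass, sketched only for the two signatures that change in this commit (all other abstract methods are untouched and omitted):

    from nominatim.config import Configuration
    from nominatim.tokenizer.base import AbstractTokenizer

    class ExampleTokenizer(AbstractTokenizer):
        # Hypothetical implementer, used for illustration only.

        def init_from_project(self, config: Configuration) -> None:
            # Reload previously saved rules from the project directory
            # and/or the property table, using the handed-in configuration.
            self.config = config

        def check_database(self, config: Configuration) -> str:
            # Report problems as a human-readable string, or None if the
            # database is ready for querying.
            self.init_from_project(config)
            return None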
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py
index 069672d4a1fd4d9b874943b5d44a367d4f2ef9e8..dc3e7411fa4e865c62e356b6187c910c4ea72b4c 100644 (file)
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
index 93d2b0ffa26b9151ccba1928c0e7d0745ce4380a..544f5ebce9bf8f3d9b9e477d5689e1a142e65853 100644 (file)
@@ -8,67 +8,25 @@ import itertools
 from icu import Transliterator
 import datrie
 
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            the rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
 
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
+        and provides the functions to apply the transformations.
     """
 
-    def __init__(self, rules):
+    def __init__(self, norm_rules, trans_rules, replacements):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
+                                                         norm_rules)
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
+                                                       trans_rules +
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
+                                                     norm_rules + trans_rules)
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
-        for variant in rules.replacements:
+        for variant in replacements:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else:
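
The name processor now takes the raw normalization rules, transliteration rules and variant set instead of a rules wrapper. In practice an instance comes from ICURuleLoader.make_token_analysis() (see the loader changes below), which boils down to something like this sketch, with `loader` an already constructed ICURuleLoader:

    # Roughly what make_token_analysis() does internally.
    proc = ICUNameProcessor(loader.get_normalization_rules(),
                            loader.get_transliteration_rules(),
                            loader.variants)

    # The analyser interface itself is unchanged, e.g.
    proc.get_search_normalized('Master Street')  # 'master street' with a
                                                 # lower-casing normalization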
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 0e6e40b4c88dc3109e5aa9fa60cb27925458454b..bd0739f2f40e0b8023a6e45fb8fd2dfe03ad1c2f 100644 (file)
@@ -2,17 +2,25 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
+import json
 import logging
 import itertools
 import re
 
 from icu import Transliterator
 
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
 
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
+
 def _flatten_config_list(content):
     if not content:
         return []
@@ -46,12 +54,43 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, rules):
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
         self.variants = set()
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
+        self.analysis_rules = self._get_section(rules, 'variants')
+        self._parse_variant_list()
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._parse_variant_list()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the previously loaded rules.
+        """
+        return ICUNameProcessor(self.normalization_rules,
+                                self.transliteration_rules,
+                                self.variants)
 
 
     def get_search_rules(self):
@@ -112,7 +151,9 @@ class ICURuleLoader:
         return ';'.join(_flatten_config_list(content)) + ';'
 
 
-    def _parse_variant_list(self, rules):
+    def _parse_variant_list(self):
+        rules = self.analysis_rules
+
         self.variants.clear()
 
         if not rules:
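
The loader now persists its state in three database properties, with the variant rules serialised as plain JSON; this is what makes the pickling helpers in icu_variants.py (below) obsolete. A sketch of reading the stored values back, assuming the connect() helper from nominatim.db.connection that the tokenizers use and a libpq DSN in `dsn`:

    import json

    from nominatim.db.connection import connect
    from nominatim.db.properties import get_property

    with connect(dsn) as conn:
        norm_rules = get_property(conn, 'tokenizer_import_normalisation')
        trans_rules = get_property(conn, 'tokenizer_import_transliteration')
        analysis_rules = json.loads(get_property(conn, 'tokenizer_import_analysis_rules'))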
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index fbaa25969dec5436a159ccb5d663e72aa1fc72ad..87906d71d75484639078c56d7b1dd9c0295a8572 100644 (file)
@@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
 
 
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
         self.term_normalization = config.TERM_NORMALIZATION
 
         self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
 
 
@@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             <?php
             @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
@@ -127,8 +127,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
@@ -163,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
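
Within LegacyICUTokenizer the loader object replaces self.naming_rules throughout. Seen from the outside (normally via tokenizer.factory.get_tokenizer_for_db), the life cycle condensed from the hunks above looks roughly like this, with `dsn`, `data_dir` and `config` as placeholders:

    from nominatim.tokenizer import icu_tokenizer

    tok = icu_tokenizer.create(dsn, data_dir)
    tok.init_from_project(config)    # builds an ICURuleLoader and restores the
                                     # saved rules from the database properties
    analyzer = tok.name_analyzer()   # one ICUNameProcessor per thread, created
                                     # through loader.make_token_analysis()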
diff --git a/nominatim/tokenizer/icu_variants.py b/nominatim/tokenizer/icu_variants.py
index 9ebe36849e50efbcc8896794bc3c983e65320da5..93272f584e703b48a28cae0293eb19281cdcc6e8 100644 (file)
@@ -2,7 +2,6 @@
 Data structures for saving variant expansions for ICU tokenizer.
 """
 from collections import namedtuple
-import json
 
 _ICU_VARIANT_PORPERTY_FIELDS = ['lang']
 
@@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORP
 
 
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets. So they don't need to be duplicated
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everythin to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index dc6972dc2cc8cee28959152c2419606f94b2efba..c935f20d4a9836e0f1c97ab74a5ce93a98b99ba1 100644 (file)
@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
                               modulepath=modulepath)
 
 
-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
diff --git a/nominatim/tools/check_database.py b/nominatim/tools/check_database.py
index d116554fea20f6e9b5e261adc2a48b0434fa5531..30b27d1f60accd64c5ece68d3d82d9841b6cc656 100644 (file)
@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
         return CheckState.FAIL, dict(msg="""\
             Cannot load tokenizer. Did the import finish sucessfully?""")
 
-    result = tokenizer.check_database()
+    result = tokenizer.check_database(config)
 
     if result is None:
         return CheckState.OK
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
index db0f32cda6a1f95b1e590a8b7b1ef4be83975659..6029eb7c6620b5f088f831dec66dede835664340 100644 (file)
@@ -2,6 +2,7 @@
 Tokenizer for testing.
 """
 from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -22,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
 
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index bbfc0b120d0a98405fb77aabdc1e48266ae541a2..4b7c56d58778e577af4dc96663c1c73fca020990 100644 (file)
@@ -7,7 +7,6 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
@@ -72,7 +71,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
         cfgstr = {'normalization' : list(norm),
                    'transliteration' : list(trans),
                    'variants' : [ {'words': list(variants)}]}
-        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+        tok.loader = ICURuleLoader(test_config)
 
         return tok.name_analyzer()
 
@@ -178,9 +178,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
 
 
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
index ae05988ae42ce4a69ab9942ef8ca39305b151924..d0ed21ecd8b6d8625a967b5b4a7a158360afcec7 100644 (file)
@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent
 
 import pytest
-import yaml
 
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
 from nominatim.errors import UsageError
 
 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join(("      - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config
 
     return _create_config
 
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
 
 
 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')
 
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     result = get_normalized_variants(proc, name)
 
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
 
 
 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
index c3480de87ac08a1b251666c0b61fb31f6405cfba..6ec53edcfa10ca0f403d7ebfa308b4cc555d9d7f 100644 (file)
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join(("      - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
         variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
             - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
 
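
The rule loader tests now exercise the same code path as production: they write an icu_tokenizer.yaml into the project directory and hand the Configuration to the loader. A hypothetical extra test following that pattern (relying on the test_config fixture and the dedent import already present in this file):

    def test_lowercase_rule_roundtrip(test_config):
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
            normalization:
                - ":: lower ()"
            transliteration:
            variants:
            """))

        loader = ICURuleLoader(test_config)

        assert ":: lower ()" in loader.get_normalization_rules()
        assert loader.get_transliteration_rules() == ''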
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
index 2545c2db5952e59eaccca1f26f581d713ad2a601..53d45c1c93a0f44f375ab11b675902497789b73e 100644 (file)
@@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
 
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None
 
diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py
index aed5cb7e98c75fe9a9f0b029cb04db9cceed8efe..edba32364c32d33b230dc969b118a1abd6412bd3 100644 (file)
@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',