introduce generic YAML config loader

author Sarah Hoffmann <lonvia@denofr.de>

Fri, 3 Sep 2021 16:16:12 +0000 (18:16 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Fri, 3 Sep 2021 16:20:07 +0000 (18:20 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 3 Sep 2021 16:16:12 +0000 (18:16 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 3 Sep 2021 16:20:07 +0000 (18:20 +0200)
diff --git a/nominatim/config.py b/nominatim/config.py

index a8436440b9f5ca78670ba9fe9e1cc8e3979ece96..c859a9d1142cbb72e90da118b8bddb38c0d8415e 100644 (file)
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -4,6 +4,7 @@ Nominatim configuration accessor.
  import logging
  import os
  from pathlib import Path
+import yaml
  
  from dotenv import dotenv_values
  
@@ -114,3 +115,96 @@ class Configuration:
          env.update(self.environ)
  
          return env
+
+
+    def load_sub_configuration(self, filename, config=None):
+        """ Load additional configuration from a file. `filename` is the name
+            of the configuration file. The file is first searched in the
+            project directory and then in the global settings directory.
+
+            If `config` is set, then the name of the configuration file can
+            be additionally given through a .env configuration option. When
+            the option is set, then the file will be exclusively loaded as set:
+            if the name is an absolute path, the file name is taken as is,
+            if the name is relative, it is taken to be relative to the
+            project directory.
+
+            The format of the file is determined from the filename suffix.
+            Only '.yaml' is supported; other suffixes raise a UsageError.
+
+            YAML files support a special '!include' construct. The value of
+            the directive is taken to be the name of another YAML file. That
+            file is parsed and its content is added at the position of the
+            directive in the configuration tree.
+        """
+        configfile = self._find_config_file(filename, config)
+
+        if configfile.suffix != '.yaml':
+            LOG.fatal("Format error while reading '%s': only YAML format supported.",
+                      configfile)
+            raise UsageError("Cannot handle config file format.")
+
+        return self._load_from_yaml(configfile)
+
+
+    def _find_config_file(self, filename, config=None):
+        """ Resolve the location of a configuration file. When `config` names
+            an option that is set, only the file given there is considered
+            (relative to the project directory). Otherwise `filename` is tried in
+            the project, then the config directory. Raises UsageError if missing.
+        """
+        if config is not None:
+            cfg_filename = self.__getattr__(config)
+            if cfg_filename:
+                cfg_filename = Path(cfg_filename)
+
+                if not cfg_filename.is_absolute():
+                    cfg_filename = self.project_dir / cfg_filename
+
+                cfg_filename = cfg_filename.resolve()
+
+                if not cfg_filename.is_file():
+                    LOG.fatal("Cannot find config file '%s'.", cfg_filename)
+                    raise UsageError("Config file not found.")
+
+                return cfg_filename
+
+
+        search_paths = [self.project_dir, self.config_dir]
+        for path in search_paths:
+            if (path / filename).is_file():
+                return path / filename
+
+        LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
+                  filename, search_paths)
+        raise UsageError("Config file not found.")
+
+
+    def _load_from_yaml(self, cfgfile):
+        """ Load a YAML configuration file. The '!include' handler is registered
+            on yaml.SafeLoader first, so that other YAML files can be included.
+        """
+        yaml.add_constructor('!include', self._yaml_include_representer,
+                             Loader=yaml.SafeLoader)
+        return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))
+
+
+    def _yaml_include_representer(self, loader, node):
+        """ Handler for the '!include' operator in YAML files.
+
+            An absolute filename is used as is. A relative filename is searched
+            first in the project directory, then in the global settings directory.
+        """
+        fname = loader.construct_scalar(node)
+
+        if Path(fname).is_absolute():
+            configfile = Path(fname)
+        else:
+            configfile = self._find_config_file(fname)
+
+        if configfile.suffix != '.yaml':
+            LOG.fatal("Format error while reading '%s': only YAML format supported.",
+                      configfile)
+            raise UsageError("Cannot handle config file format.")
+
+        return yaml.safe_load(configfile.read_text(encoding='utf-8'))
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index b408f1c3f98a7fc965146d1a7eaef47f9186a37b..0e6e40b4c88dc3109e5aa9fa60cb27925458454b 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -4,10 +4,8 @@ Helper class to create ICU rules from a configuration file.
  import io
  import logging
  import itertools
-from pathlib import Path
  import re
  
-import yaml
  from icu import Transliterator
  
  from nominatim.errors import UsageError
@@ -15,17 +13,17 @@ import nominatim.tokenizer.icu_variants as variants
  
  LOG = logging.getLogger()
  
-def _flatten_yaml_list(content):
+def _flatten_config_list(content):
      if not content:
          return []
  
      if not isinstance(content, list):
-        raise UsageError("List expected in ICU yaml configuration.")
+        raise UsageError("List expected in ICU configuration.")
  
      output = []
      for ele in content:
          if isinstance(ele, list):
-            output.extend(_flatten_yaml_list(ele))
+            output.extend(_flatten_config_list(ele))
          else:
              output.append(ele)
  
@@ -48,14 +46,12 @@ class ICURuleLoader:
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
-    def __init__(self, configfile):
-        self.configfile = configfile
+    def __init__(self, rules):
          self.variants = set()
  
-        if configfile.suffix == '.yaml':
-            self._load_from_yaml()
-        else:
-            raise UsageError("Unknown format of tokenizer configuration.")
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_variant_list(self._get_section(rules, 'variants'))
  
  
      def get_search_rules(self):
@@ -88,34 +84,14 @@ class ICURuleLoader:
          """
          return self.variants
  
-    def _yaml_include_representer(self, loader, node):
-        value = loader.construct_scalar(node)
-
-        if Path(value).is_absolute():
-            content = Path(value)
-        else:
-            content = (self.configfile.parent / value)
-
-        return yaml.safe_load(content.read_text(encoding='utf-8'))
-
-
-    def _load_from_yaml(self):
-        yaml.add_constructor('!include', self._yaml_include_representer,
-                             Loader=yaml.SafeLoader)
-        rules = yaml.safe_load(self.configfile.read_text(encoding='utf-8'))
-
-        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
-        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
-
  
-    def _get_section(self, rules, section):
+    @staticmethod
+    def _get_section(rules, section):
          """ Get the section named 'section' from the rules. If the section does
              not exist, raise a usage error with a meaningful message.
          """
          if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
-                      section, str(self.configfile))
+            LOG.fatal("Section '%s' not found in tokenizer config.", section)
              raise UsageError("Syntax error in tokenizer configuration file.")
  
          return rules[section]
@@ -133,7 +109,7 @@ class ICURuleLoader:
          if content is None:
              return ''
  
-        return ';'.join(_flatten_yaml_list(content)) + ';'
+        return ';'.join(_flatten_config_list(content)) + ';'
  
  
      def _parse_variant_list(self, rules):
@@ -142,7 +118,7 @@ class ICURuleLoader:
          if not rules:
              return
  
-        rules = _flatten_yaml_list(rules)
+        rules = _flatten_config_list(rules)
  
          vmaker = _VariantMaker(self.normalization_rules)
  
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index cb4112049fb7e8173b835fa1638db0f6ee3a7cc4..e9cb3d26c48cca3000c2d322c66ff4375f0af622 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -8,7 +8,6 @@ import json
  import logging
  import re
  from textwrap import dedent
-from pathlib import Path
  
  from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
@@ -49,12 +48,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        if config.TOKENIZER_CONFIG:
-            cfgfile = Path(config.TOKENIZER_CONFIG)
-        else:
-            cfgfile = config.config_dir / 'icu_tokenizer.yaml'
-
-        loader = ICURuleLoader(cfgfile)
+        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG'))
          self.naming_rules = ICUNameProcessorRules(loader=loader)
          self.term_normalization = config.TERM_NORMALIZATION
          self.max_word_frequency = config.MAX_WORD_FREQUENCY
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py

index 5ec434b6f4b349902ca743106a9199f1382979bc..b7101c3f67ef62229e5205d226e4c50b4c6ccad8 100644 (file)
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -67,13 +67,10 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
  
      def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
                       variants=('~gasse -> gasse', 'street => st', )):
-        cfgfile = tmp_path / 'analyser_test_config.yaml'
-        with cfgfile.open('w') as stream:
-            cfgstr = {'normalization' : list(norm),
-                       'transliteration' : list(trans),
-                       'variants' : [ {'words': list(variants)}]}
-            yaml.dump(cfgstr, stream)
-        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgfile))
+        cfgstr = {'normalization' : list(norm),
+                   'transliteration' : list(trans),
+                   'variants' : [ {'words': list(variants)}]}
+        tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
  
          return tok.name_analyzer()
  
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py

index cc1031164c2872b77ed7dcf0fb3600df05895376..ae05988ae42ce4a69ab9942ef8ca39305b151924 100644 (file)
--- a/test/python/test_tokenizer_icu_name_processor.py
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -4,6 +4,7 @@ Tests for import name normalisation and variant generation.
  from textwrap import dedent
  
  import pytest
+import yaml
  
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
@@ -11,7 +12,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProc
  from nominatim.errors import UsageError
  
  @pytest.fixture
-def cfgfile(tmp_path, suffix='.yaml'):
+def cfgfile():
      def _create_config(*variants, **kwargs):
          content = dedent("""\
          normalization:
@@ -29,9 +30,7 @@ def cfgfile(tmp_path, suffix='.yaml'):
          content += '\n'.join(("      - " + s for s in variants)) + '\n'
          for k, v in kwargs:
              content += "    {}: {}\n".format(k, v)
-        fpath = tmp_path / ('test_config' + suffix)
-        fpath.write_text(dedent(content))
-        return fpath
+        return yaml.safe_load(content)
  
      return _create_config
  
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py

index bb30dc6eae2133b9c3fdb42cf95dab5620657aa6..c3480de87ac08a1b251666c0b61fb31f6405cfba 100644 (file)
--- a/test/python/test_tokenizer_icu_rule_loader.py
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -1,16 +1,18 @@
  """
  Tests for converting a config file to ICU rules.
  """
-import pytest
  from textwrap import dedent
  
+import pytest
+import yaml
+
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.errors import UsageError
  
  from icu import Transliterator
  
  @pytest.fixture
-def cfgfile(tmp_path, suffix='.yaml'):
+def cfgrules():
      def _create_config(*variants, **kwargs):
          content = dedent("""\
          normalization:
@@ -27,22 +29,19 @@ def cfgfile(tmp_path, suffix='.yaml'):
          content += '\n'.join(("      - " + s for s in variants)) + '\n'
          for k, v in kwargs:
              content += "    {}: {}\n".format(k, v)
-        fpath = tmp_path / ('test_config' + suffix)
-        fpath.write_text(dedent(content))
-        return fpath
+        return yaml.safe_load(content)
  
      return _create_config
  
  
-def test_empty_rule_file(tmp_path):
-    fpath = tmp_path / ('test_config.yaml')
-    fpath.write_text(dedent("""\
+def test_empty_rule_set():
+    rule_cfg = yaml.safe_load(dedent("""\
          normalization:
          transliteration:
          variants:
          """))
  
-    rules = ICURuleLoader(fpath)
+    rules = ICURuleLoader(rule_cfg)
      assert rules.get_search_rules() == ''
      assert rules.get_normalization_rules() == ''
      assert rules.get_transliteration_rules() == ''
@@ -51,19 +50,15 @@ def test_empty_rule_file(tmp_path):
  CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
  
  @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_normalization(tmp_path, section):
-    fpath = tmp_path / ('test_config.yaml')
-    with fpath.open('w') as fd:
-        for name in CONFIG_SECTIONS:
-            if name != section:
-                fd.write(name + ':\n')
+def test_missing_section(section):
+    rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
  
      with pytest.raises(UsageError):
-        ICURuleLoader(fpath)
+        ICURuleLoader(rule_cfg)
  
  
-def test_get_search_rules(cfgfile):
-    loader = ICURuleLoader(cfgfile())
+def test_get_search_rules(cfgrules):
+    loader = ICURuleLoader(cfgrules())
  
      rules = loader.get_search_rules()
      trans = Transliterator.createFromRules("test", rules)
@@ -77,23 +72,24 @@ def test_get_search_rules(cfgfile):
      assert trans.transliterate(" проспект ") == " prospekt "
  
  
-def test_get_normalization_rules(cfgfile):
-    loader = ICURuleLoader(cfgfile())
+def test_get_normalization_rules(cfgrules):
+    loader = ICURuleLoader(cfgrules())
      rules = loader.get_normalization_rules()
      trans = Transliterator.createFromRules("test", rules)
  
      assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
  
  
-def test_get_transliteration_rules(cfgfile):
-    loader = ICURuleLoader(cfgfile())
+def test_get_transliteration_rules(cfgrules):
+    loader = ICURuleLoader(cfgrules())
      rules = loader.get_transliteration_rules()
      trans = Transliterator.createFromRules("test", rules)
  
      assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
  
  
-def test_transliteration_rules_from_file(tmp_path):
+def test_transliteration_rules_from_file(def_config, tmp_path):
+    def_config.project_dir = tmp_path
      cfgpath = tmp_path / ('test_config.yaml')
      cfgpath.write_text(dedent("""\
          normalization:
@@ -105,7 +101,7 @@ def test_transliteration_rules_from_file(tmp_path):
      transpath = tmp_path / ('transliteration.yaml')
      transpath.write_text('- "x > y"')
  
-    loader = ICURuleLoader(cfgpath)
+    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
      rules = loader.get_transliteration_rules()
      trans = Transliterator.createFromRules("test", rules)
  
@@ -115,11 +111,11 @@ def test_transliteration_rules_from_file(tmp_path):
  class TestGetReplacements:
  
      @pytest.fixture(autouse=True)
-    def setup_cfg(self, cfgfile):
-        self.cfgfile = cfgfile
+    def setup_cfg(self, cfgrules):
+        self.cfgrules = cfgrules
  
      def get_replacements(self, *variants):
-        loader = ICURuleLoader(self.cfgfile(*variants))
+        loader = ICURuleLoader(self.cfgrules(*variants))
          rules = loader.get_replacement_pairs()
  
          return set((v.source, v.replacement) for v in rules)
@@ -129,7 +125,7 @@ class TestGetReplacements:
                                           '~foo~ -> bar', 'fo~ o -> bar'])
      def test_invalid_variant_description(self, variant):
          with pytest.raises(UsageError):
-            ICURuleLoader(self.cfgfile(variant))
+            ICURuleLoader(self.cfgrules(variant))
  
      def test_add_full(self):
          repl = self.get_replacements("foo -> bar")
author	Sarah Hoffmann <lonvia@denofr.de>
	Fri, 3 Sep 2021 16:16:12 +0000 (18:16 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Fri, 3 Sep 2021 16:20:07 +0000 (18:20 +0200)
nominatim/config.py		patch \| blob \| history
nominatim/tokenizer/icu_rule_loader.py		patch \| blob \| history
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
test/python/test_tokenizer_icu.py		patch \| blob \| history
test/python/test_tokenizer_icu_name_processor.py		patch \| blob \| history
test/python/test_tokenizer_icu_rule_loader.py		patch \| blob \| history