add support for external sanitizer modules

[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index b8551038aa42283dfccfc6bedaf1c0b89c0ba68b..cf9fdb88ac16fa96ea2e6a5cda2a4e1afb43d5f5 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -1,17 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
  Helper class to create ICU rules from a configuration file.
  """
  """
  Helper class to create ICU rules from a configuration file.
  """
+from typing import Mapping, Any, Dict, Optional
  import importlib
  import io
  import json
  import logging
  
  import importlib
  import io
  import json
  import logging
  
-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
  from nominatim.db.properties import set_property, get_property
  from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
  from nominatim.errors import UsageError
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.errors import UsageError
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
-import nominatim.tools.country_info
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser
+import nominatim.data.country_info
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
@@ -20,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
  DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  
  
  DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  
  
-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
      """ Get the section named 'section' from the rules. If the section does
          not exist, raise a usage error with a meaningful message.
      """
      """ Get the section named 'section' from the rules. If the section does
          not exist, raise a usage error with a meaningful message.
      """
@@ -35,12 +44,13 @@ class ICURuleLoader:
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
+        self.config = config
          rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                config='TOKENIZER_CONFIG')
  
          rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                config='TOKENIZER_CONFIG')
  
-        # Make sure country information is available to analyzers and sanatizers.
-        nominatim.tools.country_info.setup_country_config(config)
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.data.country_info.setup_country_config(config)
  
          self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
          self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
  
          self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
          self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
@@ -51,17 +61,27 @@ class ICURuleLoader:
          self.sanitizer_rules = rules.get('sanitizers', [])
  
  
          self.sanitizer_rules = rules.get('sanitizers', [])
  
  
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
          """ Get previously saved parts of the configuration from the
              database.
          """
          """ Get previously saved parts of the configuration from the
              database.
          """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
          self._setup_analysis()
  
  
          self._setup_analysis()
  
  
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
          """ Save the part of the configuration that cannot be changed into
              the database.
          """
          """ Save the part of the configuration that cannot be changed into
              the database.
          """
@@ -70,20 +90,20 @@ class ICURuleLoader:
          set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  
  
          set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  
  
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
          """ Create a place sanitizer from the configured rules.
          """
          """ Create a place sanitizer from the configured rules.
          """
-        return PlaceSanitizer(self.sanitizer_rules)
+        return PlaceSanitizer(self.sanitizer_rules, self.config)
  
  
  
  
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
          """ Create a token analyser from the reviouly loaded rules.
          """
          return ICUTokenAnalysis(self.normalization_rules,
                                  self.transliteration_rules, self.analysis)
  
  
          """ Create a token analyser from the reviouly loaded rules.
          """
          return ICUTokenAnalysis(self.normalization_rules,
                                  self.transliteration_rules, self.analysis)
  
  
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
          """ Return the ICU rules to be used during search.
              The rules combine normalization and transliteration.
          """
          """ Return the ICU rules to be used during search.
              The rules combine normalization and transliteration.
          """
@@ -96,22 +116,22 @@ class ICURuleLoader:
          return rules.getvalue()
  
  
          return rules.getvalue()
  
  
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
          """ Return rules for normalisation of a term.
          """
          return self.normalization_rules
  
  
          """ Return rules for normalisation of a term.
          """
          return self.normalization_rules
  
  
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
          """ Return the rules for converting a string into its asciii representation.
          """
          return self.transliteration_rules
  
  
          """ Return the rules for converting a string into its asciii representation.
          """
          return self.transliteration_rules
  
  
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
          """ Process the rules used for creating the various token analyzers.
          """
          """ Process the rules used for creating the various token analyzers.
          """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule]  = {}
  
          if not isinstance(self.analysis_rules, list):
              raise UsageError("Configuration section 'token-analysis' must be a list.")
  
          if not isinstance(self.analysis_rules, list):
              raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -129,7 +149,7 @@ class ICURuleLoader:
  
  
      @staticmethod
  
  
      @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
          """ Load an ICU ruleset from the given section. If the section is a
              simple string, it is interpreted as a file name and the rules are
              loaded verbatim from the given file. The filename is expected to be
          """ Load an ICU ruleset from the given section. If the section is a
              simple string, it is interpreted as a file name and the rules are
              loaded verbatim from the given file. The filename is expected to be
@@ -149,12 +169,16 @@ class TokenAnalyzerRule:
          and creates a new token analyzer on request.
      """
  
          and creates a new token analyzer on request.
      """
  
-    def __init__(self, rules, normalization_rules):
+    def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
          # Find the analysis module
          module_name = 'nominatim.tokenizer.token_analysis.' \
                        + _get_section(rules, 'analyzer').replace('-', '_')
          # Find the analysis module
          module_name = 'nominatim.tokenizer.token_analysis.' \
                        + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self.create = analysis_mod.create
+        self._analysis_mod: AnalysisModule = importlib.import_module(module_name)
  
          # Load the configuration.
  
          # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalization_rules)
+
+    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+        """ Create a new analyser instance for the given rule.
+        """
+        return self._analysis_mod.create(normalizer, transliterator, self.config)