]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_rule_loader.py
filter duplicate results after DB query
[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
index cb38cfdfb0da9e4ca3f79f09ab7d6812df4d5778..4c36282ca54bfbd3526d24ead471a3e9fe9dbc33 100644 (file)
@@ -1,15 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Helper class to create ICU rules from a configuration file.
 """
 """
 Helper class to create ICU rules from a configuration file.
 """
-import importlib
+from typing import Mapping, Any, Dict, Optional
 import io
 import json
 import logging
 
 import io
 import json
 import logging
 
-from nominatim.config import flatten_config_list
+from icu import Transliterator
+
+from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyzer
+import nominatim.data.country_info
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
@@ -18,7 +30,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
     """ Get the section named 'section' from the rules. If the section does
         not exist, raise a usage error with a meaningful message.
     """
     """ Get the section named 'section' from the rules. If the section does
         not exist, raise a usage error with a meaningful message.
     """
@@ -33,10 +45,14 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
+        self.config = config
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.data.country_info.setup_country_config(config)
+
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
         self.analysis_rules = _get_section(rules, 'token-analysis')
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
         self.analysis_rules = _get_section(rules, 'token-analysis')
@@ -46,17 +62,27 @@ class ICURuleLoader:
         self.sanitizer_rules = rules.get('sanitizers', [])
 
 
         self.sanitizer_rules = rules.get('sanitizers', [])
 
 
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
         """ Get previously saved parts of the configuration from the
             database.
         """
         """ Get previously saved parts of the configuration from the
             database.
         """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
         self._setup_analysis()
 
 
         self._setup_analysis()
 
 
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
@@ -65,20 +91,20 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
         """ Create a place sanitizer from the configured rules.
         """
-        return PlaceSanitizer(self.sanitizer_rules)
+        return PlaceSanitizer(self.sanitizer_rules, self.config)
 
 
 
 
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
         """ Create a token analyser from the reviouly loaded rules.
         """
         """ Create a token analyser from the reviouly loaded rules.
         """
-        return self.analysis[None].create(self.normalization_rules,
-                                          self.transliteration_rules)
+        return ICUTokenAnalysis(self.normalization_rules,
+                                self.transliteration_rules, self.analysis)
 
 
 
 
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
@@ -91,26 +117,31 @@ class ICURuleLoader:
         return rules.getvalue()
 
 
         return rules.getvalue()
 
 
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
 
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules
 
 
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
         """ Process the rules used for creating the various token analyzers.
         """
         """ Process the rules used for creating the various token analyzers.
         """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule]  = {}
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+        trans = Transliterator.createFromRules("rule_loader_transliteration",
+                                              self.transliteration_rules)
+
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
@@ -119,12 +150,13 @@ class ICURuleLoader:
                 else:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
                 else:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
-                UsageError("Syntax error in ICU tokenizer config.")
-            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
+                                                    self.config)
 
 
     @staticmethod
 
 
     @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
@@ -144,20 +176,21 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """
 
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules, normalization_rules):
-        # Find the analysis module
-        module_name = 'nominatim.tokenizer.token_analysis.' \
-                      + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self._mod_create = analysis_mod.create
+    def __init__(self, rules: Mapping[str, Any],
+                 normalizer: Any, transliterator: Any,
+                 config: Configuration) -> None:
+        analyzer_name = _get_section(rules, 'analyzer')
+        if not analyzer_name or not isinstance(analyzer_name, str):
+            raise UsageError("'analyzer' parameter needs to be simple string")
+
+        self._analysis_mod: AnalysisModule = \
+            config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
 
 
-        # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalizer,
+                                                   transliterator)
 
 
 
 
-    def create(self, normalization_rules, transliteration_rules):
-        """ Create an analyzer from the given rules.
+    def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
+        """ Create a new analyser instance for the given rule.
         """
         """
-        return self._mod_create(normalization_rules,
-                                transliteration_rules,
-                                self.config)
+        return self._analysis_mod.create(normalizer, transliterator, self.config)