]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_rule_loader.py
reintroduce cutoffs when searching for very frequent words
[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
index cf72520953456e9318576f51d9fc7acc280d668e..4c36282ca54bfbd3526d24ead471a3e9fe9dbc33 100644 (file)
@@ -1,20 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Helper class to create ICU rules from a configuration file.
 """
 """
 Helper class to create ICU rules from a configuration file.
 """
+from typing import Mapping, Any, Dict, Optional
 import io
 import json
 import logging
 import io
 import json
 import logging
-import itertools
-import re
 
 from icu import Transliterator
 
 
 from icu import Transliterator
 
-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.errors import UsageError
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-import nominatim.tokenizer.icu_variants as variants
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyzer
+import nominatim.data.country_info
 
 LOG = logging.getLogger()
 
 
 LOG = logging.getLogger()
 
@@ -23,46 +30,59 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
-class VariantRule:
-    """ Saves a single variant expansion.
-
-        An expansion consists of the normalized replacement term and
-        a dicitonary of properties that describe when the expansion applies.
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
     """
     """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
 
 
-    def __init__(self, replacement, properties):
-        self.replacement = replacement
-        self.properties = properties or {}
+    return rules[section]
 
 
 class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
 
 
 class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
+        self.config = config
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.data.country_info.setup_country_config(config)
+
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self.analysis_rules = _get_section(rules, 'token-analysis')
         self._setup_analysis()
 
         # Load optional sanitizer rule set.
         self.sanitizer_rules = rules.get('sanitizers', [])
 
 
         self._setup_analysis()
 
         # Load optional sanitizer rule set.
         self.sanitizer_rules = rules.get('sanitizers', [])
 
 
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
         """ Get previously saved parts of the configuration from the
             database.
         """
         """ Get previously saved parts of the configuration from the
             database.
         """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
         self._setup_analysis()
 
 
         self._setup_analysis()
 
 
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
@@ -71,20 +91,20 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
 
 
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
         """ Create a place sanitizer from the configured rules.
         """
-        return PlaceSanitizer(self.sanitizer_rules)
+        return PlaceSanitizer(self.sanitizer_rules, self.config)
 
 
 
 
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
         """ Create a token analyser from the previously loaded rules.
         """
         """ Create a token analyser from the previously loaded rules.
         """
-        return self.analysis[None].create(self.normalization_rules,
-                                          self.transliteration_rules)
+        return ICUTokenAnalysis(self.normalization_rules,
+                                self.transliteration_rules, self.analysis)
 
 
 
 
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
@@ -97,26 +117,31 @@ class ICURuleLoader:
         return rules.getvalue()
 
 
         return rules.getvalue()
 
 
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
         """ Return the rules for converting a string into its ASCII representation.
         """
         return self.transliteration_rules
 
 
         """ Return the rules for converting a string into its ASCII representation.
         """
         return self.transliteration_rules
 
 
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
         """ Process the rules used for creating the various token analyzers.
         """
         """ Process the rules used for creating the various token analyzers.
         """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule]  = {}
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
 
         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
 
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+        trans = Transliterator.createFromRules("rule_loader_transliteration",
+                                              self.transliteration_rules)
+
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
         for section in self.analysis_rules:
             name = section.get('id', None)
             if name in self.analysis:
@@ -125,30 +150,20 @@ class ICURuleLoader:
                 else:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
                 else:
                     LOG.fatal("ICU tokenizer configuration has two token "
                               "analyzers with id '%s'.", name)
-                UsageError("Syntax error in ICU tokenizer config.")
-            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
+                                                    self.config)
 
 
     @staticmethod
 
 
     @staticmethod
-    def _get_section(rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
-        """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config.", section)
-            raise UsageError("Syntax error in tokenizer configuration file.")
-
-        return rules[section]
-
-
-    def _cfg_to_icu_rules(self, rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)
 
         if content is None:
             return ''
 
         if content is None:
             return ''
@@ -161,123 +176,21 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """
 
         and creates a new token analyzer on request.
     """
 
-    def __init__(self, rules, normalization_rules):
-        self._parse_variant_list(rules.get('variants'), normalization_rules)
-
-
-    def create(self, normalization_rules, transliteration_rules):
-        """ Create an analyzer from the given rules.
-        """
-        return ICUNameProcessor(normalization_rules,
-                                transliteration_rules,
-                                self.variants)
-
-
-    def _parse_variant_list(self, rules, normalization_rules):
-        self.variants = set()
-
-        if not rules:
-            return
-
-        rules = flatten_config_list(rules, 'variants')
-
-        vmaker = _VariantMaker(normalization_rules)
+    def __init__(self, rules: Mapping[str, Any],
+                 normalizer: Any, transliterator: Any,
+                 config: Configuration) -> None:
+        analyzer_name = _get_section(rules, 'analyzer')
+        if not analyzer_name or not isinstance(analyzer_name, str):
+            raise UsageError("'analyzer' parameter needs to be simple string")
 
 
-        properties = []
-        for section in rules:
-            # Create the property field and deduplicate against existing
-            # instances.
-            props = variants.ICUVariantProperties.from_rules(section)
-            for existing in properties:
-                if existing == props:
-                    props = existing
-                    break
-            else:
-                properties.append(props)
-
-            for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
-
-
-class _VariantMaker:
-    """ Generater for all necessary ICUVariants from a single variant rule.
-
-        All text in rules is normalized to make sure the variants match later.
-    """
+        self._analysis_mod: AnalysisModule = \
+            config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
 
 
-    def __init__(self, norm_rules):
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
+        self.config = self._analysis_mod.configure(rules, normalizer,
+                                                   transliterator)
 
 
 
 
-    def compute(self, rule, props):
-        """ Generator for all ICUVariant tuples from a single variant rule.
+    def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
+        """ Create a new analyser instance for the given rule.
         """
         """
-        parts = re.split(r'(\|)?([=-])>', rule)
-        if len(parts) != 4:
-            raise UsageError("Syntax error in variant rule: " + rule)
-
-        decompose = parts[1] is None
-        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
-        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
-
-        # If the source should be kept, add a 1:1 replacement
-        if parts[2] == '-':
-            for src in src_terms:
-                if src:
-                    for froms, tos in _create_variants(*src, src[0], decompose):
-                        yield variants.ICUVariant(froms, tos, props)
-
-        for src, repl in itertools.product(src_terms, repl_terms):
-            if src and repl:
-                for froms, tos in _create_variants(*src, repl, decompose):
-                    yield variants.ICUVariant(froms, tos, props)
-
-
-    def _parse_variant_word(self, name):
-        name = name.strip()
-        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
-        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
-            raise UsageError("Invalid variant word descriptor '{}'".format(name))
-        norm_name = self.norm.transliterate(match.group(2))
-        if not norm_name:
-            return None
-
-        return norm_name, match.group(1), match.group(3)
-
-
-_FLAG_MATCH = {'^': '^ ',
-               '$': ' ^',
-               '': ' '}
-
-
-def _create_variants(src, preflag, postflag, repl, decompose):
-    if preflag == '~':
-        postfix = _FLAG_MATCH[postflag]
-        # suffix decomposition
-        src = src + postfix
-        repl = repl + postfix
-
-        yield src, repl
-        yield ' ' + src, ' ' + repl
-
-        if decompose:
-            yield src, ' ' + repl
-            yield ' ' + src, repl
-    elif postflag == '~':
-        # prefix decomposition
-        prefix = _FLAG_MATCH[preflag]
-        src = prefix + src
-        repl = prefix + repl
-
-        yield src, repl
-        yield src + ' ', repl + ' '
-
-        if decompose:
-            yield src, repl + ' '
-            yield src + ' ', repl
-    else:
-        prefix = _FLAG_MATCH[preflag]
-        postfix = _FLAG_MATCH[postflag]
-
-        yield prefix + src + postfix, prefix + repl + postfix
+        return self._analysis_mod.create(normalizer, transliterator, self.config)