add type annotations to ICU tokenizer helper modules

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 13 Jul 2022 20:55:40 +0000 (22:55 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 13 Jul 2022 20:55:40 +0000 (22:55 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
diff --git a/nominatim/db/properties.py b/nominatim/db/properties.py

index 9dac2053ba6d6dbd2c92589f11f9948a798baf52..3624c950e4a158c6b4f7d8b6ab7d8f9cfbd6911c 100644 (file)
--- a/nominatim/db/properties.py
+++ b/nominatim/db/properties.py
@@ -28,7 +28,7 @@ def set_property(conn: Connection, name: str, value: str) -> None:
  
  
  def get_property(conn: Connection, name: str) -> Optional[str]:
  
  
  def get_property(conn: Connection, name: str) -> Optional[str]:
-    """ Return the current value of the given propery or None if the property
+    """ Return the current value of the given property or None if the property
          is not set.
      """
      if not conn.table_exists('nominatim_properties'):
          is not set.
      """
      if not conn.table_exists('nominatim_properties'):
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py

index 5a3d3b1276aa7c6bf3bb36787967d519fe70aa79..6484ff6a06a798b206c9c8e54bf947cf2da4b1d4 100644 (file)
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -10,12 +10,13 @@ mainly for documentation purposes.
  """
  from abc import ABC, abstractmethod
  from typing import List, Tuple, Dict, Any
  """
  from abc import ABC, abstractmethod
  from typing import List, Tuple, Dict, Any
+from pathlib import Path
+
+from typing_extensions import Protocol
  
  from nominatim.config import Configuration
  from nominatim.data.place_info import PlaceInfo
  
  
  from nominatim.config import Configuration
  from nominatim.data.place_info import PlaceInfo
  
-# pylint: disable=unnecessary-pass
-
  class AbstractAnalyzer(ABC):
      """ The analyzer provides the functions for analysing names and building
          the token database.
  class AbstractAnalyzer(ABC):
      """ The analyzer provides the functions for analysing names and building
          the token database.
@@ -230,3 +231,13 @@ class AbstractTokenizer(ABC):
              When used outside the with construct, the caller must ensure to
              call the close() function before destructing the analyzer.
          """
              When used outside the with construct, the caller must ensure to
              call the close() function before destructing the analyzer.
          """
+
+
+class TokenizerModule(Protocol):
+    """ Interface that must be exported by modules that implement their
+        own tokenizer.
+    """
+
+    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+        """ Factory for new tokenizers.
+        """
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py

index 108c7841e0c7c3e4f8bf6bd25b3aa8d9c35bba42..67e221949911b19dcd10943d5833f5430194da1d 100644 (file)
--- a/nominatim/tokenizer/factory.py
+++ b/nominatim/tokenizer/factory.py
@@ -19,17 +19,20 @@ database.
  A tokenizer usually also includes PHP code for querying. The appropriate PHP
  normalizer module is installed, when the tokenizer is created.
  """
  A tokenizer usually also includes PHP code for querying. The appropriate PHP
  normalizer module is installed, when the tokenizer is created.
  """
+from typing import Optional
  import logging
  import importlib
  from pathlib import Path
  
  import logging
  import importlib
  from pathlib import Path
  
-from ..errors import UsageError
-from ..db import properties
-from ..db.connection import connect
+from nominatim.errors import UsageError
+from nominatim.db import properties
+from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
-def _import_tokenizer(name):
+def _import_tokenizer(name: str) -> TokenizerModule:
      """ Load the tokenizer.py module from project directory.
      """
      src_file = Path(__file__).parent / (name + '_tokenizer.py')
      """ Load the tokenizer.py module from project directory.
      """
      src_file = Path(__file__).parent / (name + '_tokenizer.py')
@@ -41,7 +44,8 @@ def _import_tokenizer(name):
      return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
  
  
      return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
  
  
-def create_tokenizer(config, init_db=True, module_name=None):
+def create_tokenizer(config: Configuration, init_db: bool = True,
+                     module_name: Optional[str] = None) -> AbstractTokenizer:
      """ Create a new tokenizer as defined by the given configuration.
  
          The tokenizer data and code is copied into the 'tokenizer' directory
      """ Create a new tokenizer as defined by the given configuration.
  
          The tokenizer data and code is copied into the 'tokenizer' directory
@@ -70,7 +74,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
      return tokenizer
  
  
      return tokenizer
  
  
-def get_tokenizer_for_db(config):
+def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
      """ Instantiate a tokenizer for an existing database.
  
          The function looks up the appropriate tokenizer in the database
      """ Instantiate a tokenizer for an existing database.
  
          The function looks up the appropriate tokenizer in the database
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index 035b6698511035fbaf6dfed1f137aeb8e911cce1..7199f5f5535aab5eb4429e862512e34e15310c29 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -7,16 +7,19 @@
  """
  Helper class to create ICU rules from a configuration file.
  """
  """
  Helper class to create ICU rules from a configuration file.
  """
+from typing import Mapping, Any, Generic, Dict, Optional
  import importlib
  import io
  import json
  import logging
  
  import importlib
  import io
  import json
  import logging
  
-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
  from nominatim.db.properties import set_property, get_property
  from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
  from nominatim.errors import UsageError
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  from nominatim.errors import UsageError
  from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser, T_config
  import nominatim.data.country_info
  
  LOG = logging.getLogger()
  import nominatim.data.country_info
  
  LOG = logging.getLogger()
@@ -26,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
  DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  
  
  DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
  
  
-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
      """ Get the section named 'section' from the rules. If the section does
          not exist, raise a usage error with a meaningful message.
      """
      """ Get the section named 'section' from the rules. If the section does
          not exist, raise a usage error with a meaningful message.
      """
@@ -41,7 +44,7 @@ class ICURuleLoader:
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
          rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                config='TOKENIZER_CONFIG')
  
          rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                                config='TOKENIZER_CONFIG')
  
@@ -57,17 +60,27 @@ class ICURuleLoader:
          self.sanitizer_rules = rules.get('sanitizers', [])
  
  
          self.sanitizer_rules = rules.get('sanitizers', [])
  
  
-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
          """ Get previously saved parts of the configuration from the
              database.
          """
          """ Get previously saved parts of the configuration from the
              database.
          """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
          self._setup_analysis()
  
  
          self._setup_analysis()
  
  
-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
          """ Save the part of the configuration that cannot be changed into
              the database.
          """
          """ Save the part of the configuration that cannot be changed into
              the database.
          """
@@ -76,20 +89,20 @@ class ICURuleLoader:
          set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  
  
          set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
  
  
-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
          """ Create a place sanitizer from the configured rules.
          """
          return PlaceSanitizer(self.sanitizer_rules)
  
  
          """ Create a place sanitizer from the configured rules.
          """
          return PlaceSanitizer(self.sanitizer_rules)
  
  
-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
          """ Create a token analyser from the previously loaded rules.
          """
          return ICUTokenAnalysis(self.normalization_rules,
                                  self.transliteration_rules, self.analysis)
  
  
          """ Create a token analyser from the previously loaded rules.
          """
          return ICUTokenAnalysis(self.normalization_rules,
                                  self.transliteration_rules, self.analysis)
  
  
-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
          """ Return the ICU rules to be used during search.
              The rules combine normalization and transliteration.
          """
          """ Return the ICU rules to be used during search.
              The rules combine normalization and transliteration.
          """
@@ -102,22 +115,22 @@ class ICURuleLoader:
          return rules.getvalue()
  
  
          return rules.getvalue()
  
  
-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
          """ Return rules for normalisation of a term.
          """
          return self.normalization_rules
  
  
          """ Return rules for normalisation of a term.
          """
          return self.normalization_rules
  
  
-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
          """ Return the rules for converting a string into its ascii representation.
          """
          return self.transliteration_rules
  
  
          """ Return the rules for converting a string into its ascii representation.
          """
          return self.transliteration_rules
  
  
-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
          """ Process the rules used for creating the various token analyzers.
          """
          """ Process the rules used for creating the various token analyzers.
          """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule[Any]]  = {}
  
          if not isinstance(self.analysis_rules, list):
              raise UsageError("Configuration section 'token-analysis' must be a list.")
  
          if not isinstance(self.analysis_rules, list):
              raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -135,7 +148,7 @@ class ICURuleLoader:
  
  
      @staticmethod
  
  
      @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
          """ Load an ICU ruleset from the given section. If the section is a
              simple string, it is interpreted as a file name and the rules are
              loaded verbatim from the given file. The filename is expected to be
          """ Load an ICU ruleset from the given section. If the section is a
              simple string, it is interpreted as a file name and the rules are
              loaded verbatim from the given file. The filename is expected to be
@@ -150,17 +163,21 @@ class ICURuleLoader:
          return ';'.join(flatten_config_list(content, section)) + ';'
  
  
          return ';'.join(flatten_config_list(content, section)) + ';'
  
  
-class TokenAnalyzerRule:
+class TokenAnalyzerRule(Generic[T_config]):
      """ Factory for a single analysis module. The class saves the configuration
          and creates a new token analyzer on request.
      """
  
      """ Factory for a single analysis module. The class saves the configuration
          and creates a new token analyzer on request.
      """
  
-    def __init__(self, rules, normalization_rules):
+    def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
          # Find the analysis module
          module_name = 'nominatim.tokenizer.token_analysis.' \
                        + _get_section(rules, 'analyzer').replace('-', '_')
          # Find the analysis module
          module_name = 'nominatim.tokenizer.token_analysis.' \
                        + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self.create = analysis_mod.create
+        self._analysis_mod: AnalysisModule[T_config] = importlib.import_module(module_name)
  
          # Load the configuration.
  
          # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalization_rules)
+
+    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+        """ Create a new analyser instance for the given rule.
+        """
+        return self._analysis_mod.create(normalizer, transliterator, self.config)
diff --git a/nominatim/tokenizer/icu_token_analysis.py b/nominatim/tokenizer/icu_token_analysis.py

index 68fc82e333b6a44de6eb9d42ed06a2d4ae17da58..ed7aea23be4b745926688c517aaad6b77a7c3a9e 100644 (file)
--- a/nominatim/tokenizer/icu_token_analysis.py
+++ b/nominatim/tokenizer/icu_token_analysis.py
@@ -8,15 +8,22 @@
  Container class collecting all components required to transform an OSM name
  into a Nominatim token.
  """
  Container class collecting all components required to transform an OSM name
  into a Nominatim token.
  """
-
+from typing import Mapping, Optional, TYPE_CHECKING
  from icu import Transliterator
  
  from icu import Transliterator
  
+from nominatim.tokenizer.token_analysis.base import Analyser
+
+if TYPE_CHECKING:
+    from typing import Any
+    from nominatim.tokenizer.icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
+
  class ICUTokenAnalysis:
      """ Container class collecting the transliterators and token analysis
          modules for a single NameAnalyser instance.
      """
  
  class ICUTokenAnalysis:
      """ Container class collecting the transliterators and token analysis
          modules for a single NameAnalyser instance.
      """
  
-    def __init__(self, norm_rules, trans_rules, analysis_rules):
+    def __init__(self, norm_rules: str, trans_rules: str,
+                 analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule[Any]']):
          self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                           norm_rules)
          trans_rules += ";[:Space:]+ > ' '"
          self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                           norm_rules)
          trans_rules += ";[:Space:]+ > ' '"
@@ -25,11 +32,11 @@ class ICUTokenAnalysis:
          self.search = Transliterator.createFromRules("icu_search",
                                                       norm_rules + trans_rules)
  
          self.search = Transliterator.createFromRules("icu_search",
                                                       norm_rules + trans_rules)
  
-        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                           for name, arules in analysis_rules.items()}
  
  
                           for name, arules in analysis_rules.items()}
  
  
-    def get_analyzer(self, name):
+    def get_analyzer(self, name: str) -> Analyser:
          """ Return the given named analyzer. If no analyzer with that
              name exists, return the default analyzer.
          """
          """ Return the given named analyzer. If no analyzer with that
              name exists, return the default analyzer.
          """
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 13 Jul 2022 20:55:40 +0000 (22:55 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 18 Jul 2022 07:47:57 +0000 (09:47 +0200)
nominatim/db/properties.py		patch \| blob \| history
nominatim/tokenizer/base.py		patch \| blob \| history
nominatim/tokenizer/factory.py		patch \| blob \| history
nominatim/tokenizer/icu_rule_loader.py		patch \| blob \| history
nominatim/tokenizer/icu_token_analysis.py		patch \| blob \| history