Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / base.py
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py

index e126507bde1c07be7653072e05349d0a7959fc55..061cff36b99f22273e55e350d410d4291c425b91 100644 (file)
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -1,14 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
  """
  """
-Abstract class defintions for tokenizers. These base classes are here
+Abstract class definitions for tokenizers. These base classes are here
  mainly for documentation purposes.
  """
  from abc import ABC, abstractmethod
  mainly for documentation purposes.
  """
  from abc import ABC, abstractmethod
-from typing import List, Tuple, Dict, Any
+from typing import List, Tuple, Dict, Any, Optional, Iterable
+from pathlib import Path
  
  from nominatim.config import Configuration
  
  from nominatim.config import Configuration
-from nominatim.indexer.place_info import PlaceInfo
-
-# pylint: disable=unnecessary-pass
+from nominatim.db.connection import Connection
+from nominatim.data.place_info import PlaceInfo
+from nominatim.typing import Protocol
  
  class AbstractAnalyzer(ABC):
      """ The analyzer provides the functions for analysing names and building
  
  class AbstractAnalyzer(ABC):
      """ The analyzer provides the functions for analysing names and building
@@ -22,7 +29,7 @@ class AbstractAnalyzer(ABC):
          return self
  
  
          return self
  
  
-    def __exit__(self, exc_type, exc_value, traceback) -> None:
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
          self.close()
  
  
          self.close()
  
  
@@ -30,7 +37,6 @@ class AbstractAnalyzer(ABC):
      def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
      def close(self) -> None:
          """ Free all resources used by the analyzer.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -47,10 +53,9 @@ class AbstractAnalyzer(ABC):
  
              Returns:
                  The function returns the list of all tuples that could be
  
              Returns:
                  The function returns the list of all tuples that could be
-                found for the given words. Each list entry is a tuple of
-                (original word, word token, word id).
+                    found for the given words. Each list entry is a tuple of
+                    (original word, word token, word id).
          """
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -66,7 +71,6 @@ class AbstractAnalyzer(ABC):
              Returns:
                  The given postcode after normalization.
          """
              Returns:
                  The given postcode after normalization.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -74,11 +78,11 @@ class AbstractAnalyzer(ABC):
          """ Update the tokenizer's postcode tokens from the current content
              of the `location_postcode` table.
          """
          """ Update the tokenizer's postcode tokens from the current content
              of the `location_postcode` table.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
-    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
+    def update_special_phrases(self,
+                               phrases: Iterable[Tuple[str, str, str, str]],
                                 should_replace: bool) -> None:
          """ Update the tokenizer's special phrase tokens from the given
              list of special phrases.
                                 should_replace: bool) -> None:
          """ Update the tokenizer's special phrase tokens from the given
              list of special phrases.
@@ -90,11 +94,10 @@ class AbstractAnalyzer(ABC):
                                  When false, just add the given phrases to the
                                  ones that already exist.
          """
                                  When false, just add the given phrases to the
                                  ones that already exist.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
-    def add_country_names(self, country_code: str, names: Dict[str, str]):
+    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
          """ Add the given names to the tokenizer's list of country tokens.
  
              Arguments:
          """ Add the given names to the tokenizer's list of country tokens.
  
              Arguments:
@@ -102,7 +105,6 @@ class AbstractAnalyzer(ABC):
                                refer to.
                  names: Dictionary of name type to name.
          """
                                refer to.
                  names: Dictionary of name type to name.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -112,11 +114,11 @@ class AbstractAnalyzer(ABC):
              the search index.
  
              Arguments:
              the search index.
  
              Arguments:
-                place: Place information retrived from the database.
+                place: Place information retrieved from the database.
  
              Returns:
                  A JSON-serialisable structure that will be handed into
  
              Returns:
                  A JSON-serialisable structure that will be handed into
-                the database via the `token_info` field.
+                    the database via the `token_info` field.
          """
  
  
          """
  
  
@@ -140,22 +142,21 @@ class AbstractTokenizer(ABC):
  
                init_db: When set to False, then initialisation of database
                  tables should be skipped. This option is only required for
  
                init_db: When set to False, then initialisation of database
                  tables should be skipped. This option is only required for
-                migration purposes and can be savely ignored by custom
+                migration purposes and can be safely ignored by custom
                  tokenizers.
                  tokenizers.
-
-            TODO: can we move the init_db parameter somewhere else?
          """
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
          """ Initialise the tokenizer from an existing database setup.
  
              The function should load all previously saved configuration from
              the project directory and/or the property table.
          """ Initialise the tokenizer from an existing database setup.
  
              The function should load all previously saved configuration from
              the project directory and/or the property table.
+
+            Arguments:
+              config: Read-only object with configuration options.
          """
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -168,7 +169,6 @@ class AbstractTokenizer(ABC):
              Arguments:
                config: Read-only object with configuration options.
          """
              Arguments:
                config: Read-only object with configuration options.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -183,22 +183,37 @@ class AbstractTokenizer(ABC):
              Arguments:
                config: Read-only object with configuration options.
          """
              Arguments:
                config: Read-only object with configuration options.
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> Optional[str]:
          """ Check that the database is set up correctly and ready for being
              queried.
  
          """ Check that the database is set up correctly and ready for being
              queried.
  
+            Arguments:
+              config: Read-only object with configuration options.
+
              Returns:
                If an issue was found, return an error message with the
              Returns:
                If an issue was found, return an error message with the
-              description of the issue as well as hints for the user on
-              how to resolve the issue.
+                  description of the issue as well as hints for the user on
+                  how to resolve the issue. If everything is okay, return `None`.
+        """
+
+
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it being called in order to work.
+        """
  
  
-              Return `None`, if no issue was found.
+
+    @abstractmethod
+    def update_word_tokens(self) -> None:
+        """ Do house-keeping on the tokenizer's internal data structures.
+            Remove unused word tokens, re-sort data, etc.
          """
          """
-        pass
  
  
      @abstractmethod
  
  
      @abstractmethod
@@ -215,4 +230,24 @@ class AbstractTokenizer(ABC):
              When used outside the with construct, the caller must ensure to
              call the close() function before destructing the analyzer.
          """
              When used outside the with construct, the caller must ensure to
              call the close() function before destructing the analyzer.
          """
-        pass
+
+
+    @abstractmethod
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the most frequent full words in the database.
+
+            Arguments:
+              conn: Open connection to the database which may be used to
+                    retrieve the words.
+              num: Maximum number of words to return.
+        """
+
+
+class TokenizerModule(Protocol):
+    """ Interface that must be exported by modules that implement their
+        own tokenizer.
+    """
+
+    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+        """ Factory for new tokenizers.
+        """