define formal public Python interface for tokenizer

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 10 Aug 2021 12:51:35 +0000 (14:51 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Mon, 16 Aug 2021 09:41:54 +0000 (11:41 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 10 Aug 2021 12:51:35 +0000 (14:51 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 16 Aug 2021 09:41:54 +0000 (11:41 +0200)
diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md

index e10587a627480afc71f95d5a1601e1a2841df5e2..b860ed36dc41142423d34160bc90744e4f8c7c0b 100644 (file)
--- a/docs/develop/Tokenizers.md
+++ b/docs/develop/Tokenizers.md
@@ -73,3 +73,67 @@ the saved tokens in the database. It then returns the list of possibly matching
  tokens and the list of possible splits to the query parser. The parser uses
  this information to compute all possible interpretations of the query and
  rank them accordingly.
+
+## Tokenizer API
+
+The following section describes the functions that need to be implemented
+for a custom tokenizer implementation.
+
+!!! warning
+    This API is currently in early alpha status. While this API is meant to
+    be a public API on which other tokenizers may be implemented, the API is
+    far away from being stable at the moment.
+
+### Directory Structure
+
+Nominatim expects two files for a tokenizer:
+
+* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
+  implementation
+* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
+  implementation
+
+where `<NAME>` is a unique name for the tokenizer consisting of only lower-case
+letters, digits and underscore. A tokenizer also needs to install some SQL
+functions. By convention, these should be placed in `lib-sql/tokenizer`.
+
+If the tokenizer has a default configuration file, this should be saved in
+`settings/<NAME>_tokenizer.<SUFFIX>`.
+
+### Configuration and Persistence
+
+Tokenizers may define custom settings for their configuration. All settings
+must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
+persistent. Transient settings are loaded from the configuration file when
+Nominatim is started and may thus be changed at any time. Persistent settings
+are tied to a database installation and must only be read during installation
+time. If they are needed for the runtime then they must be saved into the
+`nominatim_properties` table and later loaded from there.
+
+### The Python module
+
+The Python module is expected to export a single factory function:
+
+```python
+def create(dsn: str, data_dir: Path) -> AbstractTokenizer
+```
+
+The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
+is a directory in the project directory that the tokenizer may use to save
+database-specific data. The function must return the instance of the tokenizer
+class as defined below.
+
+### Python Tokenizer Class
+
+All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
+and implement the abstract functions defined there.
+
+::: nominatim.tokenizer.base.AbstractTokenizer
+    rendering:
+        heading_level: 4
+
+### Python Analyzer Class
+
+::: nominatim.tokenizer.base.AbstractAnalyzer
+    rendering:
+        heading_level: 4
diff --git a/docs/extra.css b/docs/extra.css

index 136c59a6438121ba5c3c645fb1376cd2feb378d3..9289c1d39884909c0d6d4c4a0209f8cec1039c97 100644 (file)
--- a/docs/extra.css
+++ b/docs/extra.css
@@ -13,3 +13,11 @@ th, td {
  th {
      background-color: #eee;
  }
+
+/* Indentation for mkdocstrings.
+div.doc-contents:not(.first) {
+  padding-left: 25px;
+  border-left: 4px solid rgba(230, 230, 230);
+  margin-bottom: 60px;
+}*/
+
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml

index 5c6147aa6cafae3892fcad460453e83563e78bf6..bc8a6cddb60d69d597d50c9b3010f4490c7fd361 100644 (file)
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -30,6 +30,7 @@ pages:
          - 'Architecture Overview' : 'develop/overview.md'
          - 'OSM Data Import' : 'develop/Import.md'
          - 'Place Ranking' : 'develop/Ranking.md'
+        - 'Tokenizers' : 'develop/Tokenizers.md'
          - 'Postcodes' : 'develop/Postcodes.md'
          - 'Testing' : 'develop/Testing.md'
          - 'External Data Sources': 'develop/data-sources.md'
@@ -44,3 +45,11 @@ markdown_extensions:
      - toc:
          permalink: 
  extra_css: [extra.css, styles.css]
+plugins:
+    - search
+    - mkdocstrings:
+        handlers:
+          python:
+            rendering:
+              show_source: false
+              show_signature_annotations: false
diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py

new file mode 100644 (file)

index 0000000..00ecae4
--- /dev/null
+++ b/nominatim/tokenizer/base.py
@@ -0,0 +1,224 @@
+"""
+Abstract class definitions for tokenizers. These base classes are here
+mainly for documentation purposes.
+"""
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict, Any
+
+from nominatim.config import Configuration
+
+# pylint: disable=unnecessary-pass
+
+class AbstractAnalyzer(ABC):
+    """ The analyzer provides the functions for analysing names and building
+        the token database.
+
+        Analyzers are instantiated on a per-thread basis. Access to global data
+        structures must be synchronised accordingly.
+    """
+
+    def __enter__(self) -> 'AbstractAnalyzer':
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+
+    @abstractmethod
+    def close(self) -> None:
+        """ Free all resources used by the analyzer.
+        """
+        pass
+
+
+    @abstractmethod
+    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
+        """ Return token information for the given list of words.
+
+            The function is used for testing and debugging only
+            and does not need to be particularly efficient.
+
+            Arguments:
+                words: A list of words to look up the tokens for.
+                       If a word starts with # it is assumed to be a full name,
+                       otherwise it is a partial term.
+
+            Returns:
+                The function returns the list of all tuples that could be
+                found for the given words. Each list entry is a tuple of
+                (original word, word token, word id).
+        """
+        pass
+
+
+    @abstractmethod
+    def normalize_postcode(self, postcode: str) -> str:
+        """ Convert the postcode to its standardized form.
+
+            This function must yield exactly the same result as the SQL function
+            `token_normalized_postcode()`.
+
+            Arguments:
+                postcode: The postcode to be normalized.
+
+            Returns:
+                The given postcode after normalization.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_postcodes_from_db(self) -> None:
+        """ Update the tokenizer's postcode tokens from the current content
+            of the `location_postcode` table.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
+        """ Update the tokenizer's special phrase tokens from the given
+            list of special phrases.
+
+            Arguments:
+                phrases: The new list of special phrases. Each entry is
+                         a tuple of (phrase, class, type, operator).
+                should_replace: If true, replace the current list of phrases.
+                                When false, just add the given phrases to the
+                                ones that already exist.
+        """
+        pass
+
+
+    @abstractmethod
+    def add_country_names(self, country_code: str, names: Dict[str, str]):
+        """ Add the given names to the tokenizer's list of country tokens.
+
+            Arguments:
+                country_code: two-letter country code for the country the names
+                              refer to.
+                names: Dictionary of name type to name.
+        """
+        pass
+
+
+    @abstractmethod
+    def process_place(self, place: Dict) -> Any:
+        """ Extract tokens for the given place and compute the
+            information to be handed to the PL/pgSQL processor for building
+            the search index.
+
+            Arguments:
+                place: Dictionary with the information about the place. Currently
+                       the following fields may be present:
+
+                       - *name* is a dictionary of names for the place together
+                         with the designation of the name.
+                       - *address* is a dictionary of address terms.
+                       - *country_feature* is set to a country code when the
+                         place describes a country.
+
+            Returns:
+                A JSON-serialisable structure that will be handed into
+                the database via the `token_info` field.
+        """
+
+
+
+class AbstractTokenizer(ABC):
+    """ The tokenizer instance is the central instance of the tokenizer in
+        the system. There will only be a single instance of the tokenizer
+        active at any time.
+    """
+
+    @abstractmethod
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
+        """ Set up a new tokenizer for the database.
+
+            The function should copy all necessary data into the project
+            directory or save it in the property table to make sure that
+            the tokenizer remains stable over updates.
+
+            Arguments:
+              config: Read-only object with configuration options.
+
+              init_db: When set to False, then initialisation of database
+                tables should be skipped. This option is only required for
+                migration purposes and can be safely ignored by custom
+                tokenizers.
+
+            TODO: can we move the init_db parameter somewhere else?
+        """
+        pass
+
+
+    @abstractmethod
+    def init_from_project(self) -> None:
+        """ Initialise the tokenizer from an existing database setup.
+
+            The function should load all previously saved configuration from
+            the project directory and/or the property table.
+        """
+        pass
+
+
+    @abstractmethod
+    def finalize_import(self, config: Configuration) -> None:
+        """ This function is called at the very end of an import when all
+            data has been imported and indexed. The tokenizer may create
+            at this point any additional indexes and data structures needed
+            during query time.
+
+            Arguments:
+              config: Read-only object with configuration options.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_sql_functions(self, config: Configuration) -> None:
+        """ Update the SQL part of the tokenizer. This function is called
+            automatically on migrations or may be called explicitly by the
+            user through the `nominatim refresh --functions` command.
+
+            The tokenizer must only update the code of the tokenizer. The
+            data structures or data itself must not be changed by this function.
+
+            Arguments:
+              config: Read-only object with configuration options.
+        """
+        pass
+
+
+    @abstractmethod
+    def check_database(self) -> str:
+        """ Check that the database is set up correctly and ready for being
+            queried.
+
+            Returns:
+              If an issue was found, return an error message with the
+              description of the issue as well as hints for the user on
+              how to resolve the issue.
+
+              Return `None`, if no issue was found.
+        """
+        pass
+
+
+    @abstractmethod
+    def name_analyzer(self) -> AbstractAnalyzer:
+        """ Create a new analyzer for tokenizing names and queries
+            using this tokenizer. Analyzers are context managers and should
+            be used accordingly:
+
+            ```
+            with tokenizer.name_analyzer() as analyzer:
+                analyzer.tokenize()
+            ```
+
+            When used outside the with construct, the caller must ensure to
+            call the close() function before destructing the analyzer.
+        """
+        pass
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index a887ae286834e6005a97ddd53da467b4af54f1ff..44034f842622f08257878b69d392af1f47b00df7 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -16,6 +16,7 @@ from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -28,7 +29,7 @@ def create(dsn, data_dir):
      return LegacyICUTokenizer(dsn, data_dir)
  
  
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
      """ This tokenizer uses libICU to covert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
          normalization routines in Nominatim 3.
@@ -192,7 +193,7 @@ class LegacyICUTokenizer:
          return words
  
  
-class LegacyICUNameAnalyzer:
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
  
          Each instance opens a connection to the database to request the
@@ -207,14 +208,6 @@ class LegacyICUNameAnalyzer:
          self._cache = _TokenCache()
  
  
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
      def close(self):
          """ Free all resources used by the analyzer.
          """
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py

index c19dce2f5a2a3c0d903cd13bca7bd6e3738a8008..8957426b353efa7ec17f572e754f7fe47f90022c 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -16,6 +16,7 @@ from nominatim.db import properties
  from nominatim.db import utils as db_utils
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.errors import UsageError
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_NORMALIZATION = "tokenizer_normalization"
  DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
              raise UsageError("Database module cannot be accessed.") from err
  
  
-class LegacyTokenizer:
+class LegacyTokenizer(AbstractTokenizer):
      """ The legacy tokenizer uses a special PostgreSQL module to normalize
          names and queries. The tokenizer thus implements normalization through
          calls to the database.
@@ -238,7 +239,7 @@ class LegacyTokenizer:
          properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
  
  
-class LegacyNameAnalyzer:
+class LegacyNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the special Postgresql module for
          splitting names.
  
@@ -255,14 +256,6 @@ class LegacyNameAnalyzer:
          self._cache = _TokenCache(self.conn)
  
  
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
      def close(self):
          """ Free all resources used by the analyzer.
          """
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 10 Aug 2021 12:51:35 +0000 (14:51 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 16 Aug 2021 09:41:54 +0000 (11:41 +0200)
docs/develop/Tokenizers.md		patch \| blob \| history
docs/extra.css		patch \| blob \| history
docs/mkdocs.yml		patch \| blob \| history
nominatim/tokenizer/base.py	[new file with mode: 0644]	patch \| blob
nominatim/tokenizer/legacy_icu_tokenizer.py		patch \| blob \| history
nominatim/tokenizer/legacy_tokenizer.py		patch \| blob \| history