# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable

from ..typing import Protocol
from ..config import Configuration
from ..db.connection import Connection
from ..data.place_info import PlaceInfo


class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the term index.

        Analyzers are instantiated on a per-thread basis. Access to global
        data structures must be synchronised accordingly.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()

    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """

    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with '#', it is assumed to be a full
                       name, otherwise it is a partial term.

            Returns:
                The list of all tuples that could be found for the given
                words. Each list entry is a tuple of
                (original word, word token, word id).
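
            Example (a sketch; word tokens and word ids are illustrative
            and depend on the contents of the token database):

            ```
            analyzer.get_word_token_info(['#Berlin', 'berlin'])
            # -> [('#Berlin', 'berlin', 117), ('berlin', 'berlin', 118)]
            ```
        """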

    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL
            function `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
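
            Example (a sketch, assuming an implementation that trims
            whitespace and uppercases):

            ```
            analyzer.normalize_postcode(' ab1 2cd ')  # -> 'AB1 2CD'
            ```
        """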

    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """

    @abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
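
            Example (a sketch; the operator value '-' for "no operator" is
            illustrative and depends on the phrase source):

            ```
            analyzer.update_special_phrases(
                [('swimming pool', 'leisure', 'swimming_pool', '-')],
                should_replace=True)
            ```
        """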

    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: two-letter country code for the country the
                              names refer to.
                names: Dictionary of name type to name.
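
            Example (illustrative; name types shown in OSM tag style):

            ```
            analyzer.add_country_names('de', {'name': 'Deutschland',
                                              'name:en': 'Germany'})
            ```
        """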

    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Place information retrieved from the database.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
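
            Example (a sketch; the keys and values of the returned
            structure are entirely up to the implementation):

            ```
            token_info = analyzer.process_place(place)
            # e.g. {'names': '{117,118}', 'street': '{231}'}
            ```
        """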


class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
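
        A typical import lifecycle, as a sketch (`module`, `dsn`, `config`
        and `place` stand in for the configured tokenizer module and
        previously loaded data):

        ```
        tokenizer = module.create(dsn)
        tokenizer.init_new_db(config)
        with tokenizer.name_analyzer() as analyzer:
            token_info = analyzer.process_place(place)
        tokenizer.finalize_import(config)
        ```
    """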

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.
                init_db: When set to False, then initialisation of database
                         tables should be skipped. This option is only
                         required for migration purposes and can be safely
                         ignored by custom tokenizers.
        """

    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this
            function.

            Arguments:
                config: Read-only object with configuration options.
        """

    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready for being
            queried.

            Arguments:
                config: Read-only object with configuration options.

            Returns:
                If an issue was found, return an error message with the
                description of the issue as well as hints for the user on
                how to resolve the issue. If everything is okay, return
                `None`.
        """

    @abstractmethod
    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it being called in order to work.
        """

    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures:
            remove unused word tokens, resort data, etc.
        """

    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per
            thread.
        """

    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.

            Arguments:
                conn: Open connection to the database which may be used to
                      retrieve the words.
                num: Maximum number of words to return.
        """


class TokenizerModule(Protocol):
    """ Interface that must be exported by modules that implement their
        own tokenizer.
    """

    def create(self, dsn: str) -> AbstractTokenizer:
        """ Factory for new tokenizers.
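
            A custom tokenizer module only needs to export a module-level
            function with this signature. A minimal sketch (`MyTokenizer`
            is a hypothetical AbstractTokenizer implementation):

            ```
            def create(dsn: str) -> AbstractTokenizer:
                return MyTokenizer(dsn)
            ```
        """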