nominatim/tokenizer/base.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
Abstract class definitions for tokenizers. These base classes are here
   9 mainly for documentation purposes.
  10 """
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional
from pathlib import Path

from typing_extensions import Protocol

from nominatim.config import Configuration
from nominatim.data.place_info import PlaceInfo
  19
  20 class AbstractAnalyzer(ABC):
  21     """ The analyzer provides the functions for analysing names and building
  22         the token database.
  23
  24         Analyzers are instantiated on a per-thread base. Access to global data
  25         structures must be synchronised accordingly.
  26     """
  27
  28     def __enter__(self) -> 'AbstractAnalyzer':
  29         return self
  30
  31
  32     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
  33         self.close()
  34
  35
  36     @abstractmethod
  37     def close(self) -> None:
  38         """ Free all resources used by the analyzer.
  39         """
  40
  41
  42     @abstractmethod
  43     def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
  44         """ Return token information for the given list of words.
  45
  46             The function is used for testing and debugging only
  47             and does not need to be particularly efficient.
  48
  49             Arguments:
  50                 words: A list of words to look up the tokens for.
  51                        If a word starts with # it is assumed to be a full name
  52                        otherwise is a partial term.
  53
  54             Returns:
  55                 The function returns the list of all tuples that could be
  56                 found for the given words. Each list entry is a tuple of
  57                 (original word, word token, word id).
  58         """
  59
  60
  61     @abstractmethod
  62     def normalize_postcode(self, postcode: str) -> str:
  63         """ Convert the postcode to its standardized form.
  64
  65             This function must yield exactly the same result as the SQL function
  66             `token_normalized_postcode()`.
  67
  68             Arguments:
  69                 postcode: The postcode to be normalized.
  70
  71             Returns:
  72                 The given postcode after normalization.
  73         """
  74
  75
  76     @abstractmethod
  77     def update_postcodes_from_db(self) -> None:
  78         """ Update the tokenizer's postcode tokens from the current content
  79             of the `location_postcode` table.
  80         """
  81
  82
  83     @abstractmethod
  84     def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
  85                                should_replace: bool) -> None:
  86         """ Update the tokenizer's special phrase tokens from the given
  87             list of special phrases.
  88
  89             Arguments:
  90                 phrases: The new list of special phrases. Each entry is
  91                          a tuple of (phrase, class, type, operator).
  92                 should_replace: If true, replace the current list of phrases.
  93                                 When false, just add the given phrases to the
  94                                 ones that already exist.
  95         """
  96
  97
  98     @abstractmethod
  99     def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
 100         """ Add the given names to the tokenizer's list of country tokens.
 101
 102             Arguments:
 103                 country_code: two-letter country code for the country the names
 104                               refer to.
 105                 names: Dictionary of name type to name.
 106         """
 107
 108
 109     @abstractmethod
 110     def process_place(self, place: PlaceInfo) -> Any:
 111         """ Extract tokens for the given place and compute the
 112             information to be handed to the PL/pgSQL processor for building
 113             the search index.
 114
 115             Arguments:
 116                 place: Place information retrived from the database.
 117
 118             Returns:
 119                 A JSON-serialisable structure that will be handed into
 120                 the database via the `token_info` field.
 121         """
 122
 123
 124
 125 class AbstractTokenizer(ABC):
 126     """ The tokenizer instance is the central instance of the tokenizer in
 127         the system. There will only be a single instance of the tokenizer
 128         active at any time.
 129     """
 130
 131     @abstractmethod
 132     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 133         """ Set up a new tokenizer for the database.
 134
 135             The function should copy all necessary data into the project
 136             directory or save it in the property table to make sure that
 137             the tokenizer remains stable over updates.
 138
 139             Arguments:
 140               config: Read-only object with configuration options.
 141
 142               init_db: When set to False, then initialisation of database
 143                 tables should be skipped. This option is only required for
 144                 migration purposes and can be savely ignored by custom
 145                 tokenizers.
 146
 147             TODO: can we move the init_db parameter somewhere else?
 148         """
 149
 150
 151     @abstractmethod
 152     def init_from_project(self, config: Configuration) -> None:
 153         """ Initialise the tokenizer from an existing database setup.
 154
 155             The function should load all previously saved configuration from
 156             the project directory and/or the property table.
 157
 158             Arguments:
 159               config: Read-only object with configuration options.
 160         """
 161
 162
 163     @abstractmethod
 164     def finalize_import(self, config: Configuration) -> None:
 165         """ This function is called at the very end of an import when all
 166             data has been imported and indexed. The tokenizer may create
 167             at this point any additional indexes and data structures needed
 168             during query time.
 169
 170             Arguments:
 171               config: Read-only object with configuration options.
 172         """
 173
 174
 175     @abstractmethod
 176     def update_sql_functions(self, config: Configuration) -> None:
 177         """ Update the SQL part of the tokenizer. This function is called
 178             automatically on migrations or may be called explicitly by the
 179             user through the `nominatim refresh --functions` command.
 180
 181             The tokenizer must only update the code of the tokenizer. The
 182             data structures or data itself must not be changed by this function.
 183
 184             Arguments:
 185               config: Read-only object with configuration options.
 186         """
 187
 188
 189     @abstractmethod
 190     def check_database(self, config: Configuration) -> str:
 191         """ Check that the database is set up correctly and ready for being
 192             queried.
 193
 194             Arguments:
 195               config: Read-only object with configuration options.
 196
 197             Returns:
 198               If an issue was found, return an error message with the
 199               description of the issue as well as hints for the user on
 200               how to resolve the issue. If everything is okay, return `None`.
 201         """
 202
 203
 204     @abstractmethod
 205     def update_statistics(self) -> None:
 206         """ Recompute any tokenizer statistics necessary for efficient lookup.
 207             This function is meant to be called from time to time by the user
 208             to improve performance. However, the tokenizer must not depend on
 209             it to be called in order to work.
 210         """
 211
 212
 213     @abstractmethod
 214     def update_word_tokens(self) -> None:
 215         """ Do house-keeping on the tokenizers internal data structures.
 216             Remove unused word tokens, resort data etc.
 217         """
 218
 219
 220     @abstractmethod
 221     def name_analyzer(self) -> AbstractAnalyzer:
 222         """ Create a new analyzer for tokenizing names and queries
 223             using this tokinzer. Analyzers are context managers and should
 224             be used accordingly:
 225
 226             ```
 227             with tokenizer.name_analyzer() as analyzer:
 228                 analyser.tokenize()
 229             ```
 230
 231             When used outside the with construct, the caller must ensure to
 232             call the close() function before destructing the analyzer.
 233         """
 234
 235
 236 class TokenizerModule(Protocol):
 237     """ Interface that must be exported by modules that implement their
 238         own tokenizer.
 239     """
 240
 241     def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
 242         """ Factory for new tokenizers.
 243         """