nominatim/tokenizer/base.py

"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional

from nominatim.config import Configuration
from nominatim.indexer.place_info import PlaceInfo
  10
  11 # pylint: disable=unnecessary-pass
  12
  13 class AbstractAnalyzer(ABC):
  14     """ The analyzer provides the functions for analysing names and building
  15         the token database.
  16
  17         Analyzers are instantiated on a per-thread base. Access to global data
  18         structures must be synchronised accordingly.
  19     """
  20
  21     def __enter__(self) -> 'AbstractAnalyzer':
  22         return self
  23
  24
  25     def __exit__(self, exc_type, exc_value, traceback) -> None:
  26         self.close()
  27
  28
  29     @abstractmethod
  30     def close(self) -> None:
  31         """ Free all resources used by the analyzer.
  32         """
  33
  34
  35     @abstractmethod
  36     def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
  37         """ Return token information for the given list of words.
  38
  39             The function is used for testing and debugging only
  40             and does not need to be particularly efficient.
  41
  42             Arguments:
  43                 words: A list of words to look up the tokens for.
  44                        If a word starts with # it is assumed to be a full name
  45                        otherwise is a partial term.
  46
  47             Returns:
  48                 The function returns the list of all tuples that could be
  49                 found for the given words. Each list entry is a tuple of
  50                 (original word, word token, word id).
  51         """
  52
  53
  54     @abstractmethod
  55     def normalize_postcode(self, postcode: str) -> str:
  56         """ Convert the postcode to its standardized form.
  57
  58             This function must yield exactly the same result as the SQL function
  59             `token_normalized_postcode()`.
  60
  61             Arguments:
  62                 postcode: The postcode to be normalized.
  63
  64             Returns:
  65                 The given postcode after normalization.
  66         """
  67
  68
  69     @abstractmethod
  70     def update_postcodes_from_db(self) -> None:
  71         """ Update the tokenizer's postcode tokens from the current content
  72             of the `location_postcode` table.
  73         """
  74
  75
  76     @abstractmethod
  77     def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
  78                                should_replace: bool) -> None:
  79         """ Update the tokenizer's special phrase tokens from the given
  80             list of special phrases.
  81
  82             Arguments:
  83                 phrases: The new list of special phrases. Each entry is
  84                          a tuple of (phrase, class, type, operator).
  85                 should_replace: If true, replace the current list of phrases.
  86                                 When false, just add the given phrases to the
  87                                 ones that already exist.
  88         """
  89
  90
  91     @abstractmethod
  92     def add_country_names(self, country_code: str, names: Dict[str, str]):
  93         """ Add the given names to the tokenizer's list of country tokens.
  94
  95             Arguments:
  96                 country_code: two-letter country code for the country the names
  97                               refer to.
  98                 names: Dictionary of name type to name.
  99         """
 100
 101
 102     @abstractmethod
 103     def process_place(self, place: PlaceInfo) -> Any:
 104         """ Extract tokens for the given place and compute the
 105             information to be handed to the PL/pgSQL processor for building
 106             the search index.
 107
 108             Arguments:
 109                 place: Place information retrived from the database.
 110
 111             Returns:
 112                 A JSON-serialisable structure that will be handed into
 113                 the database via the `token_info` field.
 114         """
 115
 116
 117
 118 class AbstractTokenizer(ABC):
 119     """ The tokenizer instance is the central instance of the tokenizer in
 120         the system. There will only be a single instance of the tokenizer
 121         active at any time.
 122     """
 123
 124     @abstractmethod
 125     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 126         """ Set up a new tokenizer for the database.
 127
 128             The function should copy all necessary data into the project
 129             directory or save it in the property table to make sure that
 130             the tokenizer remains stable over updates.
 131
 132             Arguments:
 133               config: Read-only object with configuration options.
 134
 135               init_db: When set to False, then initialisation of database
 136                 tables should be skipped. This option is only required for
 137                 migration purposes and can be savely ignored by custom
 138                 tokenizers.
 139
 140             TODO: can we move the init_db parameter somewhere else?
 141         """
 142
 143
 144     @abstractmethod
 145     def init_from_project(self, config: Configuration) -> None:
 146         """ Initialise the tokenizer from an existing database setup.
 147
 148             The function should load all previously saved configuration from
 149             the project directory and/or the property table.
 150
 151             Arguments:
 152               config: Read-only object with configuration options.
 153         """
 154
 155
 156     @abstractmethod
 157     def finalize_import(self, config: Configuration) -> None:
 158         """ This function is called at the very end of an import when all
 159             data has been imported and indexed. The tokenizer may create
 160             at this point any additional indexes and data structures needed
 161             during query time.
 162
 163             Arguments:
 164               config: Read-only object with configuration options.
 165         """
 166
 167
 168     @abstractmethod
 169     def update_sql_functions(self, config: Configuration) -> None:
 170         """ Update the SQL part of the tokenizer. This function is called
 171             automatically on migrations or may be called explicitly by the
 172             user through the `nominatim refresh --functions` command.
 173
 174             The tokenizer must only update the code of the tokenizer. The
 175             data structures or data itself must not be changed by this function.
 176
 177             Arguments:
 178               config: Read-only object with configuration options.
 179         """
 180
 181
 182     @abstractmethod
 183     def check_database(self, config: Configuration) -> str:
 184         """ Check that the database is set up correctly and ready for being
 185             queried.
 186
 187             Arguments:
 188               config: Read-only object with configuration options.
 189
 190             Returns:
 191               If an issue was found, return an error message with the
 192               description of the issue as well as hints for the user on
 193               how to resolve the issue. If everything is okay, return `None`.
 194         """
 195
 196
 197     @abstractmethod
 198     def update_statistics(self) -> None:
 199         """ Recompute any tokenizer statistics necessary for efficient lookup.
 200             This function is meant to be called from time to time by the user
 201             to improve performance. However, the tokenizer must not depend on
 202             it to be called in order to work.
 203         """
 204
 205
 206     @abstractmethod
 207     def name_analyzer(self) -> AbstractAnalyzer:
 208         """ Create a new analyzer for tokenizing names and queries
 209             using this tokinzer. Analyzers are context managers and should
 210             be used accordingly:
 211
 212             ```
 213             with tokenizer.name_analyzer() as analyzer:
 214                 analyser.tokenize()
 215             ```
 216
 217             When used outside the with construct, the caller must ensure to
 218             call the close() function before destructing the analyzer.
 219         """