nominatim/tokenizer/base.py

   1 """
   2 Abstract class definitions for tokenizers. These base classes are here
   3 mainly for documentation purposes.
   4 """
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional

from nominatim.config import Configuration
from nominatim.indexer.place_info import PlaceInfo
  10
  11 # pylint: disable=unnecessary-pass
  12
  13 class AbstractAnalyzer(ABC):
  14     """ The analyzer provides the functions for analysing names and building
  15         the token database.
  16
  17         Analyzers are instantiated on a per-thread base. Access to global data
  18         structures must be synchronised accordingly.
  19     """
  20
  21     def __enter__(self) -> 'AbstractAnalyzer':
  22         return self
  23
  24
  25     def __exit__(self, exc_type, exc_value, traceback) -> None:
  26         self.close()
  27
  28
  29     @abstractmethod
  30     def close(self) -> None:
  31         """ Free all resources used by the analyzer.
  32         """
  33         pass
  34
  35
  36     @abstractmethod
  37     def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
  38         """ Return token information for the given list of words.
  39
  40             The function is used for testing and debugging only
  41             and does not need to be particularly efficient.
  42
  43             Arguments:
  44                 words: A list of words to look up the tokens for.
  45                        If a word starts with # it is assumed to be a full name
  46                        otherwise is a partial term.
  47
  48             Returns:
  49                 The function returns the list of all tuples that could be
  50                 found for the given words. Each list entry is a tuple of
  51                 (original word, word token, word id).
  52         """
  53         pass
  54
  55
  56     @abstractmethod
  57     def normalize_postcode(self, postcode: str) -> str:
  58         """ Convert the postcode to its standardized form.
  59
  60             This function must yield exactly the same result as the SQL function
  61             `token_normalized_postcode()`.
  62
  63             Arguments:
  64                 postcode: The postcode to be normalized.
  65
  66             Returns:
  67                 The given postcode after normalization.
  68         """
  69         pass
  70
  71
  72     @abstractmethod
  73     def update_postcodes_from_db(self) -> None:
  74         """ Update the tokenizer's postcode tokens from the current content
  75             of the `location_postcode` table.
  76         """
  77         pass
  78
  79
  80     @abstractmethod
  81     def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
  82                                should_replace: bool) -> None:
  83         """ Update the tokenizer's special phrase tokens from the given
  84             list of special phrases.
  85
  86             Arguments:
  87                 phrases: The new list of special phrases. Each entry is
  88                          a tuple of (phrase, class, type, operator).
  89                 should_replace: If true, replace the current list of phrases.
  90                                 When false, just add the given phrases to the
  91                                 ones that already exist.
  92         """
  93         pass
  94
  95
  96     @abstractmethod
  97     def add_country_names(self, country_code: str, names: Dict[str, str]):
  98         """ Add the given names to the tokenizer's list of country tokens.
  99
 100             Arguments:
 101                 country_code: two-letter country code for the country the names
 102                               refer to.
 103                 names: Dictionary of name type to name.
 104         """
 105         pass
 106
 107
 108     @abstractmethod
 109     def process_place(self, place: PlaceInfo) -> Any:
 110         """ Extract tokens for the given place and compute the
 111             information to be handed to the PL/pgSQL processor for building
 112             the search index.
 113
 114             Arguments:
 115                 place: Place information retrived from the database.
 116
 117             Returns:
 118                 A JSON-serialisable structure that will be handed into
 119                 the database via the `token_info` field.
 120         """
 121
 122
 123
 124 class AbstractTokenizer(ABC):
 125     """ The tokenizer instance is the central instance of the tokenizer in
 126         the system. There will only be a single instance of the tokenizer
 127         active at any time.
 128     """
 129
 130     @abstractmethod
 131     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 132         """ Set up a new tokenizer for the database.
 133
 134             The function should copy all necessary data into the project
 135             directory or save it in the property table to make sure that
 136             the tokenizer remains stable over updates.
 137
 138             Arguments:
 139               config: Read-only object with configuration options.
 140
 141               init_db: When set to False, then initialisation of database
 142                 tables should be skipped. This option is only required for
 143                 migration purposes and can be savely ignored by custom
 144                 tokenizers.
 145
 146             TODO: can we move the init_db parameter somewhere else?
 147         """
 148         pass
 149
 150
 151     @abstractmethod
 152     def init_from_project(self, config: Configuration) -> None:
 153         """ Initialise the tokenizer from an existing database setup.
 154
 155             The function should load all previously saved configuration from
 156             the project directory and/or the property table.
 157
 158             Arguments:
 159               config: Read-only object with configuration options.
 160         """
 161         pass
 162
 163
 164     @abstractmethod
 165     def finalize_import(self, config: Configuration) -> None:
 166         """ This function is called at the very end of an import when all
 167             data has been imported and indexed. The tokenizer may create
 168             at this point any additional indexes and data structures needed
 169             during query time.
 170
 171             Arguments:
 172               config: Read-only object with configuration options.
 173         """
 174         pass
 175
 176
 177     @abstractmethod
 178     def update_sql_functions(self, config: Configuration) -> None:
 179         """ Update the SQL part of the tokenizer. This function is called
 180             automatically on migrations or may be called explicitly by the
 181             user through the `nominatim refresh --functions` command.
 182
 183             The tokenizer must only update the code of the tokenizer. The
 184             data structures or data itself must not be changed by this function.
 185
 186             Arguments:
 187               config: Read-only object with configuration options.
 188         """
 189         pass
 190
 191
 192     @abstractmethod
 193     def check_database(self, config: Configuration) -> str:
 194         """ Check that the database is set up correctly and ready for being
 195             queried.
 196
 197             Arguments:
 198               config: Read-only object with configuration options.
 199
 200             Returns:
 201               If an issue was found, return an error message with the
 202               description of the issue as well as hints for the user on
 203               how to resolve the issue. If everything is okay, return `None`.
 204         """
 205         pass
 206
 207
 208     @abstractmethod
 209     def update_statistics(self) -> None:
 210         """ Recompute any tokenizer statistics necessary for efficient lookup.
 211             This function is meant to be called from time to time by the user
 212             to improve performance. However, the tokenizer must not depend on
 213             it to be called in order to work.
 214         """
 215         pass
 216
 217
 218     @abstractmethod
 219     def name_analyzer(self) -> AbstractAnalyzer:
 220         """ Create a new analyzer for tokenizing names and queries
 221             using this tokinzer. Analyzers are context managers and should
 222             be used accordingly:
 223
 224             ```
 225             with tokenizer.name_analyzer() as analyzer:
 226                 analyser.tokenize()
 227             ```
 228
 229             When used outside the with construct, the caller must ensure to
 230             call the close() function before destructing the analyzer.
 231         """
 232         pass