nominatim/tokenizer/base.py

   1 """
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
   4 """
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional

from nominatim.config import Configuration
   9
  10 # pylint: disable=unnecessary-pass
  11
  12 class AbstractAnalyzer(ABC):
  13     """ The analyzer provides the functions for analysing names and building
  14         the token database.
  15
  16         Analyzers are instantiated on a per-thread base. Access to global data
  17         structures must be synchronised accordingly.
  18     """
  19
  20     def __enter__(self) -> 'AbstractAnalyzer':
  21         return self
  22
  23
  24     def __exit__(self, exc_type, exc_value, traceback) -> None:
  25         self.close()
  26
  27
  28     @abstractmethod
  29     def close(self) -> None:
  30         """ Free all resources used by the analyzer.
  31         """
  32         pass
  33
  34
  35     @abstractmethod
  36     def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
  37         """ Return token information for the given list of words.
  38
  39             The function is used for testing and debugging only
  40             and does not need to be particularly efficient.
  41
  42             Arguments:
  43                 words: A list of words to look up the tokens for.
  44                        If a word starts with # it is assumed to be a full name
  45                        otherwise is a partial term.
  46
  47             Returns:
  48                 The function returns the list of all tuples that could be
  49                 found for the given words. Each list entry is a tuple of
  50                 (original word, word token, word id).
  51         """
  52         pass
  53
  54
  55     @abstractmethod
  56     def normalize_postcode(self, postcode: str) -> str:
  57         """ Convert the postcode to its standardized form.
  58
  59             This function must yield exactly the same result as the SQL function
  60             `token_normalized_postcode()`.
  61
  62             Arguments:
  63                 postcode: The postcode to be normalized.
  64
  65             Returns:
  66                 The given postcode after normalization.
  67         """
  68         pass
  69
  70
  71     @abstractmethod
  72     def update_postcodes_from_db(self) -> None:
  73         """ Update the tokenizer's postcode tokens from the current content
  74             of the `location_postcode` table.
  75         """
  76         pass
  77
  78
  79     @abstractmethod
  80     def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
  81                                should_replace: bool) -> None:
  82         """ Update the tokenizer's special phrase tokens from the given
  83             list of special phrases.
  84
  85             Arguments:
  86                 phrases: The new list of special phrases. Each entry is
  87                          a tuple of (phrase, class, type, operator).
  88                 should_replace: If true, replace the current list of phrases.
  89                                 When false, just add the given phrases to the
  90                                 ones that already exist.
  91         """
  92         pass
  93
  94
  95     @abstractmethod
  96     def add_country_names(self, country_code: str, names: Dict[str, str]):
  97         """ Add the given names to the tokenizer's list of country tokens.
  98
  99             Arguments:
 100                 country_code: two-letter country code for the country the names
 101                               refer to.
 102                 names: Dictionary of name type to name.
 103         """
 104         pass
 105
 106
 107     @abstractmethod
 108     def process_place(self, place: Dict) -> Any:
 109         """ Extract tokens for the given place and compute the
 110             information to be handed to the PL/pgSQL processor for building
 111             the search index.
 112
 113             Arguments:
 114                 place: Dictionary with the information about the place. Currently
 115                        the following fields may be present:
 116
 117                        - *name* is a dictionary of names for the place together
 118                          with the designation of the name.
 119                        - *address* is a dictionary of address terms.
 120                        - *country_feature* is set to a country code when the
 121                          place describes a country.
 122
 123             Returns:
 124                 A JSON-serialisable structure that will be handed into
 125                 the database via the `token_info` field.
 126         """
 127
 128
 129
 130 class AbstractTokenizer(ABC):
 131     """ The tokenizer instance is the central instance of the tokenizer in
 132         the system. There will only be a single instance of the tokenizer
 133         active at any time.
 134     """
 135
 136     @abstractmethod
 137     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
 138         """ Set up a new tokenizer for the database.
 139
 140             The function should copy all necessary data into the project
 141             directory or save it in the property table to make sure that
 142             the tokenizer remains stable over updates.
 143
 144             Arguments:
 145               config: Read-only object with configuration obtions.
 146
 147               init_db: When set to False, then initialisation of database
 148                 tables should be skipped. This option is only required for
 149                 migration purposes and can be savely ignored by custom
 150                 tokenizers.
 151
 152             TODO: can we move the init_db parameter somewhere else?
 153         """
 154         pass
 155
 156
 157     @abstractmethod
 158     def init_from_project(self) -> None:
 159         """ Initialise the tokenizer from an existing database setup.
 160
 161             The function should load all previously saved configuration from
 162             the project directory and/or the property table.
 163         """
 164         pass
 165
 166
 167     @abstractmethod
 168     def finalize_import(self, config: Configuration) -> None:
 169         """ This function is called at the very end of an import when all
 170             data has been imported and indexed. The tokenizer may create
 171             at this point any additional indexes and data structures needed
 172             during query time.
 173
 174             Arguments:
 175               config: Read-only object with configuration obtions.
 176         """
 177         pass
 178
 179
 180     @abstractmethod
 181     def update_sql_functions(self, config: Configuration) -> None:
 182         """ Update the SQL part of the tokenizer. This function is called
 183             automatically on migrations or may be called explicitly by the
 184             user through the `nominatim refresh --functions` command.
 185
 186             The tokenizer must only update the code of the tokenizer. The
 187             data structures or data itself must not be changed by this function.
 188
 189             Arguments:
 190               config: Read-only object with configuration obtions.
 191         """
 192         pass
 193
 194
 195     @abstractmethod
 196     def check_database(self) -> str:
 197         """ Check that the database is set up correctly and ready for being
 198             queried.
 199
 200             Returns:
 201               If an issue was found, return an error message with the
 202               description of the issue as well as hints for the user on
 203               how to resolve the issue.
 204
 205               Return `None`, if no issue was found.
 206         """
 207         pass
 208
 209
 210     @abstractmethod
 211     def name_analyzer(self) -> AbstractAnalyzer:
 212         """ Create a new analyzer for tokenizing names and queries
 213             using this tokinzer. Analyzers are context managers and should
 214             be used accordingly:
 215
 216             ```
 217             with tokenizer.name_analyzer() as analyzer:
 218                 analyser.tokenize()
 219             ```
 220
 221             When used outside the with construct, the caller must ensure to
 222             call the close() function before destructing the analyzer.
 223         """
 224         pass