"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional

from nominatim.config import Configuration
10 # pylint: disable=unnecessary-pass
class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and for
        building the token information derived from them.

        Analyzers are instantiated on a per-thread base. Access to global data
        structures must be synchronised accordingly.

        An analyzer can be used as a context manager; leaving the `with`
        block calls `close()`.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        return self


    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        # Release the resources unconditionally. Returning None means that
        # an exception raised inside the `with` block is not suppressed.
        self.close()


    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        pass


    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with # it is assumed to be a full name
                       otherwise is a partial term.

            Returns:
                The function returns the list of all tuples that could be
                found for the given words. Each list entry is a tuple of
                (original word, word token, word id).
        """
        pass


    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL function
            `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
        """
        pass


    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """
        pass


    @abstractmethod
    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
        """
        pass


    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: Two-letter country code of the country the
                              names belong to.
                names: Dictionary of name type to name.
        """
        pass


    @abstractmethod
    def process_place(self, place: Dict[str, Any]) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor.

            Arguments:
                place: Dictionary with the information about the place. Currently
                       the following fields may be present:

                       - *name* is a dictionary of names for the place together
                         with the designation of the name.
                       - *address* is a dictionary of address terms.
                       - *country_feature* is set to a country code when the
                         place describes a country.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
        """
        pass
class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. Only a single instance of the tokenizer is expected
        to exist.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.

                init_db: When set to False, then initialisation of database
                  tables should be skipped. This option is only required for
                  migration purposes and can be safely ignored by custom
                  tokenizer implementations.

            TODO: can we move the init_db parameter somewhere else?
        """
        pass


    @abstractmethod
    def init_from_project(self) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.
        """
        pass


    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures it
            needs.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this function.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def check_database(self) -> Optional[str]:
        """ Check that the database is set up correctly and ready for use.

            Returns:
                If an issue was found, return an error message with the
                description of the issue as well as hints for the user on
                how to resolve the issue.

                Return `None`, if no issue was found.
        """
        pass


    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
        """
        pass