# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path

from ..typing import Protocol
from ..config import Configuration
from ..db.connection import Connection
from ..data.place_info import PlaceInfo

class AbstractAnalyzer(ABC):
    """ The analyzer provides the functions for analysing names and building
        the token database.

        Analyzers are instantiated on a per-thread base. Access to global data
        structures must be synchronised accordingly.

        Analyzers are context managers: leaving the `with` block calls
        `close()`.
    """

    def __enter__(self) -> 'AbstractAnalyzer':
        """ Enter the runtime context and hand back the analyzer itself.
        """
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        """ Leave the runtime context and release the analyzer's resources.

            Returns `None`, so an exception raised inside the `with` block
            is never suppressed.
        """
        self.close()

    @abstractmethod
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """

    @abstractmethod
    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            The function is used for testing and debugging only
            and does not need to be particularly efficient.

            Arguments:
                words: A list of words to look up the tokens for.
                       If a word starts with # it is assumed to be a full name
                       otherwise is a partial term.

            Returns:
                The function returns the list of all tuples that could be
                found for the given words. Each list entry is a tuple of
                (original word, word token, word id).
        """

    @abstractmethod
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to its standardized form.

            This function must yield exactly the same result as the SQL function
            `token_normalized_postcode()`.

            Arguments:
                postcode: The postcode to be normalized.

            Returns:
                The given postcode after normalization.
        """

    @abstractmethod
    def update_postcodes_from_db(self) -> None:
        """ Update the tokenizer's postcode tokens from the current content
            of the `location_postcode` table.
        """

    @abstractmethod
    def update_special_phrases(self,
                               phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Update the tokenizer's special phrase tokens from the given
            list of special phrases.

            Arguments:
                phrases: The new list of special phrases. Each entry is
                         a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                                When false, just add the given phrases to the
                                ones that already exist.
        """

    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: two-letter country code for the country the names
                              refer to.
                names: Dictionary of name type to name.
        """

    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Place information retrieved from the database.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
        """
class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
              config: Read-only object with configuration options.

              init_db: When set to False, then initialisation of database
                tables should be skipped. This option is only required for
                migration purposes and can be safely ignored by custom
                tokenizers.
        """

    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
              config: Read-only object with configuration options.
        """

    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures needed
            during query time.

            Arguments:
              config: Read-only object with configuration options.
        """

    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this function.

            Arguments:
              config: Read-only object with configuration options.
        """

    @abstractmethod
    def check_database(self, config: Configuration) -> Optional[str]:
        """ Check that the database is set up correctly and ready for being
            queried.

            Arguments:
              config: Read-only object with configuration options.

            Returns:
              If an issue was found, return an error message with the
              description of the issue as well as hints for the user on
              how to resolve the issue. If everything is okay, return `None`.
        """

    @abstractmethod
    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it to be called in order to work.

            Arguments:
              config: Read-only object with configuration options.

              threads: Presumably the number of threads the computation
                may use; defaults to 1. NOTE(review): the exact meaning is
                not specified here - confirm against the implementations.
        """

    @abstractmethod
    def update_word_tokens(self) -> None:
        """ Do house-keeping on the tokenizer's internal data structures.
            Remove unused word tokens, resort data etc.
        """

    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                token_info = analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
        """

    @abstractmethod
    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the most frequent full words in the database.

            Arguments:
              conn: Open connection to the database which may be used to
                    retrieve the words.
              num: Maximum number of words to return.
        """
class TokenizerModule(Protocol):
    """ Interface that must be exported by modules that implement their
        own tokenizer.
    """

    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
        """ Factory for new tokenizers.

            Arguments:
                dsn: Presumably the connection string of the database the
                     tokenizer works on. NOTE(review): not described here -
                     confirm against the callers.
                data_dir: Presumably a directory holding the tokenizer's
                     data files. NOTE(review): confirm against the callers.

            Returns:
                A new tokenizer instance.
        """