X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d35e3c25b66d25e4a3dd073c7ba17b0c204ec8e8..38f467bae3d9a6128180367c0e46ffd0bcad0961:/nominatim/tokenizer/token_analysis/base.py diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py index b55b4f7c..68046f96 100644 --- a/nominatim/tokenizer/token_analysis/base.py +++ b/nominatim/tokenizer/token_analysis/base.py @@ -7,39 +7,90 @@ """ Common data types and protocols for analysers. """ -from typing import TypeVar, Mapping, List, Any +from typing import Mapping, List, Any -from typing_extensions import Protocol +from nominatim.typing import Protocol +from nominatim.data.place_name import PlaceName +class Analyzer(Protocol): + """ The `create()` function of an analysis module needs to return an + object that implements the following functions. + """ -T_config = TypeVar('T_config') # pylint: disable=invalid-name + def get_canonical_id(self, name: PlaceName) -> str: + """ Return the canonical form of the given name. The canonical ID must + be unique (the same ID must always yield the same variants) and + must be a form from which the variants can be derived. -class Analyser(Protocol): - """ Instance of the token analyser. - """ + Arguments: + name: Extended place name description as prepared by + the sanitizers. - def normalize(self, name: str) -> str: - """ Return the normalized form of the name. This is the standard form - from which possible variants for the name can be derived. + Returns: + ID string with a canonical form of the name. The string may + be empty, when the analyzer cannot analyze the name at all, + for example because the character set in use does not match. """ - def get_variants_ascii(self, norm_name: str) -> List[str]: - """ Compute the spelling variants for the given normalized name - and transliterate the result. 
+ def compute_variants(self, canonical_id: str) -> List[str]: + """ Compute the transliterated spelling variants for the given + canonical ID. + + Arguments: + canonical_id: ID string previously computed with + `get_canonical_id()`. + + Returns: + A list of possible spelling variants. All strings must have + been transformed with the global normalizer and + transliterator ICU rules. Otherwise they cannot be matched + against the input by the query frontend. + The list may be empty, when there are no useful + spelling variants. This may happen when an analyzer usually + only outputs additional variants to the canonical spelling + and there are no such variants. """ -class AnalysisModule(Protocol[T_config]): - """ Protocol for analysis modules. + +class AnalysisModule(Protocol): + """ The setup of the token analysis is split into two parts: + configuration and analyser factory. A token analysis module must + therefore implement the two functions described here. """ - def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> T_config: + def configure(self, rules: Mapping[str, Any], + normalizer: Any, transliterator: Any) -> Any: """ Prepare the configuration of the analysis module. This function should prepare all data that can be shared between instances of this analyser. + + Arguments: + rules: A dictionary with the additional configuration options + as specified in the tokenizer configuration. + normalizer: an ICU Transliterator with the compiled + global normalization rules. + transliterator: an ICU Transliterator with the compiled + global transliteration rules. + + Returns: + A data object with configuration data. This will be handed + as is into the `create()` function and may be + used freely by the analysis module as needed. """ - def create(self, normalizer: Any, transliterator: Any, config: T_config) -> Analyser: + def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer: """ Create a new instance of the analyser.
A separate instance of the analyser is created for each thread when used in multi-threading context. + + Arguments: + normalizer: an ICU Transliterator with the compiled normalization + rules. + transliterator: an ICU Transliterator with the compiled + transliteration rules. + config: The object that was returned by the call to configure(). + + Returns: + A new analyzer instance. This must be an object that implements + the Analyzer protocol. """