X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f0d640961a9005cf0e6874bd86b764a03ac2ab86..78c19bc006d5957f183968d086226be4b2b65fe3:/nominatim/tokenizer/token_analysis/base.py diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py index 53264b94..c7ec61c9 100644 --- a/nominatim/tokenizer/token_analysis/base.py +++ b/nominatim/tokenizer/token_analysis/base.py @@ -10,27 +10,56 @@ Common data types and protocols for analysers. from typing import Mapping, List, Any from nominatim.typing import Protocol +from nominatim.data.place_name import PlaceName -class Analyser(Protocol): +class Analyzer(Protocol): """ The `create()` function of an analysis module needs to return an object that implements the following functions. """ - def normalize(self, name: str) -> str: - """ Return the normalized form of the name. This is the standard form - from which possible variants for the name can be derived. + def get_canonical_id(self, name: PlaceName) -> str: + """ Return the canonical form of the given name. The canonical ID must + be unique (the same ID must always yield the same variants) and + must be a form from which the variants can be derived. + + Arguments: + name: Extended place name description as prepared by + the sanitizers. + + Returns: + ID string with a canonical form of the name. The string may + be empty, when the analyzer cannot analyze the name at all, + for example because the character set in use does not match. """ - def get_variants_ascii(self, norm_name: str) -> List[str]: - """ Compute the spelling variants for the given normalized name - and transliterate the result. + def compute_variants(self, canonical_id: str) -> List[str]: + """ Compute the transliterated spelling variants for the given + canonical ID. + + Arguments: + canonical_id: ID string previously computed with + `get_canonical_id()`. + + Returns: + A list of possible spelling variants. 
All strings must have + been transformed with the global normalizer and + transliterator ICU rules. Otherwise they cannot be matched + against the input by the query frontend. + The list may be empty, when there are no useful + spelling variants. This may happen when an analyzer usually + only outputs additional variants to the canonical spelling + and there are no such variants. """ + class AnalysisModule(Protocol): - """ Protocol for analysis modules. + """ The setup of the token analysis is split into two parts: + configuration and analyzer factory. A token analysis module must + therefore implement the two functions described here. """ - def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any: + def configure(self, rules: Mapping[str, Any], + normalizer: Any, transliterator: Any) -> Any: """ Prepare the configuration of the analysis module. This function should prepare all data that can be shared between instances of this analyser. @@ -38,15 +67,18 @@ class AnalysisModule(Protocol): Arguments: rules: A dictionary with the additional configuration options as specified in the tokenizer configuration. - normalization_rules: ICU rules for normalization as a string - that can be used with createFromRules(). + normalizer: an ICU Transliterator with the compiled + global normalization rules. + transliterator: an ICU Transliterator with the compiled + global transliteration rules. Returns: - A data object with the configuration that was set up. May be - used freely by the analysis module as needed. + A data object with configuration data. This will be handed + as is into the `create()` function and may be + used freely by the analysis module as needed. """ - def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyser: + def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer: """ Create a new instance of the analyser. 
A separate instance of the analyser is created for each thread when used in multi-threading context. @@ -54,11 +86,11 @@ class AnalysisModule(Protocol): Arguments: normalizer: an ICU Transliterator with the compiled normalization rules. - transliterator: an ICU tranliterator with the compiled + transliterator: an ICU Transliterator with the compiled transliteration rules. config: The object that was returned by the call to configure(). Returns: - A new analyzer instance. This must be a class that implements - the Analyser protocol. + A new analyzer instance. This must be an object that implements + the Analyzer protocol. """