From f0d640961a9005cf0e6874bd86b764a03ac2ab86 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 29 Jul 2022 09:41:28 +0200
Subject: [PATCH] add documentation for custom token analysis

---
 docs/develop/ICU-Tokenizer-Modules.md      | 22 ++++++++++++++++++--
 nominatim/tokenizer/token_analysis/base.py | 24 +++++++++++++++++++++-
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/docs/develop/ICU-Tokenizer-Modules.md b/docs/develop/ICU-Tokenizer-Modules.md
index 0578026c..e4af65ed 100644
--- a/docs/develop/ICU-Tokenizer-Modules.md
+++ b/docs/develop/ICU-Tokenizer-Modules.md
@@ -52,7 +52,8 @@ the function.
 
 ### The sanitation function
 
-The sanitation function receives a single object with three members:
+The sanitation function receives a single object of type `ProcessInfo`
+which has three members:
 
 * `place`: read-only information about the place being processed.
   See PlaceInfo below.
@@ -62,7 +63,7 @@ The sanitation function receives a single object with three members:
   is a PlaceName object.
 
 While the `place` member is provided for information only, the `names` and
-`address` lists are meant to be manipulated by the sanitizer. If may add and
+`address` lists are meant to be manipulated by the sanitizer. It may add and
 remove entries, change information within a single entry (for example by
 adding extra attributes) or completely replace the list with a different one.
 
@@ -80,3 +81,20 @@ adding extra attributes) or completely replace the list with a different one.
     rendering:
       show_source: no
       heading_level: 6
+
+## Custom token analysis module
+
+Setup of a token analyser is split into two parts: configuration and
+analyser factory. A token analysis module must therefore implement two
+functions:
+
+::: nominatim.tokenizer.token_analysis.base.AnalysisModule
+    rendering:
+      show_source: no
+      heading_level: 6
+
+
+::: nominatim.tokenizer.token_analysis.base.Analyser
+    rendering:
+      show_source: no
+      heading_level: 6
diff --git a/nominatim/tokenizer/token_analysis/base.py b/nominatim/tokenizer/token_analysis/base.py
index b2a4386c..53264b94 100644
--- a/nominatim/tokenizer/token_analysis/base.py
+++ b/nominatim/tokenizer/token_analysis/base.py
@@ -12,7 +12,8 @@ from typing import Mapping, List, Any
 from nominatim.typing import Protocol
 
 class Analyser(Protocol):
-    """ Instance of the token analyser.
+    """ The `create()` function of an analysis module needs to return an
+        object that implements the following functions.
     """
 
     def normalize(self, name: str) -> str:
@@ -33,10 +34,31 @@ class AnalysisModule(Protocol):
         """ Prepare the configuration of the analysis module.
             This function should prepare all data that can be shared
             between instances of this analyser.
+
+            Arguments:
+                rules: A dictionary with the additional configuration options
+                       as specified in the tokenizer configuration.
+                normalization_rules: ICU rules for normalization as a string
+                                     that can be used with createFromRules().
+
+            Returns:
+                A data object with the configuration that was set up. May be
+                used freely by the analysis module as needed.
         """
 
     def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyser:
         """ Create a new instance of the analyser.
             A separate instance of the analyser is created for each thread
             when used in multi-threading context.
+
+            Arguments:
+                normalizer: an ICU Transliterator with the compiled
+                            normalization rules.
+                transliterator: an ICU transliterator with the compiled
+                                transliteration rules.
+                config: The object that was returned by the call to configure().
+
+            Returns:
+                A new analyser instance. This must be a class that implements
+                the Analyser protocol.
         """
-- 
2.39.5