X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/6e89310a9285f1ad15d8002bf68f578eada367a0..0cf636a80ce668e5c2a0d0000208f63a33315071:/src/nominatim_db/tokenizer/icu_token_analysis.py diff --git a/src/nominatim_db/tokenizer/icu_token_analysis.py b/src/nominatim_db/tokenizer/icu_token_analysis.py index fe6704d4..c1ba106c 100644 --- a/src/nominatim_db/tokenizer/icu_token_analysis.py +++ b/src/nominatim_db/tokenizer/icu_token_analysis.py @@ -14,8 +14,9 @@ from icu import Transliterator from .token_analysis.base import Analyzer if TYPE_CHECKING: - from typing import Any - from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import + from typing import Any # noqa + from .icu_rule_loader import TokenAnalyzerRule + class ICUTokenAnalysis: """ Container class collecting the transliterators and token analysis @@ -24,6 +25,8 @@ class ICUTokenAnalysis: def __init__(self, norm_rules: str, trans_rules: str, analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']): + # additional break signs are not relevant during name analysis + norm_rules += ";[[:Space:][-:]]+ > ' ';" self.normalizer = Transliterator.createFromRules("icu_normalization", norm_rules) trans_rules += ";[:Space:]+ > ' '" @@ -35,7 +38,6 @@ class ICUTokenAnalysis: self.analysis = {name: arules.create(self.normalizer, self.to_ascii) for name, arules in analysis_rules.items()} - def get_analyzer(self, name: Optional[str]) -> Analyzer: """ Return the given named analyzer. If no analyzer with that name exists, return the default analyzer.