From 86ad9efa8abb1fb478b3be5b6c469877aad05a51 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 7 Jan 2025 21:32:32 +0100 Subject: [PATCH] keep break indicators [:-] during normalisation All punctuation will be converted to '-'. Soft breaks : may be added by preprocessors. The break signs are only used during query analysis and are ignored during import token analysis. --- settings/icu_tokenizer.yaml | 7 ++++--- src/nominatim_api/search/geocoder.py | 4 ++-- src/nominatim_db/tokenizer/icu_token_analysis.py | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 530df1a6..437319fa 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -9,16 +9,17 @@ normalization: - "'nº' > 'no'" - "ª > a" - "º > o" - - "[[:Punctuation:][:Symbol:]\u02bc] > ' '" + - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'" - "ß > 'ss'" # German szet is unambiguously equal to double ss - - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >" + - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >" - "[:Lm:] >" - ":: [[:Number:]] Latin ()" - ":: [[:Number:]] Ascii ();" - ":: [[:Number:]] NFD ();" - "[[:Nonspacing Mark:] [:Cf:]] >;" - - "[:Space:]+ > ' '" + - "[-:]?[:Space:]+[-:]? > ' '" transliteration: + - "[-:] > ' '" - ":: Latin ()" - !include icu-rules/extended-unicode-to-asccii.yaml - ":: Ascii ()" diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py index efe5b721..69455d77 100644 --- a/src/nominatim_api/search/geocoder.py +++ b/src/nominatim_api/search/geocoder.py @@ -133,7 +133,7 @@ class ForwardGeocoder: """ assert self.query_analyzer is not None qwords = [word for phrase in query.source - for word in re.split('[, ]+', phrase.text) if word] + for word in re.split('[-,: ]+', phrase.text) if word] if not qwords: return @@ -146,7 +146,7 @@ class ForwardGeocoder: distance = 0.0 norm = self.query_analyzer.normalize_text(' '.join((result.display_name, result.country_code or ''))) - words = set((w for w in norm.split(' ') if w)) + words = set((w for w in re.split('[-,: ]+', norm) if w)) if not words: continue for qword in qwords: diff --git a/src/nominatim_db/tokenizer/icu_token_analysis.py b/src/nominatim_db/tokenizer/icu_token_analysis.py index a3cdcb7a..c1ba106c 100644 --- a/src/nominatim_db/tokenizer/icu_token_analysis.py +++ b/src/nominatim_db/tokenizer/icu_token_analysis.py @@ -25,6 +25,8 @@ class ICUTokenAnalysis: def __init__(self, norm_rules: str, trans_rules: str, analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']): + # additional break signs are not relevant during name analysis + norm_rules += ";[[:Space:][-:]]+ > ' ';" self.normalizer = Transliterator.createFromRules("icu_normalization", norm_rules) trans_rules += ";[:Space:]+ > ' '" -- 2.39.5