X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/08dcd05d7bc09ad4e70ff47ff6f393a1b91ad21a..0a19cc18e5060e02eb76850cf428a4d9c48ec0c1:/nominatim/api/search/icu_tokenizer.py diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 9bd16e1d..ad08294e 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -153,7 +153,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ log().section('Analyze query (using ICU tokenizer)') normalized = list(filter(lambda p: p.text, - (qmod.Phrase(p.ptype, self.normalizer.transliterate(p.text)) + (qmod.Phrase(p.ptype, self.normalize_text(p.text)) for p in phrases))) query = qmod.QueryStruct(normalized) log().var_dump('Normalized query', query.source) @@ -187,6 +187,19 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): return query + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. That is the + standardized form search will work with. All information removed + at this stage is inevitably lost. + """ + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm + + def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens.