From: Sarah Hoffmann Date: Mon, 18 Mar 2024 13:37:25 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~15 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/6f68c2d805c3fdc903fa3d46093bd1376be3e636?hp=-c Merge remote-tracking branch 'upstream/master' --- 6f68c2d805c3fdc903fa3d46093bd1376be3e636 diff --combined nominatim/api/search/icu_tokenizer.py index 76a1a2e5,05ec7690..23cfa5a1 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@@ -97,6 -97,7 +97,7 @@@ class ICUToken(qmod.Token) """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) + addr_count = 1 if row.info is None else row.info.get('addr_count', 1) penalty = 0.0 if row.type == 'w': @@@ -123,7 -124,8 +124,8 @@@ return ICUToken(penalty=penalty, token=row.word_id, count=count, lookup_word=lookup_word, is_indexed=True, - word_token=row.word_token, info=row.info) + word_token=row.word_token, info=row.info, + addr_count=addr_count) @@@ -206,12 -208,7 +208,12 @@@ class ICUQueryAnalyzer(AbstractQueryAna standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: @@@ -262,7 -259,7 +264,7 @@@ if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, part.token, True, part.token, None)) + ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: