From: Sarah Hoffmann Date: Sun, 7 Jan 2024 14:24:30 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~26 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/784ca928f61de93da84696a8cea12b89c52d9167?hp=-c Merge remote-tracking branch 'upstream/master' --- 784ca928f61de93da84696a8cea12b89c52d9167 diff --combined nominatim/api/search/icu_tokenizer.py index ff1c3fee,72e0f547..6f3e09e8 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@@ -8,7 -8,6 +8,6 @@@ Implementation of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast - from copy import copy from collections import defaultdict import dataclasses import difflib @@@ -188,10 -187,6 +187,6 @@@ class ICUQueryAnalyzer(AbstractQueryAna query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: query.add_token(trange, qmod.TokenType.QUALIFIER, token) - if trange.start == 0 or trange.end == query.num_token_slots(): - token = copy(token) - token.penalty += 0.1 * (query.num_token_slots()) - query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) @@@ -208,12 -203,7 +203,12 @@@ standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: