X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/af85ad390fcafbf6d73577b0d5d95c945d6d70b2..3fb739de9275680d4f482d363f22c535442f0017:/nominatim/api/search/icu_tokenizer.py diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index eabd329d..76a1a2e5 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -8,7 +8,6 @@ Implementation of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast -from copy import copy from collections import defaultdict import dataclasses import difflib @@ -187,11 +186,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trange.start == 0: query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: - query.add_token(trange, qmod.TokenType.QUALIFIER, token) - if trange.start == 0 or trange.end == query.num_token_slots(): - token = copy(token) - token.penalty += 0.1 * (query.num_token_slots()) + if trange.start == 0 and trange.end == query.num_token_slots(): query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) + else: + query.add_token(trange, qmod.TokenType.QUALIFIER, token) else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) @@ -208,7 +206,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: