X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d3a575319feaf32009e0abe3efe4d8f2dc196e28..0278ab7f41bf9bc96a87084c04bfec263c6acc40:/nominatim/api/search/icu_tokenizer.py diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 72e0f547..f6590f5b 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -97,6 +97,7 @@ class ICUToken(qmod.Token): """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) + addr_count = 1 if row.info is None else row.info.get('addr_count', 1) penalty = 0.0 if row.type == 'w': @@ -121,9 +122,10 @@ class ICUToken(qmod.Token): else: lookup_word = row.word_token - return ICUToken(penalty=penalty, token=row.word_id, count=count, + return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count), lookup_word=lookup_word, is_indexed=True, - word_token=row.word_token, info=row.info) + word_token=row.word_token, info=row.info, + addr_count=max(1, addr_count)) @@ -186,7 +188,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trange.start == 0: query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: - query.add_token(trange, qmod.TokenType.QUALIFIER, token) + if trange.start == 0 and trange.end == query.num_token_slots(): + query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) + else: + query.add_token(trange, qmod.TokenType.QUALIFIER, token) else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) @@ -203,7 +208,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: @@ -254,7 +264,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, part.token, True, part.token, None)) + ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: