X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/bd2c64876f7ddc99da14ea78a652f797e17134f4..bc7adbae2bc8ebc61bca3800155d070908502dd9:/nominatim/api/search/legacy_tokenizer.py?ds=sidebyside

diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py
index 96975704..031f2456 100644
--- a/nominatim/api/search/legacy_tokenizer.py
+++ b/nominatim/api/search/legacy_tokenizer.py
@@ -44,7 +44,7 @@ class LegacyToken(qmod.Token):
 
     @property
     def info(self) -> Dict[str, Any]:
-        """ Dictionary of additional propoerties of the token.
+        """ Dictionary of additional properties of the token.
             Should only be used for debugging purposes.
         """
         return {'category': self.category,
@@ -107,15 +107,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
         for row in await self.lookup_in_db(lookup_words):
             for trange in words[row.word_token.strip()]:
                 token, ttype = self.make_token(row)
-                if ttype == qmod.TokenType.CATEGORY:
+                if ttype == qmod.TokenType.NEAR_ITEM:
                     if trange.start == 0:
-                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                 elif ttype == qmod.TokenType.QUALIFIER:
                     query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                     if trange.start == 0 or trange.end == query.num_token_slots():
                         token = copy(token)
                         token.penalty += 0.1 * (query.num_token_slots())
-                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
+                        query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
                 elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                     query.add_token(trange, ttype, token)
 
@@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
 
         return query
 
+    def normalize_text(self, text: str) -> str:
+        """ Bring the given text into a normalized form.
+
+            This only removes case, so some difference with the normalization
+            in the phrase remains.
+        """
+        return text.lower()
+
+
     def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                             Dict[str, List[qmod.TokenRange]]]:
         """ Transliterate the phrases and split them into tokens.
@@ -186,7 +195,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
                 ttype = qmod.TokenType.POSTCODE
                 lookup_word = row.word_token[1:]
             else:
-                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
+                ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
                         else qmod.TokenType.QUALIFIER
                 lookup_word = row.word
         elif row.word_token.startswith(' '):
@@ -200,7 +209,8 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
             is_indexed = False
 
         return LegacyToken(penalty=penalty, token=row.word_id,
-                           count=row.search_name_count or 1,
+                           count=max(1, row.search_name_count or 1),
+                           addr_count=1, # not supported
                            lookup_word=lookup_word,
                            word_token=row.word_token.strip(),
                            category=(rowclass, row.type) if rowclass is not None else None,
@@ -217,7 +227,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
             if len(part) <= 4 and part.isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
-                                LegacyToken(penalty=0.5, token=0, count=1,
+                                LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                             lookup_word=part, word_token=part,
                                             category=None, country=None,
                                             operator=None, is_indexed=True))
@@ -233,12 +243,11 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
                            and (repl.ttype != qmod.TokenType.HOUSENUMBER
                                 or len(tlist.tokens[0].lookup_word) > 4):
                         repl.add_penalty(0.39)
-            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
+            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
+                 and len(tlist.tokens[0].lookup_word) <= 3:
                 if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                     for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
-                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
-                                or len(tlist.tokens[0].lookup_word) <= 3):
+                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                             repl.add_penalty(0.5 - tlist.tokens[0].penalty)
 
 
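A minimal standalone sketch of the classification rule behind the CATEGORY -> NEAR_ITEM hunks above. TokenType and classify_special here are hypothetical stand-ins for illustration, not Nominatim's real qmod API; the real analyzer additionally accepts a NEAR_ITEM token only at the start of the query and re-adds qualifiers at either end of the query as penalised NEAR_ITEM copies.

from enum import Enum, auto

class TokenType(Enum):
    # NEAR_ITEM is the new name for what the old code called CATEGORY: a
    # "what" term (e.g. 'restaurant') to search for near the rest of the
    # query rather than to match against a name.
    NEAR_ITEM = auto()
    QUALIFIER = auto()

def classify_special(operator: str) -> TokenType:
    # Same rule as in make_token() above: the 'in' and 'near' operators mark
    # a term as something to find near a place; any other operator merely
    # qualifies the results.
    return TokenType.NEAR_ITEM if operator in ('in', 'near') else TokenType.QUALIFIER

assert classify_special('near') is TokenType.NEAR_ITEM
assert classify_special('name') is TokenType.QUALIFIER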
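The last hunk changes behaviour, not just shape: in the removed condition, the parenthesised clause repeated the repl.ttype test, so the length restriction on lookup_word could never take effect; the rewrite hoists it into the elif, where it gates the whole penalty branch. A quick truth-table check of that simplification (plain Python, independent of any Nominatim code):

from itertools import product

# The removed inner condition had the shape A and B and (B or C), with
#   A: repl.end == tlist.end
#   B: repl.ttype != HOUSENUMBER
#   C: len(tlist.tokens[0].lookup_word) <= 3
# B and (B or C) reduces to B, so the length test C was dead code there.
for a, b, c in product((False, True), repeat=3):
    assert (a and b and (b or c)) == (a and b)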