X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/004883bdb1cfdfea053cb59fe32792c4e368e88c..7d911f9ffbdf63b2b2a45c3a3ee7063d006a5779:/nominatim/api/search/icu_tokenizer.py diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 14698a28..f259995d 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -21,10 +21,7 @@ from nominatim.typing import SaRow from nominatim.api.connection import SearchConnection from nominatim.api.logging import log from nominatim.api.search import query as qmod - -# XXX: TODO -class AbstractQueryAnalyzer: - pass +from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer DB_TO_TOKEN_TYPE = { @@ -156,7 +153,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ log().section('Analyze query (using ICU tokenizer)') normalized = list(filter(lambda p: p.text, - (qmod.Phrase(p.ptype, self.normalizer.transliterate(p.text)) + (qmod.Phrase(p.ptype, self.normalize_text(p.text)) for p in phrases))) query = qmod.QueryStruct(normalized) log().var_dump('Normalized query', query.source) @@ -190,6 +187,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): return query + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. That is the + standardized form search will work with. All information removed + at this stage is inevitably lost. + """ + return cast(str, self.normalizer.transliterate(text)) + + def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens. @@ -251,12 +256,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): and (repl.ttype != qmod.TokenType.HOUSENUMBER or len(tlist.tokens[0].lookup_word) > 4): repl.add_penalty(0.39) - elif tlist.ttype == qmod.TokenType.HOUSENUMBER: + elif tlist.ttype == qmod.TokenType.HOUSENUMBER \ + and len(tlist.tokens[0].lookup_word) <= 3: if any(c.isdigit() for c in tlist.tokens[0].lookup_word): for repl in node.starting: - if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \ - and (repl.ttype != qmod.TokenType.HOUSENUMBER - or len(tlist.tokens[0].lookup_word) <= 3): + if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER: repl.add_penalty(0.5 - tlist.tokens[0].penalty) elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL): norm = parts[i].normalized