X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/db3991af74f63828d5668cf45d1afc92e9ee07c2..abc911079ef233ae599a0fb485a40f6128c41c47:/src/nominatim_api/search/icu_tokenizer.py diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 6f1dcf79..04e781ca 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -50,15 +50,16 @@ PENALTY_IN_TOKEN_BREAK = { @dataclasses.dataclass class QueryPart: """ Normalized and transliterated form of a single term in the query. + When the term came out of a split during the transliteration, the normalized string is the full word before transliteration. - The word number keeps track of the word before transliteration - and can be used to identify partial transliterated terms. + Check the subsequent break type to figure out if the word is + continued. + Penalty is the break penalty for the break following the token. """ token: str normalized: str - word_number: int penalty: float @@ -244,7 +245,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + return cast(str, self.normalizer.transliterate(text)).strip('-: ') def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens. @@ -256,7 +257,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): parts: QueryParts = [] phrase_start = 0 words = defaultdict(list) - wordnr = 0 for phrase in query.source: query.nodes[-1].ptype = phrase.ptype phrase_split = re.split('([ :-])', phrase.text) @@ -271,12 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, wordnr, + parts.append(QueryPart(term, word, PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN])) query.add_node(qmod.BreakType.TOKEN, phrase.ptype) query.nodes[-1].btype = qmod.BreakType(breakchar) parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)] - wordnr += 1 for word, wrange in yield_words(parts, phrase_start): words[word].append(wrange) @@ -323,7 +322,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL): norm = parts[i].normalized for j in range(i + 1, tlist.end): - if parts[j - 1].word_number != parts[j].word_number: + if node.btype != qmod.BreakType.TOKEN: norm += ' ' + parts[j].normalized for token in tlist.tokens: cast(ICUToken, token).rematch(norm)