X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d984100e23d28253b3659e7b628cde2c8b436cf8..4577669213ea392fa7e25a2fce444f387763f4c8:/src/nominatim_api/search/icu_tokenizer.py diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 6f1dcf79..35621125 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -37,28 +37,29 @@ DB_TO_TOKEN_TYPE = { } PENALTY_IN_TOKEN_BREAK = { - qmod.BreakType.START: 0.5, - qmod.BreakType.END: 0.5, - qmod.BreakType.PHRASE: 0.5, - qmod.BreakType.SOFT_PHRASE: 0.5, - qmod.BreakType.WORD: 0.1, - qmod.BreakType.PART: 0.0, - qmod.BreakType.TOKEN: 0.0 + qmod.BREAK_START: 0.5, + qmod.BREAK_END: 0.5, + qmod.BREAK_PHRASE: 0.5, + qmod.BREAK_SOFT_PHRASE: 0.5, + qmod.BREAK_WORD: 0.1, + qmod.BREAK_PART: 0.0, + qmod.BREAK_TOKEN: 0.0 } @dataclasses.dataclass class QueryPart: """ Normalized and transliterated form of a single term in the query. + When the term came out of a split during the transliteration, the normalized string is the full word before transliteration. - The word number keeps track of the word before transliteration - and can be used to identify partial transliterated terms. + Check the subsequent break type to figure out if the word is + continued. + Penalty is the break penalty for the break following the token. """ token: str normalized: str - word_number: int penalty: float @@ -66,19 +67,20 @@ QueryParts = List[QueryPart] WordDict = Dict[str, List[qmod.TokenRange]] -def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]: - """ Return all combinations of words in the terms list after the - given position. +def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None: + """ Add all combinations of words in the terms list after the + given position to the word list. """ total = len(terms) + base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD] for first in range(start, total): word = terms[first].token - penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD] - yield word, qmod.TokenRange(first, first + 1, penalty=penalty) + penalty = base_penalty + words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty)) for last in range(first + 1, min(first + 20, total)): word = ' '.join((word, terms[last].token)) penalty += terms[last - 1].penalty - yield word, qmod.TokenRange(first, last + 1, penalty=penalty) + words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty)) @dataclasses.dataclass @@ -244,7 +246,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + return cast(str, self.normalizer.transliterate(text)).strip('-: ') def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens. @@ -255,8 +257,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ parts: QueryParts = [] phrase_start = 0 - words = defaultdict(list) - wordnr = 0 + words: WordDict = defaultdict(list) for phrase in query.source: query.nodes[-1].ptype = phrase.ptype phrase_split = re.split('([ :-])', phrase.text) @@ -271,18 +272,16 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, wordnr, - PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN])) - query.add_node(qmod.BreakType.TOKEN, phrase.ptype) - query.nodes[-1].btype = qmod.BreakType(breakchar) - parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)] - wordnr += 1 + parts.append(QueryPart(term, word, + PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN])) + query.add_node(qmod.BREAK_TOKEN, phrase.ptype) + query.nodes[-1].btype = breakchar + parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar] - for word, wrange in yield_words(parts, phrase_start): - words[word].append(wrange) + extract_words(parts, phrase_start, words) phrase_start = len(parts) - query.nodes[-1].btype = qmod.BreakType.END + query.nodes[-1].btype = qmod.BREAK_END return parts, words @@ -323,16 +322,16 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL): norm = parts[i].normalized for j in range(i + 1, tlist.end): - if parts[j - 1].word_number != parts[j].word_number: + if node.btype != qmod.BREAK_TOKEN: norm += ' ' + parts[j].normalized for token in tlist.tokens: cast(ICUToken, token).rematch(norm) def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str: - out = query.nodes[0].btype.value + out = query.nodes[0].btype for node, part in zip(query.nodes[1:], parts): - out += part.token + node.btype.value + out += part.token + node.btype return out