X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/499110f549cc2c369ea3a0fb55a79a11bdc0352f..abc911079ef233ae599a0fb485a40f6128c41c47:/src/nominatim_api/search/icu_tokenizer.py?ds=inline diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index d52614fd..04e781ca 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -7,7 +7,7 @@ """ Implementation of query analysis for the ICU tokenizer. """ -from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast +from typing import Tuple, Dict, List, Optional, Iterator, Any, cast from collections import defaultdict import dataclasses import difflib @@ -36,17 +36,31 @@ DB_TO_TOKEN_TYPE = { 'C': qmod.TokenType.COUNTRY } +PENALTY_IN_TOKEN_BREAK = { + qmod.BreakType.START: 0.5, + qmod.BreakType.END: 0.5, + qmod.BreakType.PHRASE: 0.5, + qmod.BreakType.SOFT_PHRASE: 0.5, + qmod.BreakType.WORD: 0.1, + qmod.BreakType.PART: 0.0, + qmod.BreakType.TOKEN: 0.0 +} + -class QueryPart(NamedTuple): +@dataclasses.dataclass +class QueryPart: """ Normalized and transliterated form of a single term in the query. + When the term came out of a split during the transliteration, the normalized string is the full word before transliteration. - The word number keeps track of the word before transliteration - and can be used to identify partial transliterated terms. + Check the subsequent break type to figure out if the word is + continued. + + Penalty is the break penalty for the break following the token. """ token: str normalized: str - word_number: int + penalty: float QueryParts = List[QueryPart] @@ -60,10 +74,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod. total = len(terms) for first in range(start, total): word = terms[first].token - yield word, qmod.TokenRange(first, first + 1) + penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD] + yield word, qmod.TokenRange(first, first + 1, penalty=penalty) for last in range(first + 1, min(first + 20, total)): word = ' '.join((word, terms[last].token)) - yield word, qmod.TokenRange(first, last + 1) + penalty += terms[last - 1].penalty + yield word, qmod.TokenRange(first, last + 1, penalty=penalty) @dataclasses.dataclass @@ -96,25 +112,25 @@ class ICUToken(qmod.Token): self.penalty += (distance/len(self.lookup_word)) @staticmethod - def from_db_row(row: SaRow) -> 'ICUToken': + def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken': """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) addr_count = 1 if row.info is None else row.info.get('addr_count', 1) - penalty = 0.0 + penalty = base_penalty if row.type == 'w': - penalty = 0.3 + penalty += 0.3 elif row.type == 'W': if len(row.word_token) == 1 and row.word_token == row.word: - penalty = 0.2 if row.word.isdigit() else 0.3 + penalty += 0.2 if row.word.isdigit() else 0.3 elif row.type == 'H': - penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit()) + penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit()) if all(not c.isdigit() for c in row.word_token): penalty += 0.2 * (len(row.word_token) - 1) elif row.type == 'C': if len(row.word_token) == 1: - penalty = 0.3 + penalty += 0.3 if row.info is None: lookup_word = row.word @@ -204,7 +220,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): for row in await self.lookup_in_db(list(words.keys())): for trange in words[row.word_token]: - token = ICUToken.from_db_row(row) + token = ICUToken.from_db_row(row, trange.penalty or 0.0) if row.type == 'S': if row.info['op'] in ('in', 'near'): if trange.start == 0: @@ -229,7 +245,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + return cast(str, self.normalizer.transliterate(text)).strip('-: ') def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens. @@ -241,7 +257,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): parts: QueryParts = [] phrase_start = 0 words = defaultdict(list) - wordnr = 0 for phrase in query.source: query.nodes[-1].ptype = phrase.ptype phrase_split = re.split('([ :-])', phrase.text) @@ -256,10 +271,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, wordnr)) + parts.append(QueryPart(term, word, + PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN])) query.add_node(qmod.BreakType.TOKEN, phrase.ptype) query.nodes[-1].btype = qmod.BreakType(breakchar) - wordnr += 1 + parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)] for word, wrange in yield_words(parts, phrase_start): words[word].append(wrange) @@ -280,7 +296,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ Add tokens to query that are not saved in the database. """ for part, node, i in zip(parts, query.nodes, range(1000)): - if len(part.token) <= 4 and part[0].isdigit()\ + if len(part.token) <= 4 and part.token.isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, ICUToken(penalty=0.5, token=0, @@ -306,7 +322,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL): norm = parts[i].normalized for j in range(i + 1, tlist.end): - if parts[j - 1].word_number != parts[j].word_number: + if node.btype != qmod.BreakType.TOKEN: norm += ' ' + parts[j].normalized for token in tlist.tokens: cast(ICUToken, token).rematch(norm)