X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b56edf3d0ac38db742a92991180d626861c314a1..31412e06740727695c5d9512e0cd59c0dd683322:/src/nominatim_api/search/icu_tokenizer.py

diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index d4d0643f..1a449276 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -29,21 +29,21 @@ from .query_analyzer_factory import AbstractQueryAnalyzer
 
 
 DB_TO_TOKEN_TYPE = {
-    'W': qmod.TokenType.WORD,
-    'w': qmod.TokenType.PARTIAL,
-    'H': qmod.TokenType.HOUSENUMBER,
-    'P': qmod.TokenType.POSTCODE,
-    'C': qmod.TokenType.COUNTRY
+    'W': qmod.TOKEN_WORD,
+    'w': qmod.TOKEN_PARTIAL,
+    'H': qmod.TOKEN_HOUSENUMBER,
+    'P': qmod.TOKEN_POSTCODE,
+    'C': qmod.TOKEN_COUNTRY
 }
 
 PENALTY_IN_TOKEN_BREAK = {
-    qmod.BreakType.START: 0.5,
-    qmod.BreakType.END: 0.5,
-    qmod.BreakType.PHRASE: 0.5,
-    qmod.BreakType.SOFT_PHRASE: 0.5,
-    qmod.BreakType.WORD: 0.1,
-    qmod.BreakType.PART: 0.0,
-    qmod.BreakType.TOKEN: 0.0
+    qmod.BREAK_START: 0.5,
+    qmod.BREAK_END: 0.5,
+    qmod.BREAK_PHRASE: 0.5,
+    qmod.BREAK_SOFT_PHRASE: 0.5,
+    qmod.BREAK_WORD: 0.1,
+    qmod.BREAK_PART: 0.0,
+    qmod.BREAK_TOKEN: 0.0
 }
 
 
@@ -72,7 +72,7 @@ def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None:
         given position to the word list.
     """
     total = len(terms)
-    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
+    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
     for first in range(start, total):
         word = terms[first].token
         penalty = base_penalty
@@ -225,12 +225,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
-                            query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
+                            query.add_token(trange, qmod.TOKEN_NEAR_ITEM, token)
                     else:
                         if trange.start == 0 and trange.end == query.num_token_slots():
-                            query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
+                            query.add_token(trange, qmod.TOKEN_NEAR_ITEM, token)
                         else:
-                            query.add_token(trange, qmod.TokenType.QUALIFIER, token)
+                            query.add_token(trange, qmod.TOKEN_QUALIFIER, token)
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
@@ -273,15 +273,15 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     for term in trans.split(' '):
                         if term:
                             parts.append(QueryPart(term, word,
-                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
-                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType(breakchar)
-                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN]))
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype)
+                    query.nodes[-1].btype = breakchar
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar]
 
             extract_words(parts, phrase_start, words)
 
             phrase_start = len(parts)
-        query.nodes[-1].btype = qmod.BreakType.END
+        query.nodes[-1].btype = qmod.BREAK_END
 
         return parts, words
 
@@ -297,8 +297,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         """
         for part, node, i in zip(parts, query.nodes, range(1000)):
             if len(part.token) <= 4 and part.token.isdigit()\
-                    and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
-                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
+                    and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER):
+                query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER,
                                 ICUToken(penalty=0.5, token=0,
                                          count=1, addr_count=1, lookup_word=part.token,
                                          word_token=part.token, info=None))
@@ -307,31 +307,31 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         """ Add penalties to tokens that depend on presence of other token.
        """
         for i, node, tlist in query.iter_token_lists():
-            if tlist.ttype == qmod.TokenType.POSTCODE:
+            if tlist.ttype == qmod.TOKEN_POSTCODE:
                 for repl in node.starting:
-                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
-                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
+                    if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
+                       and (repl.ttype != qmod.TOKEN_HOUSENUMBER
                             or len(tlist.tokens[0].lookup_word) > 4):
                         repl.add_penalty(0.39)
-            elif (tlist.ttype == qmod.TokenType.HOUSENUMBER
+            elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
                   and len(tlist.tokens[0].lookup_word) <= 3):
                 if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                     for repl in node.starting:
-                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
+                        if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
                             repl.add_penalty(0.5 - tlist.tokens[0].penalty)
-            elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
+            elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
                 norm = parts[i].normalized
                 for j in range(i + 1, tlist.end):
-                    if node.btype != qmod.BreakType.TOKEN:
+                    if node.btype != qmod.BREAK_TOKEN:
                         norm += ' ' + parts[j].normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)
 
 
 def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
-    out = query.nodes[0].btype.value
+    out = query.nodes[0].btype
     for node, part in zip(query.nodes[1:], parts):
-        out += part.token + node.btype.value
+        out += part.token + node.btype
     return out
 
 
@@ -341,7 +341,7 @@ def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
         for tlist in node.starting:
             for token in tlist.tokens:
                 t = cast(ICUToken, token)
-                yield [tlist.ttype.name, t.token, t.word_token or '',
+                yield [tlist.ttype, t.token, t.word_token or '',
                        t.lookup_word or '', t.penalty, t.count, t.info]
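
Note on the change: this diff belongs to a refactoring that replaces the
qmod.TokenType and qmod.BreakType enums with plain string constants. That is
why the .value and .name accessors disappear from the dump helpers and why
split_query can store breakchar in node.btype directly instead of wrapping it
in qmod.BreakType(breakchar). The lookup PENALTY_IN_TOKEN_BREAK[breakchar]
only works if the break constants are the literal break characters
themselves. Below is a minimal sketch of the constant definitions in
nominatim_api/search/query.py that this code presumably relies on; the
single-character values are assumptions inferred from the DB_TO_TOKEN_TYPE
keys and the breakchar handling above, not copied from the commit:

    # Hypothetical sketch of nominatim_api/search/query.py after the
    # refactor. Values are inferred from this diff, not taken verbatim
    # from the commit.

    BreakType = str                  # break types are now plain strings
    BREAK_START = '<'                # assumed marker for the query start
    BREAK_END = '>'                  # assumed marker for the query end
    BREAK_PHRASE = ','               # hard break between phrases
    BREAK_SOFT_PHRASE = ':'          # soft break between phrases
    BREAK_WORD = ' '                 # break between words
    BREAK_PART = '-'                 # break between parts of a word
    BREAK_TOKEN = '`'                # assumed marker for breaks introduced
                                     # by transliteration

    TokenType = str                  # token types are now plain strings
    TOKEN_WORD = 'W'                 # matches the DB_TO_TOKEN_TYPE key 'W'
    TOKEN_PARTIAL = 'w'              # matches the DB_TO_TOKEN_TYPE key 'w'
    TOKEN_HOUSENUMBER = 'H'          # matches the DB_TO_TOKEN_TYPE key 'H'
    TOKEN_POSTCODE = 'P'             # matches the DB_TO_TOKEN_TYPE key 'P'
    TOKEN_COUNTRY = 'C'              # matches the DB_TO_TOKEN_TYPE key 'C'
    TOKEN_NEAR_ITEM = 'N'            # assumed; value not visible in this diff
    TOKEN_QUALIFIER = 'Q'            # assumed; value not visible in this diff

Because the constants are ordinary strings, equality checks such as
tlist.ttype == qmod.TOKEN_POSTCODE and the dictionary lookups in
PENALTY_IN_TOKEN_BREAK behave exactly as they did with the enums, while
_dump_transliterated can concatenate node.btype without an enum-to-string
conversion step.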