X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4e4d29f653d4929f49536255314ec19264166ec6..657aae5f1bfebb99df5b9e6aa00c5b89269e4910:/nominatim/api/search/icu_tokenizer.py?ds=sidebyside diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py index 196fde2a..eb90c122 100644 --- a/nominatim/api/search/icu_tokenizer.py +++ b/nominatim/api/search/icu_tokenizer.py @@ -8,7 +8,6 @@ Implementation of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast -from copy import copy from collections import defaultdict import dataclasses import difflib @@ -22,6 +21,7 @@ from nominatim.api.connection import SearchConnection from nominatim.api.logging import log from nominatim.api.search import query as qmod from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer +from nominatim.db.sqlalchemy_types import Json DB_TO_TOKEN_TYPE = { @@ -97,6 +97,7 @@ class ICUToken(qmod.Token): """ Create a ICUToken from the row of the word table. """ count = 1 if row.info is None else row.info.get('count', 1) + addr_count = 1 if row.info is None else row.info.get('addr_count', 1) penalty = 0.0 if row.type == 'w': @@ -121,9 +122,10 @@ class ICUToken(qmod.Token): else: lookup_word = row.word_token - return ICUToken(penalty=penalty, token=row.word_id, count=count, + return ICUToken(penalty=penalty, token=row.word_id, count=max(1, count), lookup_word=lookup_word, is_indexed=True, - word_token=row.word_token, info=row.info) + word_token=row.word_token, info=row.info, + addr_count=max(1, addr_count)) @@ -159,7 +161,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): sa.Column('word_token', sa.Text, nullable=False), sa.Column('type', sa.Text, nullable=False), sa.Column('word', sa.Text), - sa.Column('info', self.conn.t.types.Json)) + sa.Column('info', Json)) async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct: @@ -184,13 +186,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if row.type == 'S': if row.info['op'] in ('in', 'near'): if trange.start == 0: - query.add_token(trange, qmod.TokenType.CATEGORY, token) + query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) else: - query.add_token(trange, qmod.TokenType.QUALIFIER, token) - if trange.start == 0 or trange.end == query.num_token_slots(): - token = copy(token) - token.penalty += 0.1 * (query.num_token_slots()) - query.add_token(trange, qmod.TokenType.CATEGORY, token) + if trange.start == 0 and trange.end == query.num_token_slots(): + query.add_token(trange, qmod.TokenType.NEAR_ITEM, token) + else: + query.add_token(trange, qmod.TokenType.QUALIFIER, token) else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) @@ -258,7 +259,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if len(part.token) <= 4 and part[0].isdigit()\ and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER): query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER, - ICUToken(0.5, 0, 1, part.token, True, part.token, None)) + ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None)) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: