X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d856788bf5cf7b4b3e2b4de10f37cf3002ae6826..57598a048e9124b905572ed8dc4fa9465b5d38a6:/src/nominatim_api/search/icu_tokenizer.py?ds=inline diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 1aadc97e..c18dd8be 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -48,6 +48,7 @@ class QueryPart(NamedTuple): QueryParts = List[QueryPart] WordDict = Dict[str, List[qmod.TokenRange]] + def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]: """ Return all combinations of words in the terms list after the given position. @@ -72,7 +73,6 @@ class ICUToken(qmod.Token): assert self.info return self.info.get('class', ''), self.info.get('type', '') - def rematch(self, norm: str) -> None: """ Check how well the token matches the given normalized string and add a penalty, if necessary. @@ -91,7 +91,6 @@ class ICUToken(qmod.Token): distance += abs((ato-afrom) - (bto-bfrom)) self.penalty += (distance/len(self.lookup_word)) - @staticmethod def from_db_row(row: SaRow) -> 'ICUToken': """ Create a ICUToken from the row of the word table. @@ -128,16 +127,13 @@ class ICUToken(qmod.Token): addr_count=max(1, addr_count)) - class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ Converter for query strings into a tokenized query using the tokens created by a ICU tokenizer. """ - def __init__(self, conn: SearchConnection) -> None: self.conn = conn - async def setup(self) -> None: """ Set up static data structures needed for the analysis. """ @@ -163,7 +159,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): sa.Column('word', sa.Text), sa.Column('info', Json)) - async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct: """ Analyze the given list of phrases and return the tokenized query. @@ -202,14 +197,17 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): return query - def normalize_text(self, text: str) -> str: """ Bring the given text into a normalized form. That is the standardized form search will work with. All information removed at this stage is inevitably lost. """ - return cast(str, self.normalizer.transliterate(text)) + norm = cast(str, self.normalizer.transliterate(text)) + numspaces = norm.count(' ') + if numspaces > 4 and len(norm) <= (numspaces + 1) * 3: + return '' + return norm def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: """ Transliterate the phrases and split them into tokens. @@ -243,7 +241,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): return parts, words - async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]': """ Return the token information from the database for the given word tokens. @@ -251,7 +248,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): t = self.conn.t.meta.tables['word'] return await self.conn.execute(t.select().where(t.c.word_token.in_(words))) - def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: """ Add tokens to query that are not saved in the database. """ @@ -263,7 +259,6 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): count=1, addr_count=1, lookup_word=part.token, word_token=part.token, info=None)) - def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: """ Add penalties to tokens that depend on presence of other token. """ @@ -274,8 +269,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): and (repl.ttype != qmod.TokenType.HOUSENUMBER or len(tlist.tokens[0].lookup_word) > 4): repl.add_penalty(0.39) - elif tlist.ttype == qmod.TokenType.HOUSENUMBER \ - and len(tlist.tokens[0].lookup_word) <= 3: + elif (tlist.ttype == qmod.TokenType.HOUSENUMBER + and len(tlist.tokens[0].lookup_word) <= 3): if any(c.isdigit() for c in tlist.tokens[0].lookup_word): for repl in node.starting: if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER: