1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Implementation of query analysis for the legacy tokenizer.
10 from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
12 from collections import defaultdict
15 import sqlalchemy as sa
17 from ..typing import SaRow
18 from ..connection import SearchConnection
19 from ..logging import log
20 from . import query as qmod
21 from .query_analyzer_factory import AbstractQueryAnalyzer
def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of consecutive words in the terms list
        that begin at or after the given position.

        Each item is a tuple of the space-joined word and the token range
        `(first, last + 1)` it was built from. A combination never
        spans more than 20 consecutive terms.
    """
    total = len(terms)
    for first in range(start, total):
        # Start with the single term, then grow the word one term at a time.
        word = terms[first]
        yield word, qmod.TokenRange(first, first + 1)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last]))
            yield word, qmod.TokenRange(first, last + 1)
@dataclasses.dataclass
class LegacyToken(qmod.Token):
    """ Specialised token for legacy tokenizer.
    """
    # Token string the word was found under (callers in this module pass
    # it with surrounding whitespace stripped).
    word_token: str
    # (class, type) pair of the word row, None when the row has no class.
    category: Optional[Tuple[str, str]]
    # Country code of the word row, if any.
    country: Optional[str]
    # Operator of the word row, if any.
    operator: Optional[str]

    @property
    def info(self) -> Dict[str, Any]:
        """ Dictionary of additional properties of the token.
            Should only be used for debugging purposes.
        """
        return {'category': self.category,
                'country': self.country,
                'operator': self.operator}

    def get_category(self) -> Tuple[str, str]:
        """ Return the (class, type) pair of the token.

            Must only be called on tokens that have a category set.
        """
        # Internal invariant; also narrows the Optional for type checkers.
        assert self.category
        return self.category
60 class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
61 """ Converter for query strings into a tokenized query
62 using the tokens created by a legacy tokenizer.
def __init__(self, conn: SearchConnection) -> None:
    """ Create an analyzer that runs all its database lookups on
        the given connection.
    """
    self.conn = conn
async def setup(self) -> None:
    """ Load the static data the analysis depends on: the maximum word
        frequency property and the definition of the `word` table.
    """
    raw_max_freq = await self.conn.get_property('tokenizer_maxwordfreq')
    self.max_word_freq = int(raw_max_freq)

    if 'word' in self.conn.t.meta.tables:
        # Table is already known to the metadata, nothing left to do.
        return

    word_columns = (
        sa.Column('word_id', sa.Integer),
        sa.Column('word_token', sa.Text, nullable=False),
        sa.Column('word', sa.Text),
        sa.Column('class', sa.Text),
        sa.Column('type', sa.Text),
        sa.Column('country_code', sa.Text),
        sa.Column('search_name_count', sa.Integer),
        sa.Column('operator', sa.Text),
    )
    # The table object is created against the connection's metadata;
    # the return value is not needed here.
    sa.Table('word', self.conn.t.meta, *word_columns)
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
    """ Analyze the given list of phrases and return the
        tokenized query.

        The phrases are first normalized in the database with
        `make_standard_name()`; phrases that normalize to an empty
        value are dropped. The remaining text is split into words,
        the words are looked up in the word table and the resulting
        tokens are attached to the query.
    """
    from copy import copy

    log().section('Analyze query (using Legacy tokenizer)')

    normalized: List[qmod.Phrase] = []
    if phrases:
        sql = sa.select(*(sa.func.make_standard_name(p.text) for p in phrases))
        for row in await self.conn.execute(sql):
            normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
            # Only the first result row is used.
            break

    query = qmod.QueryStruct(normalized)
    log().var_dump('Normalized query', query.source)
    if not query.source:
        # Nothing left to tokenize.
        return query

    parts, words = self.split_query(query)
    lookup_words = list(words.keys())
    log().var_dump('Split query', parts)
    log().var_dump('Extracted words', lookup_words)

    for row in await self.lookup_in_db(lookup_words):
        for trange in words[row.word_token.strip()]:
            token, ttype = self.make_token(row)
            if ttype == qmod.TokenType.NEAR_ITEM:
                # Near items are only added at the very start of the query.
                if trange.start == 0:
                    query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
            elif ttype == qmod.TokenType.QUALIFIER:
                query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                if trange.start == 0 or trange.end == query.num_token_slots():
                    # At either end of the query the word is also added as
                    # a near item. Work on a copy so that the extra penalty
                    # does not leak into the qualifier token added above.
                    token = copy(token)
                    token.penalty += 0.1 * (query.num_token_slots())
                    query.add_token(trange, qmod.TokenType.NEAR_ITEM, token)
            elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                # Partial tokens are only added for single-term ranges.
                query.add_token(trange, ttype, token)

    self.add_extra_tokens(query, parts)
    self.rerank_tokens(query)

    log().table_dump('Word tokens', _dump_word_tokens(query))

    return query
def normalize_text(self, text: str) -> str:
    """ Bring the given text into a normalized form.

        This only removes case, so some difference with the normalization
        in the phrase remains.
    """
    return text.lower()
def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                        Dict[str, List[qmod.TokenRange]]]:
    """ Split the phrases of the query into tokens and add a node to
        the query for each of them.

        Returns a list of the tokens and a dictionary of words for
        lookup together with their position.
    """
    parts: List[str] = []
    phrase_start = 0
    words: Dict[str, List[qmod.TokenRange]] = defaultdict(list)
    for phrase in query.source:
        query.nodes[-1].ptype = phrase.ptype
        for trans in phrase.text.split(' '):
            # Consecutive spaces produce empty strings; skip them.
            if trans:
                for term in trans.split(' '):
                    if term:
                        parts.append(term)
                        query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                # The last node of each word is a word break.
                query.nodes[-1].btype = qmod.BreakType.WORD
        # The last node of each phrase is a phrase break.
        query.nodes[-1].btype = qmod.BreakType.PHRASE
        # Word combinations never reach across a phrase boundary.
        for word, wrange in yield_words(parts, phrase_start):
            words[word].append(wrange)
        phrase_start = len(parts)
    query.nodes[-1].btype = qmod.BreakType.END

    return parts, words
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
    """ Fetch the rows of the word table that match the given words.

        Every word is looked up twice: as given and with a single
        leading space prepended.
    """
    word_table = self.conn.t.meta.tables['word']

    space_prefixed = [' ' + w for w in words]
    candidates = words + space_prefixed
    statement = word_table.select().where(word_table.c.word_token.in_(candidates))

    return await self.conn.execute(statement)
178 def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
179 """ Create a LegacyToken from the row of the word table.
180 Also determines the type of token.
185 rowclass = getattr(row, 'class')
187 if row.country_code is not None:
188 ttype = qmod.TokenType.COUNTRY
189 lookup_word = row.country_code
190 elif rowclass is not None:
191 if rowclass == 'place' and row.type == 'house':
192 ttype = qmod.TokenType.HOUSENUMBER
193 lookup_word = row.word_token[1:]
194 elif rowclass == 'place' and row.type == 'postcode':
195 ttype = qmod.TokenType.POSTCODE
196 lookup_word = row.word_token[1:]
198 ttype = qmod.TokenType.NEAR_ITEM if row.operator in ('in', 'near')\
199 else qmod.TokenType.QUALIFIER
200 lookup_word = row.word
201 elif row.word_token.startswith(' '):
202 ttype = qmod.TokenType.WORD
203 lookup_word = row.word or row.word_token[1:]
205 ttype = qmod.TokenType.PARTIAL
206 lookup_word = row.word_token
208 if row.search_name_count > self.max_word_freq:
211 return LegacyToken(penalty=penalty, token=row.word_id,
212 count=max(1, row.search_name_count or 1),
213 addr_count=1, # not supported
214 lookup_word=lookup_word,
215 word_token=row.word_token.strip(),
216 category=(rowclass, row.type) if rowclass is not None else None,
217 country=row.country_code,
218 operator=row.operator,
219 is_indexed=is_indexed),\
def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
    """ Add tokens to query that are not saved in the database.

        Every part made up of at most four digits gets a house number
        token with a penalty of 0.5, unless the node already has a
        house number token ending at the following position.
    """
    # enumerate() supplies the position without an artificial upper bound.
    for i, (part, node) in enumerate(zip(parts, query.nodes)):
        if len(part) <= 4 and part.isdigit()\
           and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
            query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                            LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
                                        lookup_word=part, word_token=part,
                                        category=None, country=None,
                                        operator=None, is_indexed=True))
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
    """ Penalize token lists that compete with a postcode or a short
        house number covering exactly the same range.
    """
    for _, node, tlist in query.iter_token_lists():
        if tlist.ttype == qmod.TokenType.POSTCODE:
            for other in node.starting:
                # Only non-postcode lists over the same range are affected.
                if other.end != tlist.end or other.ttype == qmod.TokenType.POSTCODE:
                    continue
                # House numbers are spared unless the postcode is longer
                # than four characters.
                if other.ttype != qmod.TokenType.HOUSENUMBER \
                   or len(tlist.tokens[0].lookup_word) > 4:
                    other.add_penalty(0.39)
        elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
                and len(tlist.tokens[0].lookup_word) <= 3 \
                and any(c.isdigit() for c in tlist.tokens[0].lookup_word):
            # Short house number containing a digit: penalize all other
            # interpretations of the same range.
            for other in node.starting:
                if other.end == tlist.end and other.ttype != qmod.TokenType.HOUSENUMBER:
                    other.add_penalty(0.5 - tlist.tokens[0].penalty)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    """ Yield the rows of a debug table listing every token of the query.

        The first row is the table header, followed by one row per token.
    """
    header = ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    yield header

    token_lists = (tl for node in query.nodes for tl in node.starting)
    for token_list in token_lists:
        for raw_token in token_list.tokens:
            legacy = cast(LegacyToken, raw_token)
            word_token = legacy.word_token or ''
            lookup_word = legacy.lookup_word or ''
            yield [token_list.ttype.name, legacy.token, word_token,
                   lookup_word, legacy.penalty, legacy.count, legacy.info]
265 async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
266 """ Create and set up a new query analyzer for a database based
267 on the legacy tokenizer.
269 out = LegacyQueryAnalyzer(conn)