1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses

import sqlalchemy as sa

from nominatim.typing import SaRow
from nominatim.api.connection import SearchConnection
from nominatim.api.logging import log
from nominatim.api.search import query as qmod
from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of consecutive words in the terms list
        that begin at or after the given start position.

        Each yielded item is the space-joined word sequence together with
        the token range it covers. A sequence contains at most 20 terms.
    """
    total = len(terms)
    for first in range(start, total):
        # Single-term word first, then extend it one term at a time.
        word = terms[first]
        yield word, qmod.TokenRange(first, first + 1)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last]))
            yield word, qmod.TokenRange(first, last + 1)
@dataclasses.dataclass
class LegacyToken(qmod.Token):
    """ Specialised token for legacy tokenizer.
    """
    # Token string from the word table with surrounding spaces stripped.
    word_token: str
    # (class, type) pair of the word table row, if the row has a class.
    category: Optional[Tuple[str, str]]
    # Country code of the word table row, if any.
    country: Optional[str]
    # Operator of the word table row, if any.
    operator: Optional[str]

    @property
    def info(self) -> Dict[str, Any]:
        """ Dictionary of additional properties of the token.
            Should only be used for debugging purposes.
        """
        return {'category': self.category,
                'country': self.country,
                'operator': self.operator}


    def get_category(self) -> Tuple[str, str]:
        """ Return the (class, type) pair of the token.

            Must only be called on tokens that have a category set.
        """
        # Narrows Optional[...] for the type checker; callers are expected
        # to ask only tokens of a category-bearing type.
        assert self.category
        return self.category
class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
    """ Converter for query strings into a tokenized query
        using the tokens created by a legacy tokenizer.
    """

    def __init__(self, conn: SearchConnection) -> None:
        self.conn = conn

    async def setup(self) -> None:
        """ Set up static data structures needed for the analysis.

            Reads the 'tokenizer_maxwordfreq' property from the database
            and registers the 'word' table with the connection's table
            metadata unless it is already known there.
        """
        self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
        if 'word' not in self.conn.t.meta.tables:
            sa.Table('word', self.conn.t.meta,
                     sa.Column('word_id', sa.Integer),
                     sa.Column('word_token', sa.Text, nullable=False),
                     sa.Column('word', sa.Text),
                     sa.Column('class', sa.Text),
                     sa.Column('type', sa.Text),
                     sa.Column('country_code', sa.Text),
                     sa.Column('search_name_count', sa.Integer),
                     sa.Column('operator', sa.Text))


    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
        """ Analyze the given list of phrases and return the
            tokenized query.

            The phrase texts are normalized with the SQL function
            make_standard_name(); phrases that normalize to an empty
            result are dropped. The remaining phrases are split into
            terms, all word combinations are looked up in the word table
            and the resulting tokens are added to the query.
        """
        log().section('Analyze query (using Legacy tokenizer)')

        normalized: List[qmod.Phrase] = []
        if phrases:
            # A single SELECT normalizes all phrases at once; only the
            # first result row is of interest.
            for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
                                                           for p in phrases))):
                normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
                break

        query = qmod.QueryStruct(normalized)
        log().var_dump('Normalized query', query.source)
        if not query.source:
            # Nothing left to look up.
            return query

        parts, words = self.split_query(query)
        lookup_words = list(words.keys())
        log().var_dump('Split query', parts)
        log().var_dump('Extracted words', lookup_words)

        for row in await self.lookup_in_db(lookup_words):
            for trange in words[row.word_token.strip()]:
                token, ttype = self.make_token(row)
                if ttype == qmod.TokenType.CATEGORY:
                    # Category tokens are only accepted at the very
                    # beginning of the query.
                    if trange.start == 0:
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype == qmod.TokenType.QUALIFIER:
                    query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                    if trange.start == 0 or trange.end == query.num_token_slots():
                        # At the start or end of the query the term is
                        # additionally offered as a category with an extra
                        # penalty. Work on a copy so that the penalty of
                        # the qualifier token added above stays untouched.
                        token = copy(token)
                        token.penalty += 0.1 * (query.num_token_slots())
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                    # Partial tokens are only added for single-term ranges.
                    query.add_token(trange, ttype, token)

        self.add_extra_tokens(query, parts)
        self.rerank_tokens(query)

        log().table_dump('Word tokens', _dump_word_tokens(query))

        return query


    def normalize_text(self, text: str) -> str:
        """ Bring the given text into a normalized form.

            This only removes case, so some difference with the normalization
            in the phrase remains.
        """
        return text.lower()


    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                            Dict[str, List[qmod.TokenRange]]]:
        """ Transliterate the phrases and split them into tokens.

            Returns a list of transliterated tokens and a dictionary
            of words for lookup together with their position.

            A node is added to the query for every term. The break type
            of the last node is set to WORD after each space-separated
            word, to PHRASE after each phrase and to END after the last
            phrase.
        """
        parts: List[str] = []
        phrase_start = 0
        words: Dict[str, List[qmod.TokenRange]] = defaultdict(list)
        for phrase in query.source:
            query.nodes[-1].ptype = phrase.ptype
            for trans in phrase.text.split(' '):
                # Skip empty strings caused by consecutive spaces, so the
                # break type of the previous node is not overwritten.
                if trans:
                    for term in trans.split(' '):
                        if term:
                            parts.append(term)
                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                    query.nodes[-1].btype = qmod.BreakType.WORD
            query.nodes[-1].btype = qmod.BreakType.PHRASE
            # Word combinations never cross a phrase boundary.
            for word, wrange in yield_words(parts, phrase_start):
                words[word].append(wrange)
            phrase_start = len(parts)
        query.nodes[-1].btype = qmod.BreakType.END

        return parts, words


    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
        """ Return the token information from the database for the
            given word tokens.

            Every word is looked up as given and with a leading space.
        """
        t = self.conn.t.meta.tables['word']

        sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))

        return await self.conn.execute(sql)


    def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
        """ Create a LegacyToken from the row of the word table.
            Also determines the type of token.

            Returns the new token together with its token type.
        """
        penalty = 0.0
        is_indexed = True

        # 'class' is a Python keyword, so the column needs getattr().
        rowclass = getattr(row, 'class')

        if row.country_code is not None:
            ttype = qmod.TokenType.COUNTRY
            lookup_word = row.country_code
        elif rowclass is not None:
            if rowclass == 'place' and row.type == 'house':
                ttype = qmod.TokenType.HOUSENUMBER
                lookup_word = row.word_token[1:]
            elif rowclass == 'place' and row.type == 'postcode':
                ttype = qmod.TokenType.POSTCODE
                lookup_word = row.word_token[1:]
            else:
                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
                        else qmod.TokenType.QUALIFIER
                lookup_word = row.word
        elif row.word_token.startswith(' '):
            ttype = qmod.TokenType.WORD
            lookup_word = row.word or row.word_token[1:]
        else:
            ttype = qmod.TokenType.PARTIAL
            lookup_word = row.word_token
            # NOTE(review): penalty value for partial terms restored as
            # 0.21 - confirm against the upstream implementation.
            penalty = 0.21
            # search_name_count may be NULL (see the 'or 1' below); treat
            # that as 0 instead of comparing None with an int.
            if (row.search_name_count or 0) > self.max_word_freq:
                is_indexed = False

        return LegacyToken(penalty=penalty, token=row.word_id,
                           count=row.search_name_count or 1,
                           lookup_word=lookup_word,
                           word_token=row.word_token.strip(),
                           category=(rowclass, row.type) if rowclass is not None else None,
                           country=row.country_code,
                           operator=row.operator,
                           is_indexed=is_indexed),\
               ttype


    def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
        """ Add tokens to query that are not saved in the database.

            Every term that consists of at most four digits gets a
            housenumber token with a penalty of 0.5, unless the node
            already has a housenumber token for that single term.
        """
        for i, (part, node) in enumerate(zip(parts, query.nodes)):
            if len(part) <= 4 and part.isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                LegacyToken(penalty=0.5, token=0, count=1,
                                            lookup_word=part, word_token=part,
                                            category=None, country=None,
                                            operator=None, is_indexed=True))


    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
        """ Add penalties to tokens that depend on presence of other token.

            Token lists that cover the same range as a postcode get a
            penalty of 0.39, except for other postcodes and for
            housenumbers when the postcode's lookup word has at most four
            characters. Token lists that cover the same range as a
            housenumber of at most three characters containing a digit
            get a penalty of 0.5 minus the housenumber's own penalty,
            unless they are housenumbers themselves.
        """
        for _, node, tlist in query.iter_token_lists():
            if tlist.ttype == qmod.TokenType.POSTCODE:
                for repl in node.starting:
                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
                            or len(tlist.tokens[0].lookup_word) > 4):
                        repl.add_penalty(0.39)
            elif tlist.ttype == qmod.TokenType.HOUSENUMBER \
                 and len(tlist.tokens[0].lookup_word) <= 3:
                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                    for repl in node.starting:
                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER:
                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    """ Yield the rows of the debug table for the tokens of the query.

        The first row is the table header. It is followed by one row
        for each token of each token list starting at any query node.
    """
    header = ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    yield header

    token_lists = (token_list for query_node in query.nodes
                   for token_list in query_node.starting)
    for token_list in token_lists:
        for raw_token in token_list.tokens:
            legacy = cast(LegacyToken, raw_token)
            word_token = legacy.word_token or ''
            lookup_word = legacy.lookup_word or ''
            yield [token_list.ttype.name, legacy.token, word_token,
                   lookup_word, legacy.penalty, legacy.count, legacy.info]
async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create and set up a new query analyzer for a database based
        on the legacy tokenizer.
    """
    out = LegacyQueryAnalyzer(conn)
    # The analyzer is only usable once its static data has been loaded.
    await out.setup()

    return out