# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of query analysis for the legacy tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from copy import copy
from collections import defaultdict
import dataclasses

import sqlalchemy as sa

from nominatim.typing import SaRow
from nominatim.api.connection import SearchConnection
from nominatim.api.logging import log
from nominatim.api.search import query as qmod
from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer


def yield_words(terms: List[str], start: int) -> Iterator[Tuple[str, qmod.TokenRange]]:
    """ Return all combinations of words in the terms list after the
        given position.
    """
    total = len(terms)
    for first in range(start, total):
        word = terms[first]
        yield word, qmod.TokenRange(first, first + 1)
        # Combine up to 20 consecutive terms into multi-word lookup strings.
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last]))
            yield word, qmod.TokenRange(first, last + 1)
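# For illustration: yield_words(['new', 'york', 'city'], 0) yields
# ('new', TokenRange(0, 1)), ('new york', TokenRange(0, 2)),
# ('new york city', TokenRange(0, 3)), ('york', TokenRange(1, 2)),
# ('york city', TokenRange(1, 3)) and ('city', TokenRange(2, 3)).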


@dataclasses.dataclass
class LegacyToken(qmod.Token):
    """ Specialised token for the legacy tokenizer.
    """
    word_token: str
    category: Optional[Tuple[str, str]]
    country: Optional[str]
    operator: Optional[str]

    @property
    def info(self) -> Dict[str, Any]:
        """ Dictionary of additional properties of the token.
            Should only be used for debugging purposes.
        """
        return {'category': self.category,
                'country': self.country,
                'operator': self.operator}

    def get_category(self) -> Tuple[str, str]:
        assert self.category
        return self.category
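# The remaining token attributes used throughout this module (penalty, token,
# count, lookup_word, is_indexed) are inherited from the qmod.Token base class.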


class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
    """ Converter for query strings into a tokenized query
        using the tokens created by a legacy tokenizer.
    """

    def __init__(self, conn: SearchConnection) -> None:
        self.conn = conn

    async def setup(self) -> None:
        """ Set up static data structures needed for the analysis.
        """
        self.max_word_freq = int(await self.conn.get_property('tokenizer_maxwordfreq'))
        # The legacy word table is not part of the reflected standard tables,
        # so register it with the metadata on first use.
        if 'word' not in self.conn.t.meta.tables:
            sa.Table('word', self.conn.t.meta,
                     sa.Column('word_id', sa.Integer),
                     sa.Column('word_token', sa.Text, nullable=False),
                     sa.Column('word', sa.Text),
                     sa.Column('class', sa.Text),
                     sa.Column('type', sa.Text),
                     sa.Column('country_code', sa.Text),
                     sa.Column('search_name_count', sa.Integer),
                     sa.Column('operator', sa.Text))
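    # analyze_query() below drives the full pipeline: normalisation in SQL via
    # make_standard_name(), splitting into tokens, lookup in the word table,
    # synthesis of extra house-number tokens and a final reranking pass.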


    async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
        """ Analyze the given list of phrases and return the
            tokenized query.
        """
        log().section('Analyze query (using Legacy tokenizer)')

        normalized = []
        if phrases:
            for row in await self.conn.execute(sa.select(*(sa.func.make_standard_name(p.text)
                                                            for p in phrases))):
                normalized = [qmod.Phrase(p.ptype, r) for r, p in zip(row, phrases) if r]
                break

        query = qmod.QueryStruct(normalized)
        log().var_dump('Normalized query', query.source)
        if not query.source:
            return query

        parts, words = self.split_query(query)
        lookup_words = list(words.keys())
        log().var_dump('Split query', parts)
        log().var_dump('Extracted words', lookup_words)

        for row in await self.lookup_in_db(lookup_words):
            for trange in words[row.word_token.strip()]:
                token, ttype = self.make_token(row)
                if ttype == qmod.TokenType.CATEGORY:
                    if trange.start == 0:
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype == qmod.TokenType.QUALIFIER:
                    query.add_token(trange, qmod.TokenType.QUALIFIER, token)
                    if trange.start == 0 or trange.end == query.num_token_slots():
                        token = copy(token)
                        token.penalty += 0.1 * (query.num_token_slots())
                        query.add_token(trange, qmod.TokenType.CATEGORY, token)
                elif ttype != qmod.TokenType.PARTIAL or trange.start + 1 == trange.end:
                    query.add_token(trange, ttype, token)

        self.add_extra_tokens(query, parts)
        self.rerank_tokens(query)

        log().table_dump('Word tokens', _dump_word_tokens(query))

        return query


    def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str],
                                                             Dict[str, List[qmod.TokenRange]]]:
        """ Transliterate the phrases and split them into tokens.

            Returns a list of transliterated tokens and a dictionary
            of words for lookup together with their position.
        """
        parts: List[str] = []
        phrase_start = 0
        words = defaultdict(list)
        for phrase in query.source:
            query.nodes[-1].ptype = phrase.ptype
            for trans in phrase.text.split(' '):
                if trans:
                    for term in trans.split(' '):
                        if term:
                            parts.append(trans)
                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
                    query.nodes[-1].btype = qmod.BreakType.WORD
            query.nodes[-1].btype = qmod.BreakType.PHRASE
            for word, wrange in yield_words(parts, phrase_start):
                words[word].append(wrange)
            phrase_start = len(parts)
        query.nodes[-1].btype = qmod.BreakType.END

        return parts, words
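    # For illustration: for the normalized phrases 'new york' and 'ny', parts
    # becomes ['new', 'york', 'ny'] and words maps 'new', 'york', 'new york'
    # and 'ny' to their token ranges. Multi-word combinations never cross a
    # phrase boundary because yield_words() restarts at phrase_start.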


    async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
        """ Return the token information from the database for the
            given word tokens.
        """
        t = self.conn.t.meta.tables['word']

        sql = t.select().where(t.c.word_token.in_(words + [' ' + w for w in words]))

        return await self.conn.execute(sql)
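    # Note: the legacy word table stores full words with a leading space in
    # word_token and partial words without one, hence the lookup of both
    # variants above and the stripping of the first character in make_token().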


    def make_token(self, row: SaRow) -> Tuple[LegacyToken, qmod.TokenType]:
        """ Create a LegacyToken from the row of the word table.
            Also determines the type of token.
        """
        penalty = 0.0
        is_indexed = True

        rowclass = getattr(row, 'class')

        if row.country_code is not None:
            ttype = qmod.TokenType.COUNTRY
            lookup_word = row.country_code
        elif rowclass is not None:
            if rowclass == 'place' and row.type == 'house':
                ttype = qmod.TokenType.HOUSENUMBER
                lookup_word = row.word_token[1:]
            elif rowclass == 'place' and row.type == 'postcode':
                ttype = qmod.TokenType.POSTCODE
                lookup_word = row.word_token[1:]
            else:
                ttype = qmod.TokenType.CATEGORY if row.operator in ('in', 'near')\
                        else qmod.TokenType.QUALIFIER
                lookup_word = row.word
        elif row.word_token.startswith(' '):
            ttype = qmod.TokenType.WORD
            lookup_word = row.word or row.word_token[1:]
        else:
            ttype = qmod.TokenType.PARTIAL
            lookup_word = row.word_token

        # Words above the configured maximum frequency are not usable for
        # index lookups.
        if row.search_name_count > self.max_word_freq:
            is_indexed = False

        return LegacyToken(penalty=penalty, token=row.word_id,
                           count=row.search_name_count or 1,
                           lookup_word=lookup_word,
                           word_token=row.word_token.strip(),
                           category=(rowclass, row.type) if rowclass is not None else None,
                           country=row.country_code,
                           operator=row.operator,
                           is_indexed=is_indexed),\
               ttype
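    # add_extra_tokens() below synthesises house-number tokens for short,
    # purely numeric parts that did not receive one from the database lookup.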


    def add_extra_tokens(self, query: qmod.QueryStruct, parts: List[str]) -> None:
        """ Add tokens to query that are not saved in the database.
        """
        for part, node, i in zip(parts, query.nodes, range(1000)):
            if len(part) <= 4 and part.isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                LegacyToken(penalty=0.5, token=0, count=1,
                                            lookup_word=part, word_token=part,
                                            category=None, country=None,
                                            operator=None, is_indexed=True))
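    # rerank_tokens() below penalises alternative readings that compete with a
    # postcode or a numeric house-number token over the same token range.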


    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
        """ Add penalties to tokens that depend on the presence of other tokens.
        """
        for _, node, tlist in query.iter_token_lists():
            if tlist.ttype == qmod.TokenType.POSTCODE:
                for repl in node.starting:
                    if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
                       and (repl.ttype != qmod.TokenType.HOUSENUMBER
                            or len(tlist.tokens[0].lookup_word) > 4):
                        repl.add_penalty(0.39)
            elif tlist.ttype == qmod.TokenType.HOUSENUMBER:
                if any(c.isdigit() for c in tlist.tokens[0].lookup_word):
                    for repl in node.starting:
                        if repl.end == tlist.end and repl.ttype != qmod.TokenType.HOUSENUMBER \
                           and (repl.ttype != qmod.TokenType.HOUSENUMBER
                                or len(tlist.tokens[0].lookup_word) <= 3):
                            repl.add_penalty(0.5 - tlist.tokens[0].penalty)


def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
    for node in query.nodes:
        for tlist in node.starting:
            for token in tlist.tokens:
                t = cast(LegacyToken, token)
                yield [tlist.ttype.name, t.token, t.word_token or '',
                       t.lookup_word or '', t.penalty, t.count, t.info]


async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer:
    """ Create and set up a new query analyzer for a database based
        on the legacy tokenizer.
    """
    out = LegacyQueryAnalyzer(conn)
    await out.setup()

    return out
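

# Minimal usage sketch (for illustration only, not part of the module API);
# assumes an open SearchConnection `conn` and that qmod.PhraseType.NONE is the
# generic phrase type provided by nominatim.api.search.query:
#
#   analyzer = await create_query_analyzer(conn)
#   query = await analyzer.analyze_query(
#       [qmod.Phrase(qmod.PhraseType.NONE, 'new york')])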