1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Convertion from token assignment to an abstract DB search.
10 from typing import Optional, List, Tuple, Iterator, Dict
13 from nominatim.api.types import SearchDetails, DataLayer
14 from nominatim.api.search.query import QueryStruct, Token, TokenType, TokenRange, BreakType
15 from nominatim.api.search.token_assignment import TokenAssignment
16 import nominatim.api.search.db_search_fields as dbf
17 import nominatim.api.search.db_searches as dbs
20 def wrap_near_search(categories: List[Tuple[str, str]],
21 search: dbs.AbstractSearch) -> dbs.NearSearch:
22 """ Create a new search that wraps the given search in a search
23 for near places of the given category.
25 return dbs.NearSearch(penalty=search.penalty,
26 categories=dbf.WeightedCategories(categories,
27 [0.0] * len(categories)),
31 def build_poi_search(category: List[Tuple[str, str]],
32 countries: Optional[List[str]]) -> dbs.PoiSearch:
33 """ Create a new search for places by the given category, possibly
34 constraint to the given countries.
37 ccs = dbf.WeightedStrings(countries, [0.0] * len(countries))
39 ccs = dbf.WeightedStrings([], [])
41 class _PoiData(dbf.SearchData):
43 qualifiers = dbf.WeightedCategories(category, [0.0] * len(category))
46 return dbs.PoiSearch(_PoiData())
50 """ Build the abstract search queries from token assignments.
53 def __init__(self, query: QueryStruct, details: SearchDetails) -> None:
55 self.details = details
59 def configured_for_country(self) -> bool:
60 """ Return true if the search details are configured to
61 allow countries in the result.
63 return self.details.min_rank <= 4 and self.details.max_rank >= 4 \
64 and self.details.layer_enabled(DataLayer.ADDRESS)
68 def configured_for_postcode(self) -> bool:
69 """ Return true if the search details are configured to
70 allow postcodes in the result.
72 return self.details.min_rank <= 5 and self.details.max_rank >= 11\
73 and self.details.layer_enabled(DataLayer.ADDRESS)
77 def configured_for_housenumbers(self) -> bool:
78 """ Return true if the search details are configured to
79 allow addresses in the result.
81 return self.details.max_rank >= 30 \
82 and self.details.layer_enabled(DataLayer.ADDRESS)
85 def build(self, assignment: TokenAssignment) -> Iterator[dbs.AbstractSearch]:
86 """ Yield all possible abstract searches for the given token assignment.
88 sdata = self.get_search_data(assignment)
92 categories = self.get_search_categories(assignment)
94 if assignment.name is None:
95 if categories and not sdata.postcodes:
96 sdata.qualifiers = categories
98 builder = self.build_poi_search(sdata)
99 elif assignment.housenumber:
100 hnr_tokens = self.query.get_tokens(assignment.housenumber,
101 TokenType.HOUSENUMBER)
102 builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
104 builder = self.build_special_search(sdata, assignment.address,
107 builder = self.build_name_search(sdata, assignment.name, assignment.address,
111 penalty = min(categories.penalties)
112 categories.penalties = [p - penalty for p in categories.penalties]
113 for search in builder:
114 yield dbs.NearSearch(penalty + assignment.penalty, categories, search)
116 for search in builder:
117 search.penalty += assignment.penalty
121 def build_poi_search(self, sdata: dbf.SearchData) -> Iterator[dbs.AbstractSearch]:
122 """ Build abstract search query for a simple category search.
123 This kind of search requires an additional geographic constraint.
125 if not sdata.housenumbers \
126 and ((self.details.viewbox and self.details.bounded_viewbox) or self.details.near):
127 yield dbs.PoiSearch(sdata)
130 def build_special_search(self, sdata: dbf.SearchData,
131 address: List[TokenRange],
132 is_category: bool) -> Iterator[dbs.AbstractSearch]:
133 """ Build abstract search queries for searches that do not involve
137 # No special searches over qualifiers supported.
140 if sdata.countries and not address and not sdata.postcodes \
141 and self.configured_for_country:
142 yield dbs.CountrySearch(sdata)
144 if sdata.postcodes and (is_category or self.configured_for_postcode):
145 penalty = 0.0 if sdata.countries else 0.1
147 sdata.lookups = [dbf.FieldLookup('nameaddress_vector',
148 [t.token for r in address
149 for t in self.query.get_partials_list(r)],
152 yield dbs.PostcodeSearch(penalty, sdata)
155 def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
156 address: List[TokenRange]) -> Iterator[dbs.AbstractSearch]:
157 """ Build a simple address search for special entries where the
158 housenumber is the main name token.
160 sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any')]
162 partials = [t for trange in address
163 for t in self.query.get_partials_list(trange)]
165 if len(partials) != 1 or partials[0].count < 10000:
166 sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
167 [t.token for t in partials], 'lookup_all'))
169 sdata.lookups.append(
170 dbf.FieldLookup('nameaddress_vector',
172 in self.query.get_tokens(address[0], TokenType.WORD)],
175 sdata.housenumbers = dbf.WeightedStrings([], [])
176 yield dbs.PlaceSearch(0.05, sdata, sum(t.count for t in hnrs))
179 def build_name_search(self, sdata: dbf.SearchData,
180 name: TokenRange, address: List[TokenRange],
181 is_category: bool) -> Iterator[dbs.AbstractSearch]:
182 """ Build abstract search queries for simple name or address searches.
184 if is_category or not sdata.housenumbers or self.configured_for_housenumbers:
185 ranking = self.get_name_ranking(name)
186 name_penalty = ranking.normalize_penalty()
188 sdata.rankings.append(ranking)
189 for penalty, count, lookup in self.yield_lookups(name, address):
190 sdata.lookups = lookup
191 yield dbs.PlaceSearch(penalty + name_penalty, sdata, count)
194 def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
195 -> Iterator[Tuple[float, int, List[dbf.FieldLookup]]]:
196 """ Yield all variants how the given name and address should best
197 be searched for. This takes into account how frequent the terms
198 are and tries to find a lookup that optimizes index use.
200 penalty = 0.0 # extra penalty
201 name_partials = self.query.get_partials_list(name)
202 name_tokens = [t.token for t in name_partials]
204 addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
205 addr_tokens = [t.token for t in addr_partials]
207 partials_indexed = all(t.is_indexed for t in name_partials) \
208 and all(t.is_indexed for t in addr_partials)
209 exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
211 if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
212 yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
215 # Partial term to frequent. Try looking up by rare full names first.
216 name_fulls = self.query.get_tokens(name, TokenType.WORD)
217 fulls_count = sum(t.count for t in name_fulls)
218 # At this point drop unindexed partials from the address.
219 # This might yield wrong results, nothing we can do about that.
220 if not partials_indexed:
221 addr_tokens = [t.token for t in addr_partials if t.is_indexed]
222 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
223 # Any of the full names applies with all of the partials from the address
224 yield penalty, fulls_count / (2**len(addr_partials)),\
225 dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
226 'restrict' if fulls_count < 10000 else 'lookup_all')
228 # To catch remaining results, lookup by name and address
229 # We only do this if there is a reasonable number of results expected.
230 exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
231 if exp_count < 10000 and all(t.is_indexed for t in name_partials):
232 lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
234 lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
235 penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
236 yield penalty, exp_count, lookup
239 def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
240 """ Create a ranking expression for a name term in the given range.
242 name_fulls = self.query.get_tokens(trange, TokenType.WORD)
243 ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
244 ranks.sort(key=lambda r: r.penalty)
245 # Fallback, sum of penalty for partials
246 name_partials = self.query.get_partials_list(trange)
247 default = sum(t.penalty for t in name_partials) + 0.2
248 return dbf.FieldRanking('name_vector', default, ranks)
251 def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
252 """ Create a list of ranking expressions for an address term
253 for the given ranges.
255 todo: List[Tuple[int, int, dbf.RankedTokens]] = []
256 heapq.heappush(todo, (0, trange.start, dbf.RankedTokens(0.0, [])))
257 ranks: List[dbf.RankedTokens] = []
259 while todo: # pylint: disable=too-many-nested-blocks
260 neglen, pos, rank = heapq.heappop(todo)
261 for tlist in self.query.nodes[pos].starting:
262 if tlist.ttype in (TokenType.PARTIAL, TokenType.WORD):
263 if tlist.end < trange.end:
264 chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
265 if tlist.ttype == TokenType.PARTIAL:
266 penalty = rank.penalty + chgpenalty \
267 + max(t.penalty for t in tlist.tokens)
268 heapq.heappush(todo, (neglen - 1, tlist.end,
269 dbf.RankedTokens(penalty, rank.tokens)))
271 for t in tlist.tokens:
272 heapq.heappush(todo, (neglen - 1, tlist.end,
273 rank.with_token(t, chgpenalty)))
274 elif tlist.end == trange.end:
275 if tlist.ttype == TokenType.PARTIAL:
276 ranks.append(dbf.RankedTokens(rank.penalty
277 + max(t.penalty for t in tlist.tokens),
280 ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
282 # Too many variants, bail out and only add
283 # Worst-case Fallback: sum of penalty of partials
284 name_partials = self.query.get_partials_list(trange)
285 default = sum(t.penalty for t in name_partials) + 0.2
286 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
287 # Bail out of outer loop
291 ranks.sort(key=lambda r: len(r.tokens))
292 default = ranks[0].penalty + 0.3
294 ranks.sort(key=lambda r: r.penalty)
296 return dbf.FieldRanking('nameaddress_vector', default, ranks)
299 def get_search_data(self, assignment: TokenAssignment) -> Optional[dbf.SearchData]:
300 """ Collect the tokens for the non-name search fields in the
303 sdata = dbf.SearchData()
304 sdata.penalty = assignment.penalty
305 if assignment.country:
306 tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY)
307 if self.details.countries:
308 tokens = [t for t in tokens if t.lookup_word in self.details.countries]
311 sdata.set_strings('countries', tokens)
312 elif self.details.countries:
313 sdata.countries = dbf.WeightedStrings(self.details.countries,
314 [0.0] * len(self.details.countries))
315 if assignment.housenumber:
316 sdata.set_strings('housenumbers',
317 self.query.get_tokens(assignment.housenumber,
318 TokenType.HOUSENUMBER))
319 if assignment.postcode:
320 sdata.set_strings('postcodes',
321 self.query.get_tokens(assignment.postcode,
323 if assignment.qualifier:
324 sdata.set_qualifiers(self.query.get_tokens(assignment.qualifier,
325 TokenType.QUALIFIER))
327 if assignment.address:
328 sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
335 def get_search_categories(self,
336 assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
337 """ Collect tokens for category search or use the categories
338 requested per parameter.
339 Returns None if no category search is requested.
341 if assignment.category:
342 tokens: Dict[Tuple[str, str], float] = {}
343 for t in self.query.get_tokens(assignment.category, TokenType.CATEGORY):
344 cat = t.get_category()
345 if (not self.details.categories or cat in self.details.categories)\
346 and t.penalty < tokens.get(cat, 1000.0):
347 tokens[cat] = t.penalty
348 return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
350 if self.details.categories:
351 return dbf.WeightedCategories(self.details.categories,
352 [0.0] * len(self.details.categories))
357 PENALTY_WORDCHANGE = {
358 BreakType.START: 0.0,
360 BreakType.PHRASE: 0.0,