# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
"""
from typing import List, Any, Optional, Iterator, Tuple, Dict
import itertools
import re
import datetime as dt
import difflib

from nominatim.api.connection import SearchConnection
from nominatim.api.types import SearchDetails
from nominatim.api.results import SearchResult, SearchResults, add_result_details
from nominatim.api.search.token_assignment import yield_token_assignments
from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
from nominatim.api.search.db_searches import AbstractSearch
from nominatim.api.search.query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
from nominatim.api.search.query import Phrase, QueryStruct
from nominatim.api.logging import log

class ForwardGeocoder:
    """ Main class responsible for place search.
    """

    def __init__(self, conn: SearchConnection,
                 params: SearchDetails, timeout: Optional[int]) -> None:
        self.conn = conn
        self.params = params
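        # No explicit timeout is treated as a practically unlimited time
        # budget, so the deadline computed in execute_searches() below is
        # always well-defined.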
        self.timeout = dt.timedelta(seconds=timeout or 1000000)
        self.query_analyzer: Optional[AbstractQueryAnalyzer] = None


    @property
    def limit(self) -> int:
        """ Return the configured maximum number of search results.
        """
        return self.params.max_results


    async def build_searches(self,
                             phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
        """ Analyse the query and return the tokenized query and list of
            possible searches over it.
        """
        if self.query_analyzer is None:
            self.query_analyzer = await make_query_analyzer(self.conn)

        query = await self.query_analyzer.analyze_query(phrases)

        searches: List[AbstractSearch] = []
        if query.num_token_slots() > 0:
            # Compute all possible search interpretations
            log().section('Compute abstract searches')
            search_builder = SearchBuilder(query, self.params)
            num_searches = 0
            for assignment in yield_token_assignments(query):
                searches.extend(search_builder.build(assignment))
                if num_searches < len(searches):
                    log().table_dump('Searches for assignment',
                                     _dump_searches(searches, query, num_searches))
                num_searches = len(searches)
            searches.sort(key=lambda s: (s.penalty, s.SEARCH_PRIO))

        return query, searches


    async def execute_searches(self, query: QueryStruct,
                               searches: List[AbstractSearch]) -> SearchResults:
        """ Run the abstract searches against the database until a
            suitable result is found or the timeout expires.
        """
        log().section('Execute database searches')
        results: Dict[Any, SearchResult] = {}

        end_time = dt.datetime.now() + self.timeout

        min_ranking = searches[0].penalty + 2.0
        prev_penalty = 0.0
        for i, search in enumerate(searches):
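            # Searches are sorted by penalty. Stop as soon as the penalty
            # increases again and it either exceeds the current ranking
            # cut-off or a generous number of searches has already run.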
            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
                break
            log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
            log().var_dump('Params', self.params)
            lookup_results = await search.lookup(self.conn, self.params)
            for result in lookup_results:
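                # Merge results that refer to the same place (same source
                # table, place id, house number and country), keeping the
                # best accuracy seen for it.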
                rhash = (result.source_table, result.place_id,
                         result.housenumber, result.country_code)
                prevresult = results.get(rhash)
                if prevresult:
                    prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
                else:
                    results[rhash] = result
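                # Tighten the cut-off: later searches are only worth
                # running if their penalty stays within 20% of the best
                # accuracy found so far.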
                min_ranking = min(min_ranking, result.accuracy * 1.2)
            log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
            prev_penalty = search.penalty
            if dt.datetime.now() >= end_time:
                break

        return SearchResults(results.values())


    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
        """ Remove badly matching results, sort by ranking and
            limit to the configured number of results.
        """
        if results:
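            # First pass: drop everything that ranks more than 0.5 worse
            # than the best match and sort the remainder by ranking.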
            min_ranking = min(r.ranking for r in results)
            results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
            results.sort(key=lambda r: r.ranking)

        if results:
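            # Second pass: apply an extra penalty to results that are less
            # specific (higher search rank) than the best result before
            # re-applying the cut-off.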
            min_rank = results[0].rank_search
            results = SearchResults(r for r in results
                                    if r.ranking + 0.05 * (r.rank_search - min_rank)
                                       < min_ranking + 0.5)

            results = SearchResults(results[:self.limit])

        return results


    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
        """ Adjust the accuracy of the localized results according to how
            well they match the original query.
        """
        assert self.query_analyzer is not None
        qwords = [word for phrase in query.source
                       for word in re.split('[, ]+', phrase.text) if word]
        if not qwords:
            return

        for result in results:
            # Negative importance indicates ordering by distance, which is
            # more important than word matching.
            if not result.display_name\
               or (result.importance is not None and result.importance < 0):
                continue
            distance = 0.0
            norm = self.query_analyzer.normalize_text(result.display_name)
            words = set((w for w in norm.split(' ') if w))
            if not words:
                continue
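            # For each query word, take the best fuzzy match against the
            # result's words. Words without a reasonable match (< 0.5)
            # contribute their full length; matched words contribute their
            # remaining dissimilarity, weighted by length.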
            for qword in qwords:
                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
                if wdist < 0.5:
                    distance += len(qword)
                else:
                    distance += (1.0 - wdist) * len(qword)
            # Compensate for the fact that the tokenizer does not yet
            # apply a match penalty to country names.
            # Temporary hack that needs to be removed!
            if result.rank_address == 4:
                distance *= 2
            result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)


    async def lookup_pois(self, categories: List[Tuple[str, str]],
                          phrases: List[Phrase]) -> SearchResults:
        """ Look up places by category. If phrases are given, a place
            search over the phrases is run first and only places close
            to its results are returned.
        """
        log().function('forward_lookup_pois', categories=categories, params=self.params)

        if phrases:
            query, searches = await self.build_searches(phrases)

            if query:
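                # Restrict to the 50 most promising searches and wrap each
                # in a near-search for the requested categories.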
                searches = [wrap_near_search(categories, s) for s in searches[:50]]
                results = await self.execute_searches(query, searches)
                await add_result_details(self.conn, results, self.params)
                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
                results = self.sort_and_cut_results(results)
            else:
                results = SearchResults()
        else:
            search = build_poi_search(categories, self.params.countries)
            results = await search.lookup(self.conn, self.params)
            await add_result_details(self.conn, results, self.params)

        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results


    async def lookup(self, phrases: List[Phrase]) -> SearchResults:
        """ Look up a single free-text query.
        """
        log().function('forward_lookup', phrases=phrases, params=self.params)
        results = SearchResults()

        if self.params.is_impossible():
            return results

        query, searches = await self.build_searches(phrases)

        if searches:
            # Execute SQL until an appropriate result is found.
            results = await self.execute_searches(query, searches[:50])
            await add_result_details(self.conn, results, self.params)
            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
            self.rerank_by_query(query, results)
            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
            results = self.sort_and_cut_results(results)
            log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results


# pylint: disable=invalid-name,too-many-locals
def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
                   start: int = 0) -> Iterator[Optional[List[Any]]]:
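    """ Yield the rows of a debug table for the given searches: one header
        row, then the data rows for each search starting at index ``start``,
        with a ``None`` separator after each search.
    """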
    yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
           'Qualifier', 'Category', 'Rankings']

    def tk(tl: List[int]) -> str:
        tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]

        return f"[{','.join(tstr)}]"

    def fmt_ranking(f: Any) -> str:
        if not f:
            return ''
        ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
        if len(ranks) > 100:
            ranks = ranks[:100] + '...'
        return f"{f.column}({ranks},def={f.default:.3g})"

    def fmt_lookup(l: Any) -> str:
        if not l:
            return ''

        return f"{l.lookup_type}({l.column}{tk(l.tokens)})"


    def fmt_cstr(c: Any) -> str:
        if not c:
            return ''

        return f'{c[0]}^{c[1]}'

    for search in searches[start:]:
        fields = ('lookups', 'rankings', 'countries', 'housenumbers',
                  'postcodes', 'qualifiers')
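        # Searches that wrap another search (e.g. near-searches) carry the
        # actual search in a 'search' attribute; in that case dump the
        # inner search's fields together with the wrapper's categories.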
        if hasattr(search, 'search'):
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search.search, attr, []) for attr in fields),
                                          getattr(search, 'categories', []),
                                          fillvalue='')
        else:
            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
                                          *(getattr(search, attr, []) for attr in fields),
                                          [],
                                          fillvalue='')
        for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
            yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
                   fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
        yield None