git.openstreetmap.org Git - nominatim.git/blob - nominatim/api/search/geocoder.py
added subcommand to clean deleted relations for issue # 2444
[nominatim.git] / nominatim / api / search / geocoder.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Public interface to the search code.
9 """
10 from typing import List, Any, Optional, Iterator, Tuple, Dict
11 import itertools
12 import re
13 import datetime as dt
14 import difflib
15
16 from nominatim.api.connection import SearchConnection
17 from nominatim.api.types import SearchDetails
18 from nominatim.api.results import SearchResult, SearchResults, add_result_details
19 from nominatim.api.search.token_assignment import yield_token_assignments
20 from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
21 from nominatim.api.search.db_searches import AbstractSearch
22 from nominatim.api.search.query_analyzer_factory import make_query_analyzer, AbstractQueryAnalyzer
23 from nominatim.api.search.query import Phrase, QueryStruct
24 from nominatim.api.logging import log
25
class ForwardGeocoder:
    """ Forward geocoder: builds database searches for a query, executes
        them and post-processes the results.
    """

    def __init__(self, conn: SearchConnection,
                 params: SearchDetails, timeout: Optional[int]) -> None:
        self.conn = conn
        self.params = params
        # A timeout of None or 0 is replaced by a very large number of seconds.
        seconds = timeout if timeout else 1000000
        self.timeout = dt.timedelta(seconds=seconds)
        # The analyzer is created on the first call to build_searches().
        self.query_analyzer: Optional[AbstractQueryAnalyzer] = None


    @property
    def limit(self) -> int:
        """ Maximum number of results to return, as set in the
            search parameters.
        """
        return self.params.max_results


    async def build_searches(self,
                             phrases: List[Phrase]) -> Tuple[QueryStruct, List[AbstractSearch]]:
        """ Tokenize the phrases and derive the candidate searches.

            Returns the analyzed query together with the searches built
            from all token assignments, ordered by ascending penalty.
            The list is empty when the query has no token slots.
        """
        if self.query_analyzer is None:
            self.query_analyzer = await make_query_analyzer(self.conn)

        query = await self.query_analyzer.analyze_query(phrases)

        searches: List[AbstractSearch] = []
        if not query.num_token_slots() > 0:
            return query, searches

        log().section('Compute abstract searches')
        builder = SearchBuilder(query, self.params)
        for assignment in yield_token_assignments(query):
            known = len(searches)
            searches.extend(builder.build(assignment))
            # Only log when this assignment contributed new searches.
            if len(searches) > known:
                log().table_dump('Searches for assignment',
                                 _dump_searches(searches, query, known))
        searches.sort(key=lambda s: s.penalty)

        return query, searches


    async def execute_searches(self, query: QueryStruct,
                               searches: List[AbstractSearch]) -> SearchResults:
        """ Execute the searches in the given order and merge their results.

            Processing stops before a search whose penalty is higher than
            that of the previous search when either the penalty exceeds
            the lowest ranking bound seen so far or the search index is
            above 20. It also stops once the timeout has expired after
            a search has finished.
        """
        log().section('Execute database searches')
        found: Dict[Any, SearchResult] = {}

        deadline = dt.datetime.now() + self.timeout

        min_ranking = 1000.0
        last_penalty = 0.0
        for idx, search in enumerate(searches):
            penalty_went_up = search.penalty > last_penalty
            if penalty_went_up and (search.penalty > min_ranking or idx > 20):
                break

            log().table_dump(f"{idx + 1}. Search", _dump_searches([search], query))
            lookup_results = await search.lookup(self.conn, self.params)
            for result in lookup_results:
                key = (result.source_table, result.place_id,
                       result.housenumber, result.country_code)
                existing = found.get(key)
                if not existing:
                    found[key] = result
                else:
                    # Same place found again: keep the lower accuracy value.
                    existing.accuracy = min(existing.accuracy, result.accuracy)
                min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3)
            log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
            last_penalty = search.penalty

            if dt.datetime.now() >= deadline:
                break

        return SearchResults(found.values())


    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
        """ Drop results that rank too far behind the best one, order the
            rest by ranking and truncate to the configured limit.
        """
        if not results:
            return results

        min_ranking = min(r.ranking for r in results)
        cutoff = min_ranking + 0.5
        kept = SearchResults(r for r in results if r.ranking < cutoff)
        kept.sort(key=lambda r: r.ranking)

        if not kept:
            return kept

        # Second pass: results with a higher search rank than the top
        # result get an extra 0.05 per rank step before the cutoff test.
        top_rank = kept[0].rank_search
        kept = SearchResults(r for r in kept
                             if r.ranking + 0.05 * (r.rank_search - top_rank) < cutoff)

        return SearchResults(kept[:self.limit])


    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
        """ Increase the accuracy value of each result by a penalty that
            reflects how poorly the words of its display name match the
            words of the original query. Results are changed in place.
        """
        assert self.query_analyzer is not None
        qwords = []
        for phrase in query.source:
            qwords.extend(word for word in re.split('[, ]+', phrase.text) if word)
        if not qwords:
            return

        query_length = sum(len(w) for w in qwords)

        for result in results:
            if not result.display_name:
                continue
            norm = self.query_analyzer.normalize_text(result.display_name)
            words = {w for w in norm.split(' ') if w}
            if not words:
                continue

            distance = 0.0
            for qword in qwords:
                similarity = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio()
                                 for w in words)
                # A similarity below 0.5 counts as a complete mismatch.
                mismatch = 1.0 if similarity < 0.5 else 1.0 - similarity
                distance += mismatch * len(qword)
            # Compensate for the fact that country names do not get a
            # match penalty yet by the tokenizer.
            # Temporary hack that needs to be removed!
            if result.rank_address == 4:
                distance *= 2
            result.accuracy += distance * 0.4 / query_length


    async def lookup_pois(self, categories: List[Tuple[str, str]],
                          phrases: List[Phrase]) -> SearchResults:
        """ Search for places of the given categories. With phrases, a
            place search over the phrases is wrapped into a near search
            for the categories; without phrases, a plain category search
            restricted to the configured countries is run.
        """
        log().function('forward_lookup_pois', categories=categories, params=self.params)

        if not phrases:
            search = build_poi_search(categories, self.params.countries)
            results = await search.lookup(self.conn, self.params)
            await add_result_details(self.conn, results, self.params)
        else:
            query, searches = await self.build_searches(phrases)

            if not query:
                results = SearchResults()
            else:
                # Only the first 50 searches are considered.
                searches = [wrap_near_search(categories, s) for s in searches[:50]]
                results = await self.execute_searches(query, searches)
                await add_result_details(self.conn, results, self.params)
                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
                results = self.sort_and_cut_results(results)

        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results


    async def lookup(self, phrases: List[Phrase]) -> SearchResults:
        """ Run a free-text search over the given phrases and return the
            reranked, sorted and limited results.
        """
        log().function('forward_lookup', phrases=phrases, params=self.params)
        results = SearchResults()

        if self.params.is_impossible():
            return results

        query, searches = await self.build_searches(phrases)
        if not searches:
            return results

        # Execute SQL until an appropriate result is found;
        # only the first 50 searches are considered.
        results = await self.execute_searches(query, searches[:50])
        await add_result_details(self.conn, results, self.params)
        log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
        self.rerank_by_query(query, results)
        log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
        results = self.sort_and_cut_results(results)
        log().result_dump('Final Results', ((r.accuracy, r) for r in results))

        return results
209
210
211 # pylint: disable=invalid-name,too-many-locals
212 def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
213                    start: int = 0) -> Iterator[Optional[List[Any]]]:
214     yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
215            'Qualifier', 'Catgeory', 'Rankings']
216
217     def tk(tl: List[int]) -> str:
218         tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]
219
220         return f"[{','.join(tstr)}]"
221
222     def fmt_ranking(f: Any) -> str:
223         if not f:
224             return ''
225         ranks = ','.join((f"{tk(r.tokens)}^{r.penalty:.3g}" for r in f.rankings))
226         if len(ranks) > 100:
227             ranks = ranks[:100] + '...'
228         return f"{f.column}({ranks},def={f.default:.3g})"
229
230     def fmt_lookup(l: Any) -> str:
231         if not l:
232             return ''
233
234         return f"{l.lookup_type}({l.column}{tk(l.tokens)})"
235
236
237     def fmt_cstr(c: Any) -> str:
238         if not c:
239             return ''
240
241         return f'{c[0]}^{c[1]}'
242
243     for search in searches[start:]:
244         fields = ('lookups', 'rankings', 'countries', 'housenumbers',
245                   'postcodes', 'qualifiers')
246         if hasattr(search, 'search'):
247             iters = itertools.zip_longest([f"{search.penalty:.3g}"],
248                                           *(getattr(search.search, attr, []) for attr in fields),
249                                           getattr(search, 'categories', []),
250                                           fillvalue='')
251         else:
252             iters = itertools.zip_longest([f"{search.penalty:.3g}"],
253                                           *(getattr(search, attr, []) for attr in fields),
254                                           [],
255                                           fillvalue='')
256         for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
257             yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
258                    fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
259         yield None