]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/search/geocoder.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / api / search / geocoder.py
index 0ef649d99ab8857924ef7d86bc3aa96865608666..7ff3ed08af9dcfa23c8072d4d57152a253930007 100644 (file)
@@ -7,12 +7,15 @@
 """
 Public interface to the search code.
 """
-from typing import List, Any, Optional, Iterator, Tuple
+from typing import List, Any, Optional, Iterator, Tuple, Dict
 import itertools
+import re
+import datetime as dt
+import difflib
 
 from nominatim.api.connection import SearchConnection
 from nominatim.api.types import SearchDetails
-from nominatim.api.results import SearchResults, add_result_details
+from nominatim.api.results import SearchResult, SearchResults, add_result_details
 from nominatim.api.search.token_assignment import yield_token_assignments
 from nominatim.api.search.db_search_builder import SearchBuilder, build_poi_search, wrap_near_search
 from nominatim.api.search.db_searches import AbstractSearch
@@ -24,9 +27,11 @@ class ForwardGeocoder:
     """ Main class responsible for place search.
     """
 
-    def __init__(self, conn: SearchConnection, params: SearchDetails) -> None:
+    def __init__(self, conn: SearchConnection,
+                 params: SearchDetails, timeout: Optional[int]) -> None:
         self.conn = conn
         self.params = params
+        self.timeout = dt.timedelta(seconds=timeout or 1000000)
         self.query_analyzer: Optional[AbstractQueryAnalyzer] = None
 
 
@@ -55,8 +60,9 @@ class ForwardGeocoder:
             num_searches = 0
             for assignment in yield_token_assignments(query):
                 searches.extend(search_builder.build(assignment))
-                log().table_dump('Searches for assignment',
-                                 _dump_searches(searches, query, num_searches))
+                if num_searches < len(searches):
+                    log().table_dump('Searches for assignment',
+                                     _dump_searches(searches, query, num_searches))
                 num_searches = len(searches)
             searches.sort(key=lambda s: s.penalty)
 
@@ -69,39 +75,86 @@ class ForwardGeocoder:
             is found.
         """
         log().section('Execute database searches')
-        results = SearchResults()
+        results: Dict[Any, SearchResult] = {}
+
+        end_time = dt.datetime.now() + self.timeout
 
-        num_results = 0
         min_ranking = 1000.0
         prev_penalty = 0.0
         for i, search in enumerate(searches):
             if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
                 break
             log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
-            for result in await search.lookup(self.conn, self.params):
-                results.append(result)
+            lookup_results = await search.lookup(self.conn, self.params)
+            for result in lookup_results:
+                rhash = (result.source_table, result.place_id,
+                         result.housenumber, result.country_code)
+                prevresult = results.get(rhash)
+                if prevresult:
+                    prevresult.accuracy = min(prevresult.accuracy, result.accuracy)
+                else:
+                    results[rhash] = result
                 min_ranking = min(min_ranking, result.ranking + 0.5, search.penalty + 0.3)
-            log().result_dump('Results', ((r.accuracy, r) for r in results[num_results:]))
-            num_results = len(results)
+            log().result_dump('Results', ((r.accuracy, r) for r in lookup_results))
             prev_penalty = search.penalty
+            if dt.datetime.now() >= end_time:
+                break
 
+        return SearchResults(results.values())
+
+
+    def sort_and_cut_results(self, results: SearchResults) -> SearchResults:
+        """ Remove badly matching results, sort by ranking and
+            limit to the configured number of results.
+        """
         if results:
             min_ranking = min(r.ranking for r in results)
             results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5)
+            results.sort(key=lambda r: r.ranking)
 
         if results:
-            min_rank = min(r.rank_search for r in results)
-
+            min_rank = results[0].rank_search
             results = SearchResults(r for r in results
                                     if r.ranking + 0.05 * (r.rank_search - min_rank)
                                        < min_ranking + 0.5)
 
-            results.sort(key=lambda r: r.accuracy - r.calculated_importance())
             results = SearchResults(results[:self.limit])
 
         return results
 
 
+    def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
+        """ Adjust the accuracy of the localized result according to how well
+            they match the original query.
+        """
+        assert self.query_analyzer is not None
+        qwords = [word for phrase in query.source
+                       for word in re.split('[, ]+', phrase.text) if word]
+        if not qwords:
+            return
+
+        for result in results:
+            if not result.display_name:
+                continue
+            distance = 0.0
+            norm = self.query_analyzer.normalize_text(result.display_name)
+            words = set((w for w in norm.split(' ') if w))
+            if not words:
+                continue
+            for qword in qwords:
+                wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words)
+                if wdist < 0.5:
+                    distance += len(qword)
+                else:
+                    distance += (1.0 - wdist) * len(qword)
+            # Compensate for the fact that country names do not get a
+            # match penalty yet by the tokenizer.
+            # Temporary hack that needs to be removed!
+            if result.rank_address == 4:
+                distance *= 2
+            result.accuracy += distance * 0.4 / sum(len(w) for w in qwords)
+
+
     async def lookup_pois(self, categories: List[Tuple[str, str]],
                           phrases: List[Phrase]) -> SearchResults:
         """ Look up places by category. If phrase is given, a place search
@@ -116,13 +169,16 @@ class ForwardGeocoder:
             if query:
                 searches = [wrap_near_search(categories, s) for s in searches[:50]]
                 results = await self.execute_searches(query, searches)
+                await add_result_details(self.conn, results, self.params)
+                log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+                results = self.sort_and_cut_results(results)
             else:
                 results = SearchResults()
         else:
             search = build_poi_search(categories, self.params.countries)
             results = await search.lookup(self.conn, self.params)
+            await add_result_details(self.conn, results, self.params)
 
-        await add_result_details(self.conn, results, self.params)
         log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
@@ -143,6 +199,10 @@ class ForwardGeocoder:
             # Execute SQL until an appropriate result is found.
             results = await self.execute_searches(query, searches[:50])
             await add_result_details(self.conn, results, self.params)
+            log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results))
+            self.rerank_by_query(query, results)
+            log().result_dump('Results after reranking', ((r.accuracy, r) for r in results))
+            results = self.sort_and_cut_results(results)
             log().result_dump('Final Results', ((r.accuracy, r) for r in results))
 
         return results
@@ -151,7 +211,8 @@ class ForwardGeocoder:
 # pylint: disable=invalid-name,too-many-locals
 def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
                    start: int = 0) -> Iterator[Optional[List[Any]]]:
-    yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries', 'Qualifier', 'Rankings']
+    yield ['Penalty', 'Lookups', 'Housenr', 'Postcode', 'Countries',
+           'Qualifier', 'Catgeory', 'Rankings']
 
     def tk(tl: List[int]) -> str:
         tstr = [f"{query.find_lookup_word_by_id(t)}({t})" for t in tl]
@@ -181,11 +242,18 @@ def _dump_searches(searches: List[AbstractSearch], query: QueryStruct,
 
     for search in searches[start:]:
         fields = ('lookups', 'rankings', 'countries', 'housenumbers',
-                  'postcodes', 'qualifier')
-        iters = itertools.zip_longest([f"{search.penalty:.3g}"],
-                                      *(getattr(search, attr, []) for attr in fields),
-                                      fillvalue= '')
-        for penalty, lookup, rank, cc, hnr, pc, qual in iters:
+                  'postcodes', 'qualifiers')
+        if hasattr(search, 'search'):
+            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
+                                          *(getattr(search.search, attr, []) for attr in fields),
+                                          getattr(search, 'categories', []),
+                                          fillvalue='')
+        else:
+            iters = itertools.zip_longest([f"{search.penalty:.3g}"],
+                                          *(getattr(search, attr, []) for attr in fields),
+                                          [],
+                                          fillvalue='')
+        for penalty, lookup, rank, cc, hnr, pc, qual, cat in iters:
             yield [penalty, fmt_lookup(lookup), fmt_cstr(hnr),
-                   fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_ranking(rank)]
+                   fmt_cstr(pc), fmt_cstr(cc), fmt_cstr(qual), fmt_cstr(cat), fmt_ranking(rank)]
         yield None