From: Sarah Hoffmann Date: Tue, 19 Sep 2023 14:18:09 +0000 (+0200) Subject: rerank results by query X-Git-Tag: v4.4.0~97^2~2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/fd26310d6adc5fc5685bdd0de36afa66e85b9c9c rerank results by query The algorithm is similar to the PHP reranking and uses the terms from the display name to check against the query terms. However, instead of exact matching, it uses a per-word edit distance, so that it is less strict when it comes to mismatching accents or other single-letter differences. Country names get a higher penalty because they don't receive a penalty during token matching right now. This will work badly with the legacy tokenizer. Given that it is marked for removal, it is simply not worth optimising for it. --- diff --git a/nominatim/api/search/geocoder.py b/nominatim/api/search/geocoder.py index f88bffbd..5dbc0948 100644 --- a/nominatim/api/search/geocoder.py +++ b/nominatim/api/search/geocoder.py @@ -9,7 +9,9 @@ Public interface to the search code. """ from typing import List, Any, Optional, Iterator, Tuple import itertools +import re import datetime as dt +import difflib from nominatim.api.connection import SearchConnection from nominatim.api.types import SearchDetails @@ -92,23 +94,56 @@ class ForwardGeocoder: if dt.datetime.now() >= end_time: break + return results + + + def sort_and_cut_results(self, results: SearchResults) -> SearchResults: + """ Remove badly matching results, sort by ranking and + limit to the configured number of results. 
+ """ if results: min_ranking = min(r.ranking for r in results) results = SearchResults(r for r in results if r.ranking < min_ranking + 0.5) + results.sort(key=lambda r: r.ranking) if results: - min_rank = min(r.rank_search for r in results) - + min_rank = results[0].rank_search results = SearchResults(r for r in results if r.ranking + 0.05 * (r.rank_search - min_rank) < min_ranking + 0.5) - results.sort(key=lambda r: r.accuracy - r.calculated_importance()) results = SearchResults(results[:self.limit]) return results + def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None: + """ Adjust the accuracy of the localized result according to how well + they match the original query. + """ + assert self.query_analyzer is not None + qwords = [word for phrase in query.source + for word in re.split('[, ]+', phrase.text) if word] + if not qwords: + return + + for result in results: + if not result.display_name: + continue + distance = 0.0 + norm = self.query_analyzer.normalize_text(result.display_name) + words = set((w for w in norm.split(' ') if w)) + if not words: + continue + for qword in qwords: + wdist = max(difflib.SequenceMatcher(a=qword, b=w).quick_ratio() for w in words) + if wdist < 0.5: + distance += len(qword) + else: + distance += (1.0 - wdist) * len(qword) + result.accuracy += distance * 0.5 / sum(len(w) for w in qwords) + + async def lookup_pois(self, categories: List[Tuple[str, str]], phrases: List[Phrase]) -> SearchResults: """ Look up places by category. 
If phrase is given, a place search @@ -123,13 +158,16 @@ class ForwardGeocoder: if query: searches = [wrap_near_search(categories, s) for s in searches[:50]] results = await self.execute_searches(query, searches) + await add_result_details(self.conn, results, self.params) + log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results)) + results = self.sort_and_cut_results(results) else: results = SearchResults() else: search = build_poi_search(categories, self.params.countries) results = await search.lookup(self.conn, self.params) + await add_result_details(self.conn, results, self.params) - await add_result_details(self.conn, results, self.params) log().result_dump('Final Results', ((r.accuracy, r) for r in results)) return results @@ -150,6 +188,10 @@ class ForwardGeocoder: # Execute SQL until an appropriate result is found. results = await self.execute_searches(query, searches[:50]) await add_result_details(self.conn, results, self.params) + log().result_dump('Preliminary Results', ((r.accuracy, r) for r in results)) + self.rerank_by_query(query, results) + log().result_dump('Results after reranking', ((r.accuracy, r) for r in results)) + results = self.sort_and_cut_results(results) log().result_dump('Final Results', ((r.accuracy, r) for r in results)) return results diff --git a/nominatim/api/search/legacy_tokenizer.py b/nominatim/api/search/legacy_tokenizer.py index 3346584c..26e4c126 100644 --- a/nominatim/api/search/legacy_tokenizer.py +++ b/nominatim/api/search/legacy_tokenizer.py @@ -127,6 +127,15 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer): return query + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. + + This only removes case, so some difference with the normalization + in the phrase remains. + """ + return text.lower() + + def split_query(self, query: qmod.QueryStruct) -> Tuple[List[str], Dict[str, List[qmod.TokenRange]]]: """ Transliterate the phrases and split them into tokens. 
diff --git a/nominatim/api/search/query_analyzer_factory.py b/nominatim/api/search/query_analyzer_factory.py index 35649d0f..bbc1eb6b 100644 --- a/nominatim/api/search/query_analyzer_factory.py +++ b/nominatim/api/search/query_analyzer_factory.py @@ -30,6 +30,15 @@ class AbstractQueryAnalyzer(ABC): """ + @abstractmethod + def normalize_text(self, text: str) -> str: + """ Bring the given text into a normalized form. That is the + standardized form search will work with. All information removed + at this stage is inevitably lost. + """ + + + async def make_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer: """ Create a query analyzer for the tokenizer used by the database. """