From: Sarah Hoffmann Date: Wed, 23 Aug 2023 21:04:12 +0000 (+0200) Subject: further tweak search containing very frequent tokens X-Git-Tag: v4.3.0~17^2~1 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/dcdda314e21fae2bcafb5c0a2883a1a921cb0300 further tweak search containing very frequent tokens Excluding non-rare full names is not really possible because it makes addresses with street names like 'main st' unsearchable. This tries to leave all names in but refrain from ordering results by accuracy when too many results are expected. This means that the DB will simply get the first n results without any particular order. --- diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py index 377c4be7..c9e48b0f 100644 --- a/nominatim/api/search/db_search_builder.py +++ b/nominatim/api/search/db_search_builder.py @@ -212,39 +212,27 @@ class SearchBuilder: yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens) return - exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count - # Partial term to frequent. Try looking up by rare full names first. name_fulls = self.query.get_tokens(name, TokenType.WORD) - rare_names = list(filter(lambda t: t.count < 10000, name_fulls)) + fulls_count = sum(t.count for t in name_fulls) / (2**len(addr_partials)) # At this point drop unindexed partials from the address. # This might yield wrong results, nothing we can do about that. 
if not partials_indexed: addr_tokens = [t.token for t in addr_partials if t.is_indexed] penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed) - if rare_names: - # Any of the full names applies with all of the partials from the address - yield penalty, sum(t.count for t in rare_names),\ - dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens) + # Any of the full names applies with all of the partials from the address + yield penalty, fulls_count,\ + dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens, + 'restrict' if fulls_count < 10000 else 'lookup_all') # To catch remaining results, lookup by name and address # We only do this if there is a reasonable number of results expected. - if exp_count < 10000: - if all(t.is_indexed for t in name_partials): - lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')] - else: - # we don't have the partials, try with the non-rare names - non_rare_names = [t.token for t in name_fulls if t.count >= 10000] - if not non_rare_names: - return - lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')] + exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count + if exp_count < 10000 and all(t.is_indexed for t in name_partials): + lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')] if addr_tokens: lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')) - penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens)) - if len(rare_names) == len(name_fulls): - # if there already was a search for all full tokens, - # avoid this if anything has been found - penalty += 0.25 + penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens)) yield penalty, exp_count, lookup diff --git a/nominatim/api/search/db_search_fields.py b/nominatim/api/search/db_search_fields.py index 2b2e3e56..612e9059 100644 --- a/nominatim/api/search/db_search_fields.py +++ b/nominatim/api/search/db_search_fields.py @@ 
-224,14 +224,15 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel return lookup -def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]: +def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int], + lookup_type: str) -> List[FieldLookup]: """ Create a lookup list where name tokens are looked up via index and only one of the name tokens must be present. Potential address tokens are used to restrict the search further. """ lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')] if addr_tokens: - lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict')) + lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookup_type)) return lookup diff --git a/nominatim/api/search/db_searches.py b/nominatim/api/search/db_searches.py index 34a4037a..d4b9c018 100644 --- a/nominatim/api/search/db_searches.py +++ b/nominatim/api/search/db_searches.py @@ -643,13 +643,16 @@ class PlaceSearch(AbstractSearch): .label('importance')) sql = sql.order_by(sa.desc(sa.text('importance'))) else: - sql = sql.order_by(penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance), - else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40))) + if self.expected_count < 10000: + sql = sql.order_by(penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance), + else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40))) sql = sql.add_columns(t.c.importance) - sql = sql.add_columns(penalty.label('accuracy'))\ - .order_by(sa.text('accuracy')) + sql = sql.add_columns(penalty.label('accuracy')) + + if self.expected_count < 10000: + sql = sql.order_by(sa.text('accuracy')) if self.housenumbers: hnr_regexp = f"\\m({'|'.join(self.housenumbers.values)})\\M"