git.openstreetmap.org Git - nominatim.git/commitdiff
further tweak searches containing very frequent tokens
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 23 Aug 2023 21:04:12 +0000 (23:04 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 23 Aug 2023 21:04:12 +0000 (23:04 +0200)
Excluding non-rare full names is not really possible because it makes
addresses with street names like 'main st' unsearchable. This tries to
leave all names in but refrains from ordering results by accuracy
when too many results are expected. This means that the DB will simply
return the first n results without any particular order.
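
The gist of the change as a minimal sketch in plain Python (the 10000
threshold and the 'restrict'/'lookup_all' strategy names come from the
diffs below; the helper itself is illustrative, not the actual code path):

VERY_FREQUENT = 10000  # result-count threshold used throughout this commit

def plan_search(fulls_count: float, expected_count: float) -> dict:
    # Full names always stay in the lookup; only the strategies vary.
    return {
        # Few expected matches: address terms merely filter ('restrict').
        # Many matches: the address index itself narrows the scan.
        'addr_lookup': 'restrict' if fulls_count < VERY_FREQUENT else 'lookup_all',
        # Sorting by accuracy is skipped for huge result sets, so the DB
        # simply returns the first n rows it finds.
        'order_by_accuracy': expected_count < VERY_FREQUENT,
    }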

nominatim/api/search/db_search_builder.py
nominatim/api/search/db_search_fields.py
nominatim/api/search/db_searches.py

nominatim/api/search/db_search_builder.py
index 377c4be7da959b156b19d531ab5e1511fc5be515..c9e48b0f3784f1bb7f6cd6cc9934b25c757c7b33 100644 (file)
@@ -212,39 +212,27 @@ class SearchBuilder:
             yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
             return
 
-        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
-
         # Partial term to frequent. Try looking up by rare full names first.
         name_fulls = self.query.get_tokens(name, TokenType.WORD)
-        rare_names = list(filter(lambda t: t.count < 10000, name_fulls))
+        fulls_count = sum(t.count for t in name_fulls) / (2**len(addr_partials))
         # At this point drop unindexed partials from the address.
         # This might yield wrong results, nothing we can do about that.
         if not partials_indexed:
             addr_tokens = [t.token for t in addr_partials if t.is_indexed]
             penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-        if rare_names:
-            # Any of the full names applies with all of the partials from the address
-            yield penalty, sum(t.count for t in rare_names),\
-                  dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens)
+        # Any of the full names applies with all of the partials from the address
+        yield penalty, fulls_count,\
+              dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
+                                     'restrict' if fulls_count < 10000 else 'lookup_all')
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
-        if exp_count < 10000:
-            if all(t.is_indexed for t in name_partials):
-                lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
-            else:
-                # we don't have the partials, try with the non-rare names
-                non_rare_names = [t.token for t in name_fulls if t.count >= 10000]
-                if not non_rare_names:
-                    return
-                lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')]
+        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
+        if exp_count < 10000 and all(t.is_indexed for t in name_partials):
+            lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
             if addr_tokens:
                 lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
-            penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens))
-            if len(rare_names) == len(name_fulls):
-                # if there already was a search for all full tokens,
-                # avoid this if anything has been found
-                penalty += 0.25
+            penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
             yield penalty, exp_count, lookup
 
 
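A worked example of the new threshold logic (the token counts are invented;
fulls_count, name_fulls and the 10000 cut-off are from the hunk above):

# Say the full-name tokens for 'main st' occur 8000 and 1500 times,
# and the query carries two indexed address partials.
name_full_counts = [8000, 1500]
num_addr_partials = 2

# Expected matches roughly halve with each address term that must co-occur:
fulls_count = sum(name_full_counts) / (2 ** num_addr_partials)  # 9500 / 4 = 2375.0

# Below the cut-off the address terms only filter the few name matches
# ('restrict'); above it, the address index itself is scanned ('lookup_all').
lookup_type = 'restrict' if fulls_count < 10000 else 'lookup_all'
assert lookup_type == 'restrict'
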
nominatim/api/search/db_search_fields.py
index 2b2e3e56761e6ff15012297c144c52e30c4d74b1..612e90597df2064a4ba6bf19221076093c5c55f7 100644 (file)
@@ -224,14 +224,15 @@ def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[Fiel
     return lookup
 
 
-def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
+def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int],
+                       lookup_type: str) -> List[FieldLookup]:
     """ Create a lookup list where name tokens are looked up via index
         and only one of the name tokens must be present.
         Potential address tokens are used to restrict the search further.
     """
     lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')]
     if addr_tokens:
-        lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
+        lookup.append(FieldLookup('nameaddress_vector', addr_tokens, lookup_type))
 
     return lookup

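A usage sketch for the widened signature (the token ids are made up; the
import path is the file patched above):

from nominatim.api.search.db_search_fields import lookup_by_any_name

name_tokens = [101, 102]   # full-name tokens, any one of which may match
addr_tokens = [201, 202]   # address partials narrowing the search

# The caller now picks the strategy for the address column explicitly;
# previously it was hard-wired to 'restrict'.
cheap = lookup_by_any_name(name_tokens, addr_tokens, 'restrict')
broad = lookup_by_any_name(name_tokens, addr_tokens, 'lookup_all')
# Either way the first entry stays a 'lookup_any' on name_vector; only
# the nameaddress_vector entry carries the chosen strategy.
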
nominatim/api/search/db_searches.py
index 34a4037a356b466fbb03b1e73c39583243620af7..d4b9c018b88b3eea556a7e52d2c72b9bb907aed4 100644 (file)
@@ -643,13 +643,16 @@ class PlaceSearch(AbstractSearch):
                                       .label('importance'))
             sql = sql.order_by(sa.desc(sa.text('importance')))
         else:
-            sql = sql.order_by(penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance),
-                                  else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40)))
+            if self.expected_count < 10000:
+                sql = sql.order_by(penalty - sa.case((tsearch.c.importance > 0, tsearch.c.importance),
+                                                     else_=0.75001-(sa.cast(tsearch.c.search_rank, sa.Float())/40)))
             sql = sql.add_columns(t.c.importance)
 
 
-        sql = sql.add_columns(penalty.label('accuracy'))\
-                 .order_by(sa.text('accuracy'))
+        sql = sql.add_columns(penalty.label('accuracy'))
+
+        if self.expected_count < 10000:
+            sql = sql.order_by(sa.text('accuracy'))
 
         if self.housenumbers:
             hnr_regexp = f"\\m({'|'.join(self.housenumbers.values)})\\M"
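
The net effect of this last hunk as a runnable toy (the table and column
names are placeholders rather than Nominatim's schema; only the
expected-count gate mirrors the diff):

import sqlalchemy as sa

metadata = sa.MetaData()
# Stand-in table; 'penalty' plays the role of the computed accuracy term.
places = sa.Table('placex', metadata,
                  sa.Column('place_id', sa.Integer),
                  sa.Column('penalty', sa.Float))

def build_query(expected_count: int, limit: int = 10):
    sql = sa.select(places.c.place_id,
                    places.c.penalty.label('accuracy'))
    # Mirror of the commit's gate: only sort when scoring every candidate
    # row is affordable; otherwise LIMIT takes the first rows returned.
    if expected_count < 10000:
        sql = sql.order_by(sa.text('accuracy'))
    return sql.limit(limit)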