add tests for interaction of category parameter with category terms

[nominatim.git] / nominatim / api / search / db_search_builder.py
diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py

index 377c4be7da959b156b19d531ab5e1511fc5be515..f89d8b62e827178928bcc3c8d7fd210a30ef00a8 100644 (file)
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -7,7 +7,7 @@
  """
  Conversion from token assignment to an abstract DB search.
  """
  """
  Conversion from token assignment to an abstract DB search.
  """
-from typing import Optional, List, Tuple, Iterator
+from typing import Optional, List, Tuple, Iterator, Dict
  import heapq
  
  from nominatim.api.types import SearchDetails, DataLayer
  import heapq
  
  from nominatim.api.types import SearchDetails, DataLayer
@@ -89,12 +89,14 @@ class SearchBuilder:
          if sdata is None:
              return
  
          if sdata is None:
              return
  
-        categories = self.get_search_categories(assignment)
+        near_items = self.get_near_items(assignment)
+        if near_items is not None and not near_items:
+            return # impossible combination of near items and category parameter
  
          if assignment.name is None:
  
          if assignment.name is None:
-            if categories and not sdata.postcodes:
-                sdata.qualifiers = categories
-                categories = None
+            if near_items and not sdata.postcodes:
+                sdata.qualifiers = near_items
+                near_items = None
                  builder = self.build_poi_search(sdata)
              elif assignment.housenumber:
                  hnr_tokens = self.query.get_tokens(assignment.housenumber,
                  builder = self.build_poi_search(sdata)
              elif assignment.housenumber:
                  hnr_tokens = self.query.get_tokens(assignment.housenumber,
@@ -102,16 +104,16 @@ class SearchBuilder:
                  builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
              else:
                  builder = self.build_special_search(sdata, assignment.address,
                  builder = self.build_housenumber_search(sdata, hnr_tokens, assignment.address)
              else:
                  builder = self.build_special_search(sdata, assignment.address,
-                                                    bool(categories))
+                                                    bool(near_items))
          else:
              builder = self.build_name_search(sdata, assignment.name, assignment.address,
          else:
              builder = self.build_name_search(sdata, assignment.name, assignment.address,
-                                             bool(categories))
+                                             bool(near_items))
  
  
-        if categories:
-            penalty = min(categories.penalties)
-            categories.penalties = [p - penalty for p in categories.penalties]
+        if near_items:
+            penalty = min(near_items.penalties)
+            near_items.penalties = [p - penalty for p in near_items.penalties]
              for search in builder:
              for search in builder:
-                yield dbs.NearSearch(penalty + assignment.penalty, categories, search)
+                yield dbs.NearSearch(penalty + assignment.penalty, near_items, search)
          else:
              for search in builder:
                  search.penalty += assignment.penalty
          else:
              for search in builder:
                  search.penalty += assignment.penalty
@@ -206,45 +208,33 @@ class SearchBuilder:
  
          partials_indexed = all(t.is_indexed for t in name_partials) \
                             and all(t.is_indexed for t in addr_partials)
  
          partials_indexed = all(t.is_indexed for t in name_partials) \
                             and all(t.is_indexed for t in addr_partials)
-        exp_count = min(t.count for t in name_partials)
+        exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
  
  
-        if (len(name_partials) > 3 or exp_count < 3000) and partials_indexed:
+        if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
              yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
              return
  
              yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
              return
  
-        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
-
          # Partial term too frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, TokenType.WORD)
          # Partial term too frequent. Try looking up by rare full names first.
          name_fulls = self.query.get_tokens(name, TokenType.WORD)
-        rare_names = list(filter(lambda t: t.count < 10000, name_fulls))
+        fulls_count = sum(t.count for t in name_fulls)
          # At this point drop unindexed partials from the address.
          # This might yield wrong results, nothing we can do about that.
          if not partials_indexed:
              addr_tokens = [t.token for t in addr_partials if t.is_indexed]
              penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
          # At this point drop unindexed partials from the address.
          # This might yield wrong results, nothing we can do about that.
          if not partials_indexed:
              addr_tokens = [t.token for t in addr_partials if t.is_indexed]
              penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-        if rare_names:
-            # Any of the full names applies with all of the partials from the address
-            yield penalty, sum(t.count for t in rare_names),\
-                  dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens)
+        # Any of the full names applies with all of the partials from the address
+        yield penalty, fulls_count / (2**len(addr_partials)),\
+              dbf.lookup_by_any_name([t.token for t in name_fulls], addr_tokens,
+                                     'restrict' if fulls_count < 10000 else 'lookup_all')
  
          # To catch remaining results, lookup by name and address
          # We only do this if there is a reasonable number of results expected.
  
          # To catch remaining results, lookup by name and address
          # We only do this if there is a reasonable number of results expected.
-        if exp_count < 10000:
-            if all(t.is_indexed for t in name_partials):
-                lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
-            else:
-                # we don't have the partials, try with the non-rare names
-                non_rare_names = [t.token for t in name_fulls if t.count >= 10000]
-                if not non_rare_names:
-                    return
-                lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')]
+        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
+        if exp_count < 10000 and all(t.is_indexed for t in name_partials):
+            lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
              if addr_tokens:
                  lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
              if addr_tokens:
                  lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
-            penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens))
-            if len(rare_names) == len(name_fulls):
-                # if there already was a search for all full tokens,
-                # avoid this if anything has been found
-                penalty += 0.25
+            penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
              yield penalty, exp_count, lookup
  
  
              yield penalty, exp_count, lookup
  
  
@@ -333,8 +323,15 @@ class SearchBuilder:
                                self.query.get_tokens(assignment.postcode,
                                                      TokenType.POSTCODE))
          if assignment.qualifier:
                                self.query.get_tokens(assignment.postcode,
                                                      TokenType.POSTCODE))
          if assignment.qualifier:
-            sdata.set_qualifiers(self.query.get_tokens(assignment.qualifier,
-                                                       TokenType.QUALIFIER))
+            tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER)
+            if self.details.categories:
+                tokens = [t for t in tokens if t.get_category() in self.details.categories]
+                if not tokens:
+                    return None
+            sdata.set_qualifiers(tokens)
+        elif self.details.categories:
+            sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
+                                                      [0.0] * len(self.details.categories))
  
          if assignment.address:
              sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
  
          if assignment.address:
              sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
@@ -344,23 +341,22 @@ class SearchBuilder:
          return sdata
  
  
          return sdata
  
  
-    def get_search_categories(self,
-                              assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
-        """ Collect tokens for category search or use the categories
+    def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
+        """ Collect tokens for near items search or use the categories
              requested per parameter.
              Returns None if no category search is requested.
          """
              requested per parameter.
              Returns None if no category search is requested.
          """
-        if assignment.category:
-            tokens = [t for t in self.query.get_tokens(assignment.category,
-                                                       TokenType.CATEGORY)
-                      if not self.details.categories
-                         or t.get_category() in self.details.categories]
-            return dbf.WeightedCategories([t.get_category() for t in tokens],
-                                          [t.penalty for t in tokens])
-
-        if self.details.categories:
-            return dbf.WeightedCategories(self.details.categories,
-                                          [0.0] * len(self.details.categories))
+        if assignment.near_item:
+            tokens: Dict[Tuple[str, str], float] = {}
+            for t in self.query.get_tokens(assignment.near_item, TokenType.NEAR_ITEM):
+                cat = t.get_category()
+                # The category of a near search will be that of near_item.
+                # Thus, if search is restricted to a category parameter,
+                # the two sets must intersect.
+                if (not self.details.categories or cat in self.details.categories)\
+                   and t.penalty < tokens.get(cat, 1000.0):
+                    tokens[cat] = t.penalty
+            return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
  
          return None
  
  
          return None