]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/api/search/db_search_builder.py
Merge pull request #3385 from mtmail/docs-reverse-only
[nominatim.git] / nominatim / api / search / db_search_builder.py
index fd8cc7af90ffb3aa71581aac842e602d82cc0d39..c2f98c4775621719faa359f72b8c36137613edee 100644 (file)
@@ -5,7 +5,7 @@
 # Copyright (C) 2023 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
 # Copyright (C) 2023 by the Nominatim developer community.
 # For a full list of authors see the git log.
 """
-Convertion from token assignment to an abstract DB search.
+Conversion from token assignment to an abstract DB search.
 """
 from typing import Optional, List, Tuple, Iterator, Dict
 import heapq
 """
 from typing import Optional, List, Tuple, Iterator, Dict
 import heapq
@@ -166,21 +166,22 @@ class SearchBuilder:
         sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
         expected_count = sum(t.count for t in hnrs)
 
         sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], lookups.LookupAny)]
         expected_count = sum(t.count for t in hnrs)
 
-        partials = [t for trange in address
-                       for t in self.query.get_partials_list(trange)]
+        partials = {t.token: t.addr_count for trange in address
+                       for t in self.query.get_partials_list(trange)}
 
         if expected_count < 8000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
 
         if expected_count < 8000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], lookups.Restrict))
-        elif len(partials) != 1 or partials[0].count < 10000:
+                                                 list(partials), lookups.Restrict))
+        elif len(partials) != 1 or list(partials.values())[0] < 10000:
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
             sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
-                                                 [t.token for t in partials], lookups.LookupAll))
+                                                 list(partials), lookups.LookupAll))
         else:
         else:
+            addr_fulls = [t.token for t
+                          in self.query.get_tokens(address[0], TokenType.WORD)]
+            if len(addr_fulls) > 5:
+                return
             sdata.lookups.append(
             sdata.lookups.append(
-                dbf.FieldLookup('nameaddress_vector',
-                                [t.token for t
-                                 in self.query.get_tokens(address[0], TokenType.WORD)],
-                                lookups.LookupAny))
+                dbf.FieldLookup('nameaddress_vector', addr_fulls, lookups.LookupAny))
 
         sdata.housenumbers = dbf.WeightedStrings([], [])
         yield dbs.PlaceSearch(0.05, sdata, expected_count)
 
         sdata.housenumbers = dbf.WeightedStrings([], [])
         yield dbs.PlaceSearch(0.05, sdata, expected_count)
@@ -208,46 +209,96 @@ class SearchBuilder:
             are and tries to find a lookup that optimizes index use.
         """
         penalty = 0.0 # extra penalty
             are and tries to find a lookup that optimizes index use.
         """
         penalty = 0.0 # extra penalty
-        name_partials = self.query.get_partials_list(name)
-        name_tokens = [t.token for t in name_partials]
+        name_partials = {t.token: t for t in self.query.get_partials_list(name)}
 
         addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
 
         addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
-        addr_tokens = [t.token for t in addr_partials]
+        addr_tokens = list({t.token for t in addr_partials})
 
 
-        partials_indexed = all(t.is_indexed for t in name_partials) \
+        partials_indexed = all(t.is_indexed for t in name_partials.values()) \
                            and all(t.is_indexed for t in addr_partials)
                            and all(t.is_indexed for t in addr_partials)
-        exp_count = min(t.count for t in name_partials) / (2**(len(name_partials) - 1))
+        exp_count = min(t.count for t in name_partials.values()) / (2**(len(name_partials) - 1))
 
         if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
 
         if (len(name_partials) > 3 or exp_count < 8000) and partials_indexed:
-            yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
+            yield penalty, exp_count, dbf.lookup_by_names(list(name_partials.keys()), addr_tokens)
             return
 
             return
 
+        addr_count = min(t.addr_count for t in addr_partials) if addr_partials else 30000
         # Partial term to frequent. Try looking up by rare full names first.
         name_fulls = self.query.get_tokens(name, TokenType.WORD)
         if name_fulls:
             fulls_count = sum(t.count for t in name_fulls)
         # Partial term to frequent. Try looking up by rare full names first.
         name_fulls = self.query.get_tokens(name, TokenType.WORD)
         if name_fulls:
             fulls_count = sum(t.count for t in name_fulls)
-            # At this point drop unindexed partials from the address.
-            # This might yield wrong results, nothing we can do about that.
-            if not partials_indexed:
-                addr_tokens = [t.token for t in addr_partials if t.is_indexed]
+            if partials_indexed:
                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
                 penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
-            # Any of the full names applies with all of the partials from the address
-            yield penalty, fulls_count / (2**len(addr_partials)),\
-                  dbf.lookup_by_any_name([t.token for t in name_fulls],
-                                         addr_tokens, fulls_count > 10000)
+
+            if fulls_count < 50000 or addr_count < 30000:
+                yield penalty,fulls_count / (2**len(addr_tokens)), \
+                    self.get_full_name_ranking(name_fulls, addr_partials,
+                                               fulls_count > 30000 / max(1, len(addr_tokens)))
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
 
         # To catch remaining results, lookup by name and address
         # We only do this if there is a reasonable number of results expected.
-        exp_count = exp_count / (2**len(addr_partials)) if addr_partials else exp_count
-        if exp_count < 10000 and all(t.is_indexed for t in name_partials):
-            lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
-            if addr_tokens:
-                lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, lookups.LookupAll))
-            penalty += 0.35 * max(0, 5 - len(name_partials) - len(addr_tokens))
-            yield penalty, exp_count, lookup
+        exp_count = exp_count / (2**len(addr_tokens)) if addr_tokens else exp_count
+        if exp_count < 10000 and addr_count < 20000\
+           and all(t.is_indexed for t in name_partials.values()):
+            penalty += 0.35 * max(1 if name_fulls else 0.1,
+                                  5 - len(name_partials) - len(addr_tokens))
+            yield penalty, exp_count,\
+                  self.get_name_address_ranking(list(name_partials.keys()), addr_partials)
+
+
+    def get_name_address_ranking(self, name_tokens: List[int],
+                                 addr_partials: List[Token]) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression looking up by name and address.
+        """
+        lookup = [dbf.FieldLookup('name_vector', name_tokens, lookups.LookupAll)]
+
+        addr_restrict_tokens = []
+        addr_lookup_tokens = []
+        for t in addr_partials:
+            if t.is_indexed:
+                if t.addr_count > 20000:
+                    addr_restrict_tokens.append(t.token)
+                else:
+                    addr_lookup_tokens.append(t.token)
+
+        if addr_restrict_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_restrict_tokens, lookups.Restrict))
+        if addr_lookup_tokens:
+            lookup.append(dbf.FieldLookup('nameaddress_vector',
+                                          addr_lookup_tokens, lookups.LookupAll))
+
+        return lookup
+
+
+    def get_full_name_ranking(self, name_fulls: List[Token], addr_partials: List[Token],
+                              use_lookup: bool) -> List[dbf.FieldLookup]:
+        """ Create a ranking expression with full name terms and
+            additional address lookup. When 'use_lookup' is true, then
+            address lookups will use the index, when the occurences are not
+            too many.
+        """
+        # At this point drop unindexed partials from the address.
+        # This might yield wrong results, nothing we can do about that.
+        if use_lookup:
+            addr_restrict_tokens = []
+            addr_lookup_tokens = []
+            for t in addr_partials:
+                if t.is_indexed:
+                    if t.addr_count > 20000:
+                        addr_restrict_tokens.append(t.token)
+                    else:
+                        addr_lookup_tokens.append(t.token)
+        else:
+            addr_restrict_tokens = [t.token for t in addr_partials if t.is_indexed]
+            addr_lookup_tokens = []
 
 
+        return dbf.lookup_by_any_name([t.token for t in name_fulls],
+                                      addr_restrict_tokens, addr_lookup_tokens)
 
 
-    def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
+
+    def get_name_ranking(self, trange: TokenRange,
+                         db_field: str = 'name_vector') -> dbf.FieldRanking:
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, TokenType.WORD)
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, TokenType.WORD)
@@ -256,7 +307,7 @@ class SearchBuilder:
         # Fallback, sum of penalty for partials
         name_partials = self.query.get_partials_list(trange)
         default = sum(t.penalty for t in name_partials) + 0.2
         # Fallback, sum of penalty for partials
         name_partials = self.query.get_partials_list(trange)
         default = sum(t.penalty for t in name_partials) + 0.2
-        return dbf.FieldRanking('name_vector', default, ranks)
+        return dbf.FieldRanking(db_field, default, ranks)
 
 
     def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
 
 
     def get_addr_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
@@ -314,11 +365,9 @@ class SearchBuilder:
         sdata = dbf.SearchData()
         sdata.penalty = assignment.penalty
         if assignment.country:
         sdata = dbf.SearchData()
         sdata.penalty = assignment.penalty
         if assignment.country:
-            tokens = self.query.get_tokens(assignment.country, TokenType.COUNTRY)
-            if self.details.countries:
-                tokens = [t for t in tokens if t.lookup_word in self.details.countries]
-                if not tokens:
-                    return None
+            tokens = self.get_country_tokens(assignment.country)
+            if not tokens:
+                return None
             sdata.set_strings('countries', tokens)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
             sdata.set_strings('countries', tokens)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
@@ -332,24 +381,54 @@ class SearchBuilder:
                               self.query.get_tokens(assignment.postcode,
                                                     TokenType.POSTCODE))
         if assignment.qualifier:
                               self.query.get_tokens(assignment.postcode,
                                                     TokenType.POSTCODE))
         if assignment.qualifier:
-            tokens = self.query.get_tokens(assignment.qualifier, TokenType.QUALIFIER)
-            if self.details.categories:
-                tokens = [t for t in tokens if t.get_category() in self.details.categories]
-                if not tokens:
-                    return None
+            tokens = self.get_qualifier_tokens(assignment.qualifier)
+            if not tokens:
+                return None
             sdata.set_qualifiers(tokens)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
             sdata.set_qualifiers(tokens)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
-            sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
+            if not assignment.name and assignment.housenumber:
+                # housenumber search: the first item needs to be handled like
+                # a name in ranking or penalties are not comparable with
+                # normal searches.
+                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
+                                                         db_field='nameaddress_vector')]
+                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
+            else:
+                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
         else:
             sdata.rankings = []
 
         return sdata
 
 
         else:
             sdata.rankings = []
 
         return sdata
 
 
+    def get_country_tokens(self, trange: TokenRange) -> List[Token]:
+        """ Return the list of country tokens for the given range,
+            optionally filtered by the country list from the details
+            parameters.
+        """
+        tokens = self.query.get_tokens(trange, TokenType.COUNTRY)
+        if self.details.countries:
+            tokens = [t for t in tokens if t.lookup_word in self.details.countries]
+
+        return tokens
+
+
+    def get_qualifier_tokens(self, trange: TokenRange) -> List[Token]:
+        """ Return the list of qualifier tokens for the given range,
+            optionally filtered by the qualifier list from the details
+            parameters.
+        """
+        tokens = self.query.get_tokens(trange, TokenType.QUALIFIER)
+        if self.details.categories:
+            tokens = [t for t in tokens if t.get_category() in self.details.categories]
+
+        return tokens
+
+
     def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
         """ Collect tokens for near items search or use the categories
             requested per parameter.
     def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCategories]:
         """ Collect tokens for near items search or use the categories
             requested per parameter.