From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 31 Jul 2023 12:27:39 +0000 (+0200)
Subject: do not lookup by address vector when only few tokens are available
X-Git-Tag: v4.3.0~42
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/afdbdb02a12193d75ba7670514779eb25fd3aa25?ds=inline

do not look up by address vector when only a few tokens are available

Names of countries and states are exceedingly rare in the word count
but are very frequent in addresses. A short name therefore risks
producing too many results when looked up by address vector.
---

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index fc444aa2..7c6d13f0 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -212,7 +212,7 @@ class SearchBuilder:
 
         exp_count = min(exp_count, min(t.count for t in addr_partials)) \
                     if addr_partials else exp_count
-        if exp_count < 1000 and partials_indexed:
+        if exp_count < 1000 and len(addr_tokens) > 3 and partials_indexed:
             # Lookup by address partials and restrict results through name terms.
             # Give this a small penalty because lookups in the address index are
             # more expensive
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py
index 63589ffc..0e5a8bfc 100644
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -332,9 +332,10 @@ def test_name_only_search_with_countries():
     assert not search.housenumbers.values
 
 
-def make_counted_searches(name_part, name_full, address_part, address_full):
+def make_counted_searches(name_part, name_full, address_part, address_full,
+                          num_address_parts=1):
     q = QueryStruct([Phrase(PhraseType.NONE, '')])
-    for i in range(2):
+    for i in range(1 + num_address_parts):
         q.add_node(BreakType.WORD, PhraseType.NONE)
     q.add_node(BreakType.END, PhraseType.NONE)
 
@@ -342,15 +343,16 @@ def make_counted_searches(name_part, name_full, address_part, address_full):
                 MyToken(0.5, 1, name_part, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
                 MyToken(0, 101, name_full, 'name_full', True))
-    q.add_token(TokenRange(1, 2), TokenType.PARTIAL,
-                MyToken(0.5, 2, address_part, 'address_part', True))
-    q.add_token(TokenRange(1, 2), TokenType.WORD,
-                MyToken(0, 102, address_full, 'address_full', True))
+    for i in range(num_address_parts):
+        q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
+                    MyToken(0.5, 2, address_part, 'address_part', True))
+        q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
+                    MyToken(0, 102, address_full, 'address_full', True))
 
     builder = SearchBuilder(q, SearchDetails())
 
     return list(builder.build(TokenAssignment(name=TokenRange(0, 1),
-                                              address=[TokenRange(1, 2)])))
+                                              address=[TokenRange(1, 1 + num_address_parts)])))
 
 
 def test_infrequent_partials_in_name():
@@ -368,7 +370,7 @@ def test_infrequent_partials_in_name():
 
 
 def test_frequent_partials_in_name_but_not_in_address():
-    searches = make_counted_searches(10000, 1, 1, 1)
+    searches = make_counted_searches(10000, 1, 1, 1, num_address_parts=4)
 
     assert len(searches) == 1
     search = searches[0]