From afdbdb02a12193d75ba7670514779eb25fd3aa25 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 31 Jul 2023 14:27:39 +0200
Subject: [PATCH] do not lookup by address vector when only few tokens are
 available

Names of countries and states are exceedingly rare in the word count
but are very frequent in the address. A short name has the danger
of producing too many results.
---
 nominatim/api/search/db_search_builder.py      |  2 +-
 .../api/search/test_db_search_builder.py       | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py
index fc444aa2..7c6d13f0 100644
--- a/nominatim/api/search/db_search_builder.py
+++ b/nominatim/api/search/db_search_builder.py
@@ -212,7 +212,7 @@ class SearchBuilder:
 
         exp_count = min(exp_count, min(t.count for t in addr_partials)) \
                     if addr_partials else exp_count
-        if exp_count < 1000 and partials_indexed:
+        if exp_count < 1000 and len(addr_tokens) > 3 and partials_indexed:
             # Lookup by address partials and restrict results through name terms.
             # Give this a small penalty because lookups in the address index are
             # more expensive
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py
index 63589ffc..0e5a8bfc 100644
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -332,9 +332,10 @@ def test_name_only_search_with_countries():
     assert not search.housenumbers.values
 
 
-def make_counted_searches(name_part, name_full, address_part, address_full):
+def make_counted_searches(name_part, name_full, address_part, address_full,
+                          num_address_parts=1):
     q = QueryStruct([Phrase(PhraseType.NONE, '')])
-    for i in range(2):
+    for i in range(1 + num_address_parts):
         q.add_node(BreakType.WORD, PhraseType.NONE)
     q.add_node(BreakType.END, PhraseType.NONE)
 
@@ -342,15 +343,16 @@ def make_counted_searches(name_part, name_full, address_part, address_full):
                 MyToken(0.5, 1, name_part, 'name_part', True))
     q.add_token(TokenRange(0, 1), TokenType.WORD,
                 MyToken(0, 101, name_full, 'name_full', True))
-    q.add_token(TokenRange(1, 2), TokenType.PARTIAL,
-                MyToken(0.5, 2, address_part, 'address_part', True))
-    q.add_token(TokenRange(1, 2), TokenType.WORD,
-                MyToken(0, 102, address_full, 'address_full', True))
+    for i in range(num_address_parts):
+        q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
+                    MyToken(0.5, 2, address_part, 'address_part', True))
+        q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
+                    MyToken(0, 102, address_full, 'address_full', True))
 
     builder = SearchBuilder(q, SearchDetails())
 
     return list(builder.build(TokenAssignment(name=TokenRange(0, 1),
-                                              address=[TokenRange(1, 2)])))
+                                              address=[TokenRange(1, 1 + num_address_parts)])))
 
 
 def test_infrequent_partials_in_name():
@@ -368,7 +370,7 @@ def test_infrequent_partials_in_name():
 
 
 def test_frequent_partials_in_name_but_not_in_address():
-    searches = make_counted_searches(10000, 1, 1, 1)
+    searches = make_counted_searches(10000, 1, 1, 1, num_address_parts=4)
 
     assert len(searches) == 1
     search = searches[0]
-- 
2.39.5