From afdbdb02a12193d75ba7670514779eb25fd3aa25 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Mon, 31 Jul 2023 14:27:39 +0200 Subject: [PATCH] do not look up by address vector when only a few tokens are available Names of countries and states are exceedingly rare in the word count but are very frequent in the address. A short name runs the risk of producing too many results. --- nominatim/api/search/db_search_builder.py | 2 +- .../api/search/test_db_search_builder.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/nominatim/api/search/db_search_builder.py b/nominatim/api/search/db_search_builder.py index fc444aa2..7c6d13f0 100644 --- a/nominatim/api/search/db_search_builder.py +++ b/nominatim/api/search/db_search_builder.py @@ -212,7 +212,7 @@ class SearchBuilder: exp_count = min(exp_count, min(t.count for t in addr_partials)) \ if addr_partials else exp_count - if exp_count < 1000 and partials_indexed: + if exp_count < 1000 and len(addr_tokens) > 3 and partials_indexed: # Lookup by address partials and restrict results through name terms. 
# Give this a small penalty because lookups in the address index are # more expensive diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py index 63589ffc..0e5a8bfc 100644 --- a/test/python/api/search/test_db_search_builder.py +++ b/test/python/api/search/test_db_search_builder.py @@ -332,9 +332,10 @@ def test_name_only_search_with_countries(): assert not search.housenumbers.values -def make_counted_searches(name_part, name_full, address_part, address_full): +def make_counted_searches(name_part, name_full, address_part, address_full, + num_address_parts=1): q = QueryStruct([Phrase(PhraseType.NONE, '')]) - for i in range(2): + for i in range(1 + num_address_parts): q.add_node(BreakType.WORD, PhraseType.NONE) q.add_node(BreakType.END, PhraseType.NONE) @@ -342,15 +343,16 @@ def make_counted_searches(name_part, name_full, address_part, address_full): MyToken(0.5, 1, name_part, 'name_part', True)) q.add_token(TokenRange(0, 1), TokenType.WORD, MyToken(0, 101, name_full, 'name_full', True)) - q.add_token(TokenRange(1, 2), TokenType.PARTIAL, - MyToken(0.5, 2, address_part, 'address_part', True)) - q.add_token(TokenRange(1, 2), TokenType.WORD, - MyToken(0, 102, address_full, 'address_full', True)) + for i in range(num_address_parts): + q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL, + MyToken(0.5, 2, address_part, 'address_part', True)) + q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD, + MyToken(0, 102, address_full, 'address_full', True)) builder = SearchBuilder(q, SearchDetails()) return list(builder.build(TokenAssignment(name=TokenRange(0, 1), - address=[TokenRange(1, 2)]))) + address=[TokenRange(1, 1 + num_address_parts)]))) def test_infrequent_partials_in_name(): @@ -368,7 +370,7 @@ def test_infrequent_partials_in_name(): def test_frequent_partials_in_name_but_not_in_address(): - searches = make_counted_searches(10000, 1, 1, 1) + searches = make_counted_searches(10000, 1, 1, 1, 
num_address_parts=4) assert len(searches) == 1 search = searches[0] -- 2.39.5