From a7f5c6c8f54eea4e3d9746141b2e0ac2d5722a4a Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Sun, 26 Nov 2023 20:58:50 +0100
Subject: [PATCH] drop category tokens when they make up a full phrase

---
 nominatim/api/search/query.py                 | 14 ++++---
 .../api/search/test_api_search_query.py       | 37 ++++++++++++++++++-
 .../api/search/test_db_search_builder.py      | 13 +++----
 .../api/search/test_token_assignment.py       | 16 +++-----
 4 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py
index 5d75eb0f..4bf009a5 100644
--- a/nominatim/api/search/query.py
+++ b/nominatim/api/search/query.py
@@ -70,14 +70,16 @@ class PhraseType(enum.Enum):
     COUNTRY = enum.auto()
     """ Contains the country name or code. """

-    def compatible_with(self, ttype: TokenType) -> bool:
+    def compatible_with(self, ttype: TokenType,
+                        is_full_phrase: bool) -> bool:
         """ Check if the given token type can be used with the phrase type.
         """
         if self == PhraseType.NONE:
-            return True
+            return not is_full_phrase or ttype != TokenType.QUALIFIER
         if self == PhraseType.AMENITY:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL,
-                             TokenType.QUALIFIER, TokenType.CATEGORY)
+            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
+                   or (is_full_phrase and ttype == TokenType.CATEGORY)\
+                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
         if self == PhraseType.STREET:
             return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
         if self == PhraseType.POSTCODE:
@@ -244,7 +246,9 @@ class QueryStruct:
             be added to, then the token is silently dropped.
         """
         snode = self.nodes[trange.start]
-        if snode.ptype.compatible_with(ttype):
+        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
+                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        if snode.ptype.compatible_with(ttype, full_phrase):
             tlist = snode.get_tokens(trange.end, ttype)
             if tlist is None:
                 snode.starting.append(TokenList(trange.end, ttype, [token]))
diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py
index f8c9c2dc..69a17412 100644
--- a/test/python/api/search/test_api_search_query.py
+++ b/test/python/api/search/test_api_search_query.py
@@ -28,12 +28,12 @@ def mktoken(tid: int):
                                          ('COUNTRY', 'COUNTRY'),
                                          ('POSTCODE', 'POSTCODE')])
 def test_phrase_compatible(ptype, ttype):
-    assert query.PhraseType[ptype].compatible_with(query.TokenType[ttype])
+    assert query.PhraseType[ptype].compatible_with(query.TokenType[ttype], False)


 @pytest.mark.parametrize('ptype', ['COUNTRY', 'POSTCODE'])
 def test_phrase_incompatible(ptype):
-    assert not query.PhraseType[ptype].compatible_with(query.TokenType.PARTIAL)
+    assert not query.PhraseType[ptype].compatible_with(query.TokenType.PARTIAL, True)


 def test_query_node_empty():
@@ -99,3 +99,36 @@ def test_query_struct_incompatible_token():

     assert q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL) == []
     assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.COUNTRY)) == 1
+
+
+def test_query_struct_amenity_single_word():
+    q = query.QueryStruct([query.Phrase(query.PhraseType.AMENITY, 'bar')])
+    q.add_node(query.BreakType.END, query.PhraseType.NONE)
+
+    q.add_token(query.TokenRange(0, 1), query.TokenType.PARTIAL, mktoken(1))
+    q.add_token(query.TokenRange(0, 1), query.TokenType.CATEGORY, mktoken(2))
+    q.add_token(query.TokenRange(0, 1), query.TokenType.QUALIFIER, mktoken(3))
+
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.CATEGORY)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.QUALIFIER)) == 0
+
+
+def test_query_struct_amenity_two_words():
+    q = query.QueryStruct([query.Phrase(query.PhraseType.AMENITY, 'foo bar')])
+    q.add_node(query.BreakType.WORD, query.PhraseType.AMENITY)
+    q.add_node(query.BreakType.END, query.PhraseType.NONE)
+
+    for trange in [(0, 1), (1, 2)]:
+        q.add_token(query.TokenRange(*trange), query.TokenType.PARTIAL, mktoken(1))
+        q.add_token(query.TokenRange(*trange), query.TokenType.CATEGORY, mktoken(2))
+        q.add_token(query.TokenRange(*trange), query.TokenType.QUALIFIER, mktoken(3))
+
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.CATEGORY)) == 0
+    assert len(q.get_tokens(query.TokenRange(0, 1), query.TokenType.QUALIFIER)) == 1
+
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.PARTIAL)) == 1
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.CATEGORY)) == 0
+    assert len(q.get_tokens(query.TokenRange(1, 2), query.TokenType.QUALIFIER)) == 1
+
diff --git a/test/python/api/search/test_db_search_builder.py b/test/python/api/search/test_db_search_builder.py
index c93b8ead..c10a6c77 100644
--- a/test/python/api/search/test_db_search_builder.py
+++ b/test/python/api/search/test_db_search_builder.py
@@ -21,21 +21,18 @@ class MyToken(Token):


 def make_query(*args):
-    q = None
+    q = QueryStruct([Phrase(PhraseType.NONE, '')])

-    for tlist in args:
-        if q is None:
-            q = QueryStruct([Phrase(PhraseType.NONE, '')])
-        else:
-            q.add_node(BreakType.WORD, PhraseType.NONE)
+    for _ in range(max(inner[0] for tlist in args for inner in tlist)):
+        q.add_node(BreakType.WORD, PhraseType.NONE)
+    q.add_node(BreakType.END, PhraseType.NONE)

-        start = len(q.nodes) - 1
+    for start, tlist in enumerate(args):
         for end, ttype, tinfo in tlist:
             for tid, word in tinfo:
                 q.add_token(TokenRange(start, end), ttype,
                             MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0,
                                     tid, 1, word, True))
-    q.add_node(BreakType.END, PhraseType.NONE)

     return q

diff --git a/test/python/api/search/test_token_assignment.py b/test/python/api/search/test_token_assignment.py
index dc123403..6dc25b1e 100644
--- a/test/python/api/search/test_token_assignment.py
+++ b/test/python/api/search/test_token_assignment.py
@@ -18,21 +18,17 @@ class MyToken(Token):


 def make_query(*args):
-    q = None
+    q = QueryStruct([Phrase(args[0][1], '')])
     dummy = MyToken(3.0, 45, 1, 'foo', True)

-    for btype, ptype, tlist in args:
-        if q is None:
-            q = QueryStruct([Phrase(ptype, '')])
-        else:
-            q.add_node(btype, ptype)
+    for btype, ptype, _ in args[1:]:
+        q.add_node(btype, ptype)
+    q.add_node(BreakType.END, PhraseType.NONE)

-        start = len(q.nodes) - 1
-        for end, ttype in tlist:
+    for start, t in enumerate(args):
+        for end, ttype in t[2]:
             q.add_token(TokenRange(start, end), ttype, dummy)

-    q.add_node(BreakType.END, PhraseType.NONE)
-
     return q


-- 
2.39.5
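
Not part of the patch, but as a reading aid: a minimal sketch of the behaviour the new is_full_phrase flag is meant to enforce, assuming the patched PhraseType and TokenType enums are importable from nominatim.api.search.query just as in the tests above. QueryStruct.add_token derives the flag from the break types of the nodes that delimit the token's range (START or PHRASE at the start node, PHRASE or END at the end node), i.e. from whether the token covers a complete phrase.

# Illustration only -- not part of the patch. Assumes the patched module
# nominatim.api.search.query is importable, as in the tests above.
from nominatim.api.search.query import PhraseType, TokenType

# In an amenity phrase, the CATEGORY reading survives only when the term
# spans the whole phrase; the QUALIFIER reading only when it does not.
assert PhraseType.AMENITY.compatible_with(TokenType.CATEGORY, True)
assert not PhraseType.AMENITY.compatible_with(TokenType.CATEGORY, False)
assert not PhraseType.AMENITY.compatible_with(TokenType.QUALIFIER, True)
assert PhraseType.AMENITY.compatible_with(TokenType.QUALIFIER, False)

# For untyped (NONE) phrases only the QUALIFIER reading is dropped when
# the term makes up the full phrase.
assert not PhraseType.NONE.compatible_with(TokenType.QUALIFIER, True)
assert PhraseType.NONE.compatible_with(TokenType.QUALIFIER, False)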