]> git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_token_assignment.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / test / python / api / search / test_token_assignment.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2025 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Test for creation of token assignments from tokenized queries.
9 """
10 import pytest
11
12 from nominatim_api.search.query import QueryStruct, Phrase, TokenRange, Token
13 import nominatim_api.search.query as qmod
14 from nominatim_api.search.token_assignment import (yield_token_assignments,
15                                                    TokenAssignment,
16                                                    PENALTY_TOKENCHANGE)
17
18
class MyToken(Token):
    """Token stand-in for these tests that always reports the same category."""

    def get_category(self):
        """Return a constant two-part category."""
        category = ('this', 'that')
        return category
22
23
def make_query(*args):
    """Assemble a single-phrase QueryStruct from node descriptions.

    Each positional argument is a ``(break type, phrase type, tokens)``
    triple, where ``tokens`` is a list of ``(end node, token type)`` pairs.
    The position of a triple in ``args`` is used as the start node of its
    tokens. The phrase is created with the phrase type of the first triple
    and a closing ``BREAK_END`` node is appended after all other nodes.
    All tokens share one dummy ``MyToken`` instance.
    """
    query = QueryStruct([Phrase(args[0][1], '')])
    placeholder = MyToken(penalty=3.0, token=45, count=1, addr_count=1, lookup_word='foo')

    # The first triple only contributes its phrase type and its tokens;
    # no node is added for it.
    for node in args[1:]:
        break_type, phrase_type, _ = node
        query.add_node(break_type, phrase_type)
    query.add_node(qmod.BREAK_END, qmod.PHRASE_ANY)

    for start, node in enumerate(args):
        for end, token_type in node[2]:
            query.add_token(TokenRange(start, end), token_type, placeholder)

    return query
38
39
def check_assignments(actual, *expected):
    """Assert that ``actual`` yields exactly the ``expected`` assignments.

    The order in which the assignments are produced is irrelevant, but
    every expected assignment must be matched by exactly one produced
    assignment.

    Raises an AssertionError when an assignment is produced that is not
    (or no longer) expected, or when expected assignments are left over
    after ``actual`` is exhausted.
    """
    todo = list(expected)
    for assignment in actual:
        assert assignment in todo, f"Unexpected assignment: {assignment}"
        # Each expectation may be satisfied only once.
        todo.remove(assignment)

    # Report only what is still outstanding, not the complete expectation.
    assert not todo, f"Missing assignments: {todo}"
47
48
def test_query_with_missing_tokens():
    """A query that has no tokens at all must not produce any assignment."""
    query = QueryStruct([Phrase(qmod.PHRASE_ANY, '')])
    query.add_node(qmod.BREAK_END, qmod.PHRASE_ANY)

    assignments = list(yield_token_assignments(query))

    assert assignments == []
54
55
def test_one_word_query():
    """One node with partial, word and housenumber tokens yields a single name assignment."""
    tokens = [(1, qmod.TOKEN_PARTIAL),
              (1, qmod.TOKEN_WORD),
              (1, qmod.TOKEN_HOUSENUMBER)]
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, tokens))

    assignments = list(yield_token_assignments(query))

    assert assignments == [TokenAssignment(name=TokenRange(0, 1))]
64
65
def test_single_postcode():
    """A lone postcode token is assigned as postcode."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_POSTCODE)]))

    assignments = list(yield_token_assignments(query))

    assert assignments == [TokenAssignment(postcode=TokenRange(0, 1))]
72
73
def test_single_country_name():
    """A lone country token is assigned as country."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_COUNTRY)]))

    assignments = list(yield_token_assignments(query))

    assert assignments == [TokenAssignment(country=TokenRange(0, 1))]
80
81
def test_single_word_poi_search():
    """One node with near-item and qualifier tokens yields only a near-item assignment."""
    tokens = [(1, qmod.TOKEN_NEAR_ITEM),
              (1, qmod.TOKEN_QUALIFIER)]
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, tokens))

    assignments = list(yield_token_assignments(query))

    assert assignments == [TokenAssignment(near_item=TokenRange(0, 1))]
89
90
@pytest.mark.parametrize('btype', [qmod.BREAK_WORD, qmod.BREAK_PART, qmod.BREAK_TOKEN])
def test_multiple_simple_words(btype):
    """Three partial words form one name or are split into name and address.

    Every split is expected to carry the token-change penalty that is
    configured for the break type under test.
    """
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (btype, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                       (btype, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]))

    split_penalty = PENALTY_TOKENCHANGE[btype]

    # (name range, address range) for each expected split of the words
    splits = [((0, 2), (2, 3)),
              ((0, 1), (1, 3)),
              ((1, 3), (0, 1)),
              ((2, 3), (0, 2))]

    expected = [TokenAssignment(name=TokenRange(0, 3))]
    for name, address in splits:
        expected.append(TokenAssignment(penalty=split_penalty,
                                        name=TokenRange(*name),
                                        address=[TokenRange(*address)]))

    check_assignments(yield_token_assignments(query), *expected)
109
110
def test_multiple_words_respect_phrase_break():
    """Two partial words separated by a phrase break are never joined into one name."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]))

    first_is_name = TokenAssignment(name=TokenRange(0, 1), address=[TokenRange(1, 2)])
    second_is_name = TokenAssignment(name=TokenRange(1, 2), address=[TokenRange(0, 1)])

    check_assignments(yield_token_assignments(query), first_is_name, second_is_name)
120
121
def test_housenumber_and_street():
    """A housenumber followed by a partial word: the word is either name or address."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_HOUSENUMBER)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]))

    word_as_name = TokenAssignment(housenumber=TokenRange(0, 1), name=TokenRange(1, 2))
    word_as_address = TokenAssignment(housenumber=TokenRange(0, 1),
                                      address=[TokenRange(1, 2)])

    check_assignments(yield_token_assignments(query), word_as_name, word_as_address)
131
132
def test_housenumber_and_street_backwards():
    """A partial word followed by a housenumber: the word is either name or address."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_HOUSENUMBER)]))

    word_as_name = TokenAssignment(housenumber=TokenRange(1, 2), name=TokenRange(0, 1))
    word_as_address = TokenAssignment(housenumber=TokenRange(1, 2),
                                      address=[TokenRange(0, 1)])

    check_assignments(yield_token_assignments(query), word_as_name, word_as_address)
142
143
def test_housenumber_and_postcode():
    """Word, housenumber, word, postcode: the first word is either name or address."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_HOUSENUMBER)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(4, qmod.TOKEN_POSTCODE)]))

    housenumber = TokenRange(1, 2)
    postcode = TokenRange(3, 4)

    expected = [
        TokenAssignment(penalty=pytest.approx(0.3),
                        name=TokenRange(0, 1),
                        housenumber=housenumber,
                        address=[TokenRange(2, 3)],
                        postcode=postcode),
        TokenAssignment(penalty=pytest.approx(0.3),
                        housenumber=housenumber,
                        address=[TokenRange(0, 1), TokenRange(2, 3)],
                        postcode=postcode),
    ]

    check_assignments(yield_token_assignments(query), *expected)
160
161
def test_postcode_and_housenumber():
    """Word, postcode, word, housenumber: the second word is either name or address."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_POSTCODE)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(4, qmod.TOKEN_HOUSENUMBER)]))

    postcode = TokenRange(1, 2)
    housenumber = TokenRange(3, 4)

    expected = [
        TokenAssignment(penalty=pytest.approx(0.3),
                        name=TokenRange(2, 3),
                        housenumber=housenumber,
                        address=[TokenRange(0, 1)],
                        postcode=postcode),
        TokenAssignment(penalty=pytest.approx(0.3),
                        housenumber=housenumber,
                        address=[TokenRange(0, 1), TokenRange(2, 3)],
                        postcode=postcode),
    ]

    check_assignments(yield_token_assignments(query), *expected)
178
179
def test_country_housenumber_postcode():
    """Country, word, housenumber, postcode in this order yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_COUNTRY)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_HOUSENUMBER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(4, qmod.TOKEN_POSTCODE)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
187
188
@pytest.mark.parametrize('ttype', [qmod.TOKEN_POSTCODE,
                                   qmod.TOKEN_COUNTRY,
                                   qmod.TOKEN_NEAR_ITEM,
                                   qmod.TOKEN_QUALIFIER])
def test_housenumber_with_only_special_terms(ttype):
    """A housenumber followed only by a special term yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_HOUSENUMBER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, ttype)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
196
197
@pytest.mark.parametrize('ttype', [qmod.TOKEN_POSTCODE,
                                   qmod.TOKEN_HOUSENUMBER,
                                   qmod.TOKEN_COUNTRY])
def test_multiple_special_tokens(ttype):
    """The same special token type on both sides of a word yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, ttype)]),
             (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(3, ttype)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
205
206
def test_housenumber_many_phrases():
    """Three word phrases, then a housenumber and a word in the last phrase.

    The leading phrases always end up as separate address parts; the word
    after the housenumber is either the name or one more address part.
    """
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(4, qmod.TOKEN_HOUSENUMBER)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(5, qmod.TOKEN_PARTIAL)]))

    housenumber = TokenRange(3, 4)

    last_word_is_name = TokenAssignment(
        penalty=0.1,
        name=TokenRange(4, 5),
        housenumber=housenumber,
        address=[TokenRange(0, 1), TokenRange(1, 2), TokenRange(2, 3)])
    last_word_is_address = TokenAssignment(
        penalty=0.1,
        housenumber=housenumber,
        address=[TokenRange(0, 1), TokenRange(1, 2), TokenRange(2, 3), TokenRange(4, 5)])

    check_assignments(yield_token_assignments(query),
                      last_word_is_name, last_word_is_address)
224
225
def test_country_at_beginning():
    """A country token before a word yields country plus name with a 0.1 penalty."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_COUNTRY)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]))

    expected = TokenAssignment(penalty=0.1,
                               country=TokenRange(0, 1),
                               name=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
233
234
def test_country_at_end():
    """A country token after a word yields name plus country with a 0.1 penalty."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_COUNTRY)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(0, 1),
                               country=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
242
243
def test_country_in_middle():
    """A country token between two words yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_COUNTRY)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
250
251
def test_postcode_with_designation():
    """Postcode phrase before a word phrase: only the name reading is penalised."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_POSTCODE)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]))

    postcode = TokenRange(0, 1)
    word_as_name = TokenAssignment(penalty=0.1, postcode=postcode, name=TokenRange(1, 2))
    word_as_address = TokenAssignment(postcode=postcode, address=[TokenRange(1, 2)])

    check_assignments(yield_token_assignments(query), word_as_name, word_as_address)
261
262
def test_postcode_with_designation_backwards():
    """Word phrase before a postcode phrase: only the address reading is penalised."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_POSTCODE)]))

    postcode = TokenRange(1, 2)
    word_as_name = TokenAssignment(postcode=postcode, name=TokenRange(0, 1))
    word_as_address = TokenAssignment(penalty=0.1, postcode=postcode,
                                      address=[TokenRange(0, 1)])

    check_assignments(yield_token_assignments(query), word_as_name, word_as_address)
272
273
def test_near_item_at_beginning():
    """A near-item token before a word yields near item plus name with a 0.1 penalty."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_NEAR_ITEM)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]))

    expected = TokenAssignment(penalty=0.1,
                               near_item=TokenRange(0, 1),
                               name=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
281
282
def test_near_item_at_end():
    """A near-item token after a word yields name plus near item with a 0.1 penalty."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_NEAR_ITEM)]))

    expected = TokenAssignment(penalty=0.1,
                               name=TokenRange(0, 1),
                               near_item=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
290
291
def test_near_item_in_middle():
    """A near-item token between two words yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_NEAR_ITEM)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
298
299
def test_qualifier_at_beginning():
    """A leading qualifier followed by two words: both words, or only the first, are the name."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_QUALIFIER)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]))

    qualifier = TokenRange(0, 1)
    both_words_name = TokenAssignment(penalty=0.1,
                                      qualifier=qualifier,
                                      name=TokenRange(1, 3))
    name_then_address = TokenAssignment(penalty=0.2,
                                        qualifier=qualifier,
                                        name=TokenRange(1, 2),
                                        address=[TokenRange(2, 3)])

    check_assignments(yield_token_assignments(query), both_words_name, name_then_address)
311
312
def test_qualifier_after_name():
    """A qualifier between two word pairs: either pair is the name, the other the address."""
    query = make_query((qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_QUALIFIER)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(4, qmod.TOKEN_PARTIAL)]),
                       (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(5, qmod.TOKEN_PARTIAL)]))

    qualifier = TokenRange(2, 3)
    name_in_front = TokenAssignment(penalty=0.2,
                                    qualifier=qualifier,
                                    name=TokenRange(0, 2),
                                    address=[TokenRange(3, 5)])
    name_behind = TokenAssignment(penalty=0.2,
                                  qualifier=qualifier,
                                  name=TokenRange(3, 5),
                                  address=[TokenRange(0, 2)])

    check_assignments(yield_token_assignments(query), name_in_front, name_behind)
327
328
def test_qualifier_before_housenumber():
    """Qualifier, housenumber, word in this order yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_QUALIFIER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_HOUSENUMBER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
335
336
def test_qualifier_after_housenumber():
    """Housenumber, qualifier, word in this order yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_HOUSENUMBER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(2, qmod.TOKEN_QUALIFIER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))
343
344
def test_qualifier_in_middle_of_phrase():
    """A qualifier surrounded by words inside the middle phrase yields no assignment."""
    nodes = [(qmod.BREAK_START, qmod.PHRASE_ANY, [(1, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(3, qmod.TOKEN_QUALIFIER)]),
             (qmod.BREAK_WORD, qmod.PHRASE_ANY, [(4, qmod.TOKEN_PARTIAL)]),
             (qmod.BREAK_PHRASE, qmod.PHRASE_ANY, [(5, qmod.TOKEN_PARTIAL)])]
    query = make_query(*nodes)

    # No expected assignments: anything that is yielded fails the check.
    check_assignments(yield_token_assignments(query))