]> git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_token_assignment.py
Merge pull request #3367 from lonvia/address-word-counts
[nominatim.git] / test / python / api / search / test_token_assignment.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Test for creation of token assignments from tokenized queries.
9 """
10 import pytest
11
12 from nominatim.api.search.query import QueryStruct, Phrase, PhraseType, BreakType, TokenType, TokenRange, Token
13 from nominatim.api.search.token_assignment import yield_token_assignments, TokenAssignment, PENALTY_TOKENCHANGE
14
class MyToken(Token):
    """Token stand-in for the tests; its category is a fixed dummy pair."""

    def get_category(self):
        """Return the constant category tuple ('this', 'that')."""
        return ('this', 'that')
18
19
def make_query(*args):
    """Build a QueryStruct from a list of node descriptions.

    Each positional argument is a tuple `(break type, phrase type, tokens)`
    where `tokens` is a list of `(end node, token type)` pairs. The position
    of the argument is used as the start node of its tokens.

    The phrase type of the first argument becomes the type of the single
    phrase of the query; its break type is not used. A closing END node is
    always appended. All tokens are the same dummy `MyToken` instance.
    """
    query = QueryStruct([Phrase(args[0][1], '')])
    token = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
                    lookup_word='foo', is_indexed=True)

    # The first description only provides the phrase type, nodes are
    # created for the remaining ones.
    for break_type, phrase_type, _ in args[1:]:
        query.add_node(break_type, phrase_type)
    query.add_node(BreakType.END, PhraseType.NONE)

    for start, spec in enumerate(args):
        for end, ttype in spec[2]:
            query.add_token(TokenRange(start, end), ttype, token)

    return query
34
35
def check_assignments(actual, *expected):
    """Assert that `actual` yields exactly the `expected` assignments.

    The order in which the assignments are produced does not matter.
    Every item of `actual` must equal one entry of `expected` that has not
    been matched yet, and no expected entry may be left over once `actual`
    is exhausted.

    Raises an AssertionError naming the unexpected item or listing the
    entries that were never produced.
    """
    todo = list(expected)
    for assignment in actual:
        assert assignment in todo, f"Unexpected assignment: {assignment}"
        todo.remove(assignment)

    # Report only the entries that were not produced instead of the
    # complete expectation, so the failure points at what is missing.
    assert not todo, f"Missing assignments: {todo}"
43
44
def test_query_with_missing_tokens():
    """A query that has nodes but no tokens produces no assignments."""
    query = QueryStruct([Phrase(PhraseType.NONE, '')])
    query.add_node(BreakType.END, PhraseType.NONE)

    assignments = list(yield_token_assignments(query))

    assert assignments == []
50
51
def test_one_word_query():
    """A single word with PARTIAL, WORD and HOUSENUMBER tokens is only assigned as name."""
    tokens = [(1, TokenType.PARTIAL),
              (1, TokenType.WORD),
              (1, TokenType.HOUSENUMBER)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(name=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
60
61
def test_single_postcode():
    """A lone POSTCODE token is assigned as the postcode."""
    query = make_query((BreakType.START, PhraseType.NONE,
                        [(1, TokenType.POSTCODE)]))

    expected = [TokenAssignment(postcode=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
68
69
def test_single_country_name():
    """A lone COUNTRY token is assigned as the country."""
    query = make_query((BreakType.START, PhraseType.NONE,
                        [(1, TokenType.COUNTRY)]))

    expected = [TokenAssignment(country=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
76
77
def test_single_word_poi_search():
    """A single word with NEAR_ITEM and QUALIFIER tokens is only assigned as near item."""
    tokens = [(1, TokenType.NEAR_ITEM),
              (1, TokenType.QUALIFIER)]
    query = make_query((BreakType.START, PhraseType.NONE, tokens))

    expected = [TokenAssignment(near_item=TokenRange(0, 1))]

    assert list(yield_token_assignments(query)) == expected
85
86
@pytest.mark.parametrize('btype', [BreakType.WORD, BreakType.PART, BreakType.TOKEN])
def test_multiple_simple_words(btype):
    """Three partial words form one name or are split into name and address.

    Every split into name and address is expected to carry the
    PENALTY_TOKENCHANGE value of the break type between the words.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (btype, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (btype, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    change_penalty = PENALTY_TOKENCHANGE[btype]

    expected = [
        TokenAssignment(name=TokenRange(0, 3)),
        TokenAssignment(penalty=change_penalty, name=TokenRange(0, 2),
                        address=[TokenRange(2, 3)]),
        TokenAssignment(penalty=change_penalty, name=TokenRange(0, 1),
                        address=[TokenRange(1, 3)]),
        TokenAssignment(penalty=change_penalty, name=TokenRange(1, 3),
                        address=[TokenRange(0, 1)]),
        TokenAssignment(penalty=change_penalty, name=TokenRange(2, 3),
                        address=[TokenRange(0, 2)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
106
107
def test_multiple_words_respect_phrase_break():
    """Two partial words separated by a phrase break are never joined into one name."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = [
        TokenAssignment(name=TokenRange(0, 1),
                        address=[TokenRange(1, 2)]),
        TokenAssignment(name=TokenRange(1, 2),
                        address=[TokenRange(0, 1)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
117
118
def test_housenumber_and_street():
    """A housenumber followed by a partial word: the word is either name or address."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = [
        TokenAssignment(name=TokenRange(1, 2),
                        housenumber=TokenRange(0, 1)),
        TokenAssignment(address=[TokenRange(1, 2)],
                        housenumber=TokenRange(0, 1)),
    ]

    check_assignments(yield_token_assignments(query), *expected)
128
129
def test_housenumber_and_street_backwards():
    """A partial word followed by a housenumber: the word is either name or address."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]))

    expected = [
        TokenAssignment(name=TokenRange(0, 1),
                        housenumber=TokenRange(1, 2)),
        TokenAssignment(address=[TokenRange(0, 1)],
                        housenumber=TokenRange(1, 2)),
    ]

    check_assignments(yield_token_assignments(query), *expected)
139
140
def test_housenumber_and_postcode():
    """Word, housenumber, word, postcode: both expected assignments have a penalty of about 0.3."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)]))

    expected = [
        # first word used as name
        TokenAssignment(penalty=pytest.approx(0.3),
                        name=TokenRange(0, 1),
                        housenumber=TokenRange(1, 2),
                        address=[TokenRange(2, 3)],
                        postcode=TokenRange(3, 4)),
        # both words used as address
        TokenAssignment(penalty=pytest.approx(0.3),
                        housenumber=TokenRange(1, 2),
                        address=[TokenRange(0, 1), TokenRange(2, 3)],
                        postcode=TokenRange(3, 4)),
    ]

    check_assignments(yield_token_assignments(query), *expected)
157
def test_postcode_and_housenumber():
    """Word, postcode, word, housenumber: both expected assignments have a penalty of about 0.3."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.POSTCODE)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)]))

    expected = [
        # word next to the housenumber used as name
        TokenAssignment(penalty=pytest.approx(0.3),
                        name=TokenRange(2, 3),
                        housenumber=TokenRange(3, 4),
                        address=[TokenRange(0, 1)],
                        postcode=TokenRange(1, 2)),
        # both words used as address
        TokenAssignment(penalty=pytest.approx(0.3),
                        housenumber=TokenRange(3, 4),
                        address=[TokenRange(0, 1), TokenRange(2, 3)],
                        postcode=TokenRange(1, 2)),
    ]

    check_assignments(yield_token_assignments(query), *expected)
174
175
def test_country_housenumber_postcode():
    """The sequence country, word, housenumber, postcode yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.POSTCODE)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
183
184
@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.COUNTRY,
                                   TokenType.NEAR_ITEM, TokenType.QUALIFIER])
def test_housenumber_with_only_special_terms(ttype):
    """A housenumber followed only by a token of type `ttype` yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, ttype)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
192
193
@pytest.mark.parametrize('ttype', [TokenType.POSTCODE, TokenType.HOUSENUMBER, TokenType.COUNTRY])
def test_multiple_special_tokens(ttype):
    """Two tokens of the same type `ttype` around a partial word yield no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, ttype)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(3, ttype)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
201
202
def test_housenumber_many_phrases():
    """Three single-word phrases followed by a phrase with housenumber and word.

    The three leading words are expected as separate address ranges in
    both assignments; the word after the housenumber is either the name
    or a further address range. Both assignments carry a penalty of 0.1.
    """
    q = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(3, TokenType.PARTIAL)]),
                   (BreakType.PHRASE, PhraseType.NONE, [(4, TokenType.HOUSENUMBER)]),
                   (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)]))

    # Lines inside the open parenthesis continue implicitly, so no
    # backslash continuation is needed.
    check_assignments(yield_token_assignments(q),
                      TokenAssignment(penalty=0.1,
                                      name=TokenRange(4, 5),
                                      housenumber=TokenRange(3, 4),
                                      address=[TokenRange(0, 1), TokenRange(1, 2),
                                               TokenRange(2, 3)]),
                      TokenAssignment(penalty=0.1,
                                      housenumber=TokenRange(3, 4),
                                      address=[TokenRange(0, 1), TokenRange(1, 2),
                                               TokenRange(2, 3), TokenRange(4, 5)]))
220
221
def test_country_at_beginning():
    """A country token before a partial word: country plus name with penalty 0.1."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.COUNTRY)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = TokenAssignment(penalty=0.1, name=TokenRange(1, 2),
                               country=TokenRange(0, 1))

    check_assignments(yield_token_assignments(query), expected)
229
230
def test_country_at_end():
    """A country token after a partial word: name plus country with penalty 0.1."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)]))

    expected = TokenAssignment(penalty=0.1, name=TokenRange(0, 1),
                               country=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
238
239
def test_country_in_middle():
    """A country token between two partial words yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.COUNTRY)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
246
247
def test_postcode_with_designation():
    """A postcode followed by a word in another phrase.

    Using the word as name is expected with a penalty of 0.1, using it
    as address is expected without penalty.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.POSTCODE)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = [
        TokenAssignment(penalty=0.1, name=TokenRange(1, 2),
                        postcode=TokenRange(0, 1)),
        TokenAssignment(postcode=TokenRange(0, 1),
                        address=[TokenRange(1, 2)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
257
258
def test_postcode_with_designation_backwards():
    """A word followed by a postcode in another phrase.

    Using the word as name is expected without penalty, using it as
    address is expected with a penalty of 0.1.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.POSTCODE)]))

    expected = [
        TokenAssignment(name=TokenRange(0, 1),
                        postcode=TokenRange(1, 2)),
        TokenAssignment(penalty=0.1, postcode=TokenRange(1, 2),
                        address=[TokenRange(0, 1)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
268
269
def test_near_item_at_beginning():
    """A near-item token before a partial word: near item plus name with penalty 0.1."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.NEAR_ITEM)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]))

    expected = TokenAssignment(penalty=0.1, name=TokenRange(1, 2),
                               near_item=TokenRange(0, 1))

    check_assignments(yield_token_assignments(query), expected)
277
278
def test_near_item_at_end():
    """A near-item token after a partial word: name plus near item with penalty 0.1."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.NEAR_ITEM)]))

    expected = TokenAssignment(penalty=0.1, name=TokenRange(0, 1),
                               near_item=TokenRange(1, 2))

    check_assignments(yield_token_assignments(query), expected)
286
287
def test_near_item_in_middle():
    """A near-item token between two partial words yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.NEAR_ITEM)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
294
295
def test_qualifier_at_beginning():
    """A qualifier followed by two partial words.

    Expected are the qualifier with both words as name (penalty 0.1) and
    the qualifier with the first word as name and the second as address
    (penalty 0.2).
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    expected = [
        TokenAssignment(penalty=0.1, name=TokenRange(1, 3),
                        qualifier=TokenRange(0, 1)),
        TokenAssignment(penalty=0.2, name=TokenRange(1, 2),
                        qualifier=TokenRange(0, 1),
                        address=[TokenRange(2, 3)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
308
309
def test_qualifier_after_name():
    """A qualifier with two partial words on either side.

    Expected are two assignments with penalty 0.2: one pair of words is
    the name and the other pair is the address, in both directions.
    """
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(5, TokenType.PARTIAL)]))

    expected = [
        TokenAssignment(penalty=0.2, name=TokenRange(0, 2),
                        qualifier=TokenRange(2, 3),
                        address=[TokenRange(3, 5)]),
        TokenAssignment(penalty=0.2, name=TokenRange(3, 5),
                        qualifier=TokenRange(2, 3),
                        address=[TokenRange(0, 2)]),
    ]

    check_assignments(yield_token_assignments(query), *expected)
325
326
def test_qualifier_before_housenumber():
    """The sequence qualifier, housenumber, word yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
333
334
def test_qualifier_after_housenumber():
    """The sequence housenumber, qualifier, word yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.HOUSENUMBER)]),
                       (BreakType.WORD, PhraseType.NONE, [(2, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.PARTIAL)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
341
342
def test_qualifier_in_middle_of_phrase():
    """A qualifier between two words of the middle phrase of three yields no assignment."""
    query = make_query((BreakType.START, PhraseType.NONE, [(1, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(2, TokenType.PARTIAL)]),
                       (BreakType.WORD, PhraseType.NONE, [(3, TokenType.QUALIFIER)]),
                       (BreakType.WORD, PhraseType.NONE, [(4, TokenType.PARTIAL)]),
                       (BreakType.PHRASE, PhraseType.NONE, [(5, TokenType.PARTIAL)]))

    produced = yield_token_assignments(query)

    # No expected entries: any produced assignment fails the check.
    check_assignments(produced)
351