]> git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_db_search_builder.py
Merge pull request #3542 from lonvia/remove-legacy-tokenizer
[nominatim.git] / test / python / api / search / test_db_search_builder.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tests for creating abstract searches from token assignments.
9 """
10 import pytest
11
12 from nominatim_api.search.query import Token, TokenRange, BreakType, PhraseType, TokenType, QueryStruct, Phrase
13 from nominatim_api.search.db_search_builder import SearchBuilder
14 from nominatim_api.search.token_assignment import TokenAssignment
15 from nominatim_api.types import SearchDetails
16 import nominatim_api.search.db_searches as dbs
17
class MyToken(Token):
    """ Token stand-in for the tests in this file. Its category is
        always the fixed pair ('this', 'that').
    """

    def get_category(self):
        """ Return the constant (class, type) pair used by the tests.
        """
        category = ('this', 'that')
        return category
21
22
def make_query(*args):
    """ Assemble a single-phrase QueryStruct from token descriptions.

        Each positional argument describes the tokens starting at the node
        with the argument's index. It is a list of tuples
        `(end, token_type, [(token_id, lookup_word), ...])`.
        PARTIAL tokens get a penalty of 0.5, all other tokens 0.0.
    """
    query = QueryStruct([Phrase(PhraseType.NONE, '')])

    # The largest end position determines how many word breaks are needed
    # before the closing END node.
    word_breaks = max(entry[0] for token_list in args for entry in token_list)
    for _ in range(word_breaks):
        query.add_node(BreakType.WORD, PhraseType.NONE)
    query.add_node(BreakType.END, PhraseType.NONE)

    for start, token_list in enumerate(args):
        for end, ttype, tinfo in token_list:
            penalty = 0.5 if ttype == TokenType.PARTIAL else 0.0
            for tid, word in tinfo:
                token = MyToken(penalty=penalty, token=tid, count=1,
                                addr_count=1, lookup_word=word)
                query.add_token(TokenRange(start, end), ttype, token)

    return query
40
41
def test_country_search():
    """ A country-only assignment yields one CountrySearch that lists
        the lookup words of all country tokens.
    """
    country_tokens = [(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])]
    query = make_query(country_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(country=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1

    found = results[0]
    assert isinstance(found, dbs.CountrySearch)
    assert set(found.countries.values) == {'de', 'en'}
54
55
def test_country_search_with_country_restriction():
    """ With a 'countries' restriction only the country token that is
        also part of the restriction remains in the CountrySearch.
    """
    country_tokens = [(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])]
    query = make_query(country_tokens)
    details = SearchDetails.from_kwargs({'countries': 'en,fr'})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(country=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1

    found = results[0]
    assert isinstance(found, dbs.CountrySearch)
    assert set(found.countries.values) == {'en'}
68
69
def test_country_search_with_conflicting_country_restriction():
    """ No search is built when the 'countries' restriction shares no
        country with the country tokens of the query.
    """
    country_tokens = [(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])]
    query = make_query(country_tokens)
    details = SearchDetails.from_kwargs({'countries': 'fr'})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(country=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert not results
77
78
def test_postcode_search_simple():
    """ A postcode-only assignment yields one PostcodeSearch without
        countries, lookups or rankings.
    """
    postcode_tokens = [(1, TokenType.POSTCODE, [(34, '2367')])]
    query = make_query(postcode_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(postcode=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PostcodeSearch)
    assert found.postcodes.values == ['2367']
    assert not found.countries.values
    assert not found.lookups
    assert not found.rankings
93
94
def test_postcode_with_country():
    """ Postcode plus country yields one PostcodeSearch carrying the
        country, still without lookups or rankings.
    """
    postcode_tokens = [(1, TokenType.POSTCODE, [(34, '2367')])]
    country_tokens = [(2, TokenType.COUNTRY, [(1, 'xx')])]
    query = make_query(postcode_tokens, country_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(postcode=TokenRange(0, 1),
                                 country=TokenRange(1, 2))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PostcodeSearch)
    assert found.postcodes.values == ['2367']
    assert found.countries.values == ['xx']
    assert not found.lookups
    assert not found.rankings
111
112
def test_postcode_with_address():
    """ Postcode plus an address part that has only a partial token
        yields one PostcodeSearch with lookups but without rankings.
    """
    postcode_tokens = [(1, TokenType.POSTCODE, [(34, '2367')])]
    address_tokens = [(2, TokenType.PARTIAL, [(100, 'word')])]
    query = make_query(postcode_tokens, address_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(postcode=TokenRange(0, 1),
                                 address=[TokenRange(1, 2)])
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PostcodeSearch)
    assert found.postcodes.values == ['2367']
    assert not found.countries
    assert found.lookups
    assert not found.rankings
129
130
def test_postcode_with_address_with_full_word():
    """ Postcode plus an address part with partial and full-word tokens
        yields one PostcodeSearch with lookups and exactly one ranking.
    """
    postcode_tokens = [(1, TokenType.POSTCODE, [(34, '2367')])]
    address_tokens = [(2, TokenType.PARTIAL, [(100, 'word')]),
                      (2, TokenType.WORD, [(1, 'full')])]
    query = make_query(postcode_tokens, address_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(postcode=TokenRange(0, 1),
                                 address=[TokenRange(1, 2)])
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PostcodeSearch)
    assert found.postcodes.values == ['2367']
    assert not found.countries
    assert found.lookups
    assert len(found.rankings) == 1
148
149
@pytest.mark.parametrize('kwargs', [{'viewbox': '0,0,1,1', 'bounded_viewbox': True},
                                    {'near': '10,10'}])
def test_near_item_only(kwargs):
    """ A lone near-item token yields one PoiSearch for the token's
        category when the details contain a bounded viewbox or a
        'near' point.
    """
    near_tokens = [(1, TokenType.NEAR_ITEM, [(2, 'foo')])]
    query = make_query(near_tokens)
    details = SearchDetails.from_kwargs(kwargs)
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(near_item=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1

    found = results[0]
    assert isinstance(found, dbs.PoiSearch)
    assert found.qualifiers.values == [('this', 'that')]
164
165
@pytest.mark.parametrize('kwargs', [{'viewbox': '0,0,1,1'},
                                    {}])
def test_near_item_skipped(kwargs):
    """ A lone near-item token yields no search when the details hold
        only an unbounded viewbox or nothing at all.
    """
    near_tokens = [(1, TokenType.NEAR_ITEM, [(2, 'foo')])]
    query = make_query(near_tokens)
    details = SearchDetails.from_kwargs(kwargs)
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(near_item=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert not results
175
176
def test_name_only_search():
    """ A name-only assignment yields one PlaceSearch with one lookup
        and one ranking and no further restrictions.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    query = make_query(name_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert not found.countries.values
    assert not found.housenumbers.values
    assert not found.qualifiers.values
    assert len(found.lookups) == 1
    assert len(found.rankings) == 1
194
195
def test_name_with_qualifier():
    """ Name plus qualifier yields one PlaceSearch whose qualifiers hold
        the category of the qualifier token.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    qualifier_tokens = [(2, TokenType.QUALIFIER, [(55, 'hotel')])]
    query = make_query(name_tokens, qualifier_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 qualifier=TokenRange(1, 2))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert not found.countries.values
    assert not found.housenumbers.values
    assert found.qualifiers.values == [('this', 'that')]
    assert len(found.lookups) == 1
    assert len(found.rankings) == 1
215
216
def test_name_with_housenumber_search():
    """ Name plus house number yields one PlaceSearch that carries the
        house number.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    housenumber_tokens = [(2, TokenType.HOUSENUMBER, [(66, '66')])]
    query = make_query(name_tokens, housenumber_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 housenumber=TokenRange(1, 2))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert not found.countries.values
    assert found.housenumbers.values == ['66']
    assert len(found.lookups) == 1
    assert len(found.rankings) == 1
235
236
def test_name_and_address():
    """ Name plus two single-word address parts yields one PlaceSearch
        with two lookups and three rankings.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    first_address = [(2, TokenType.PARTIAL, [(2, 'b')]),
                     (2, TokenType.WORD, [(101, 'b')])]
    second_address = [(3, TokenType.PARTIAL, [(3, 'c')]),
                      (3, TokenType.WORD, [(102, 'c')])]
    query = make_query(name_tokens, first_address, second_address)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 2),
                                          TokenRange(2, 3)])
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert not found.countries.values
    assert not found.housenumbers.values
    assert len(found.lookups) == 2
    assert len(found.rankings) == 3
260
261
def test_name_and_complex_address():
    """ Name plus address parts where a full word spans two nodes and
        one address range covers two nodes yields one PlaceSearch with
        two lookups and two rankings.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    # The full word 'bc' starts at node 1 but ends at node 3.
    tokens_at_1 = [(2, TokenType.PARTIAL, [(2, 'b')]),
                   (3, TokenType.WORD, [(101, 'bc')])]
    tokens_at_2 = [(3, TokenType.PARTIAL, [(3, 'c')])]
    tokens_at_3 = [(4, TokenType.PARTIAL, [(4, 'd')]),
                   (4, TokenType.WORD, [(103, 'd')])]
    query = make_query(name_tokens, tokens_at_1, tokens_at_2, tokens_at_3)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 2),
                                          TokenRange(2, 4)])
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert not found.countries.values
    assert not found.housenumbers.values
    assert len(found.lookups) == 2
    assert len(found.rankings) == 2
286
287
def test_name_only_near_search():
    """ A near item followed by a name yields one NearSearch that wraps
        a PlaceSearch.
    """
    near_tokens = [(1, TokenType.NEAR_ITEM, [(88, 'g')])]
    name_tokens = [(2, TokenType.PARTIAL, [(1, 'a')]),
                   (2, TokenType.WORD, [(100, 'a')])]
    query = make_query(near_tokens, name_tokens)
    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.NearSearch)
    assert isinstance(found.search, dbs.PlaceSearch)
302
303
def test_name_only_search_with_category():
    """ A 'categories' restriction in the details shows up as the
        qualifiers of the resulting PlaceSearch.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    query = make_query(name_tokens)
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert found.qualifiers.values == [('foo', 'bar')]
316
317
def test_name_with_near_item_search_with_category_mismatch():
    """ No search is built when the near item's category is not among
        the requested 'categories'.
    """
    near_tokens = [(1, TokenType.NEAR_ITEM, [(88, 'g')])]
    name_tokens = [(2, TokenType.PARTIAL, [(1, 'a')]),
                   (2, TokenType.WORD, [(100, 'a')])]
    query = make_query(near_tokens, name_tokens)
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert not results
328
329
def test_name_with_near_item_search_with_category_match():
    """ A NearSearch around a PlaceSearch is built when the near item's
        category is one of the requested 'categories'.
    """
    near_tokens = [(1, TokenType.NEAR_ITEM, [(88, 'g')])]
    name_tokens = [(2, TokenType.PARTIAL, [(1, 'a')]),
                   (2, TokenType.WORD, [(100, 'a')])]
    query = make_query(near_tokens, name_tokens)
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar'),
                                                        ('this', 'that')]})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.NearSearch)
    assert isinstance(found.search, dbs.PlaceSearch)
345
346
def test_name_with_qualifier_search_with_category_mismatch():
    """ No search is built when the qualifier's category is not among
        the requested 'categories'.
    """
    qualifier_tokens = [(1, TokenType.QUALIFIER, [(88, 'g')])]
    name_tokens = [(2, TokenType.PARTIAL, [(1, 'a')]),
                   (2, TokenType.WORD, [(100, 'a')])]
    query = make_query(qualifier_tokens, name_tokens)
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 qualifier=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert not results
357
358
def test_name_with_qualifier_search_with_category_match():
    """ When the qualifier's category is one of the requested
        'categories', the PlaceSearch keeps only that category as
        qualifier.
    """
    qualifier_tokens = [(1, TokenType.QUALIFIER, [(88, 'g')])]
    name_tokens = [(2, TokenType.PARTIAL, [(1, 'a')]),
                   (2, TokenType.WORD, [(100, 'a')])]
    query = make_query(qualifier_tokens, name_tokens)
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar'),
                                                        ('this', 'that')]})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 qualifier=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert found.qualifiers.values == [('this', 'that')]
374
375
def test_name_only_search_with_countries():
    """ A 'countries' restriction in the details is carried over into
        the countries of the resulting PlaceSearch.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    query = make_query(name_tokens)
    details = SearchDetails.from_kwargs({'countries': 'de,en'})
    builder = SearchBuilder(query, details)

    assignment = TokenAssignment(name=TokenRange(0, 1))
    results = list(builder.build(assignment))

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert not found.postcodes.values
    assert set(found.countries.values) == {'de', 'en'}
    assert not found.housenumbers.values
390
391
def make_counted_searches(name_part, name_full, address_part, address_full,
                          num_address_parts=1):
    """ Build the searches for a query made of one name word followed by
        `num_address_parts` address words.

        Every word gets a PARTIAL and a WORD token. The four leading
        parameters are handed to the tokens as their third positional
        argument (presumably the occurrence count - confirm against
        Token). The assignment uses the first word as name and all
        remaining words as a single address range.
    """
    query = QueryStruct([Phrase(PhraseType.NONE, '')])
    for _ in range(num_address_parts + 1):
        query.add_node(BreakType.WORD, PhraseType.NONE)
    query.add_node(BreakType.END, PhraseType.NONE)

    # Name word between node 0 and node 1.
    query.add_token(TokenRange(0, 1), TokenType.PARTIAL,
                    MyToken(0.5, 1, name_part, 1, 'name_part'))
    query.add_token(TokenRange(0, 1), TokenType.WORD,
                    MyToken(0, 101, name_full, 1, 'name_full'))

    # One address word per following node, all with the same token ids.
    for start in range(1, num_address_parts + 1):
        query.add_token(TokenRange(start, start + 1), TokenType.PARTIAL,
                        MyToken(0.5, 2, address_part, 1, 'address_part'))
        query.add_token(TokenRange(start, start + 1), TokenType.WORD,
                        MyToken(0, 102, address_full, 1, 'address_full'))

    builder = SearchBuilder(query, SearchDetails())

    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 1 + num_address_parts)])
    return list(builder.build(assignment))
413
414
def test_infrequent_partials_in_name():
    """ With all counts at 1 a single PlaceSearch is built that looks up
        all name terms and restricts by the address terms.
    """
    results = make_counted_searches(1, 1, 1, 1)

    assert len(results) == 1
    found = results[0]

    assert isinstance(found, dbs.PlaceSearch)
    assert len(found.lookups) == 2
    assert len(found.rankings) == 2

    lookup_kinds = {(lookup.column, lookup.lookup_type.__name__)
                    for lookup in found.lookups}
    assert lookup_kinds == \
            {('name_vector', 'LookupAll'), ('nameaddress_vector', 'Restrict')}
427
428
def test_frequent_partials_in_name_and_address():
    """ With partial counts of 9999 two PlaceSearches are built. Ordered
        by penalty, the first uses LookupAny on the name with an address
        restriction, the second LookupAll on both vectors.
    """
    results = make_counted_searches(9999, 1, 9999, 1)

    assert len(results) == 2

    for candidate in results:
        assert isinstance(candidate, dbs.PlaceSearch)

    def lookup_kinds(search):
        return {(lookup.column, lookup.lookup_type.__name__)
                for lookup in search.lookups}

    cheapest, costliest = sorted(results, key=lambda s: s.penalty)

    assert lookup_kinds(cheapest) == \
            {('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}
    assert lookup_kinds(costliest) == \
            {('nameaddress_vector', 'LookupAll'), ('name_vector', 'LookupAll')}
441
442
def test_too_frequent_partials_in_name_and_address():
    """ With partial counts of 20000 (name) and 10000 (address) only one
        PlaceSearch remains: LookupAny on the name with an address
        restriction.
    """
    results = make_counted_searches(20000, 1, 10000, 1)

    assert len(results) == 1

    for candidate in results:
        assert isinstance(candidate, dbs.PlaceSearch)

    cheapest = sorted(results, key=lambda s: s.penalty)[0]
    lookup_kinds = {(lookup.column, lookup.lookup_type.__name__)
                    for lookup in cheapest.lookups}

    assert lookup_kinds == \
            {('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}