]> git.openstreetmap.org Git - nominatim.git/blob - test/python/api/search/test_db_search_builder.py
extend word statistics to address index
[nominatim.git] / test / python / api / search / test_db_search_builder.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tests for creating abstract searches from token assignments.
9 """
10 import pytest
11
12 from nominatim.api.search.query import Token, TokenRange, BreakType, PhraseType, TokenType, QueryStruct, Phrase
13 from nominatim.api.search.db_search_builder import SearchBuilder
14 from nominatim.api.search.token_assignment import TokenAssignment
15 from nominatim.api.types import SearchDetails
16 import nominatim.api.search.db_searches as dbs
17
class MyToken(Token):
    """Token stand-in for these tests; always reports the same category."""

    def get_category(self):
        """Return the fixed pair ``('this', 'that')``."""
        category = ('this', 'that')
        return category
21
22
def make_query(*args):
    """Build a QueryStruct fixture from per-node token descriptions.

    Each positional argument is a list of ``(end, ttype, tinfo)`` tuples.
    The position of the argument is used as the start of the token range,
    ``end`` as its end and ``ttype`` as the token type. ``tinfo`` is a list
    of ``(tid, word)`` pairs; one ``MyToken`` is added per pair.

    Calling the helper without any token descriptions returns a query that
    only got the END node appended (previously this raised ``ValueError``
    from ``max()`` on an empty sequence).
    """
    query = QueryStruct([Phrase(PhraseType.NONE, '')])

    # One WORD node per position up to the largest range end, then END.
    # default=0 covers the case where no token descriptions were given.
    last_end = max((spec[0] for token_list in args for spec in token_list),
                   default=0)
    for _ in range(last_end):
        query.add_node(BreakType.WORD, PhraseType.NONE)
    query.add_node(BreakType.END, PhraseType.NONE)

    for start, token_list in enumerate(args):
        for end, ttype, tinfo in token_list:
            # First MyToken argument: 0.5 for partial tokens, 0.0 otherwise.
            penalty = 0.5 if ttype == TokenType.PARTIAL else 0.0
            for tid, word in tinfo:
                query.add_token(TokenRange(start, end), ttype,
                                MyToken(penalty, tid, 1, word, True))

    return query
38
39
def test_country_search():
    """A country-only assignment yields one CountrySearch with both codes."""
    query = make_query([(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(country=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1

    country_search = results[0]

    assert isinstance(country_search, dbs.CountrySearch)
    assert set(country_search.countries.values) == {'de', 'en'}
52
53
def test_country_search_with_country_restriction():
    """With 'countries' set to 'en,fr', only 'en' remains in the CountrySearch."""
    query = make_query([(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])])
    details = SearchDetails.from_kwargs({'countries': 'en,fr'})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(country=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1

    country_search = results[0]

    assert isinstance(country_search, dbs.CountrySearch)
    assert set(country_search.countries.values) == {'en'}
66
67
def test_country_search_with_conflicting_country_restriction():
    """With 'countries' set to 'fr' and tokens for 'de'/'en', nothing is built."""
    query = make_query([(1, TokenType.COUNTRY, [(2, 'de'), (3, 'en')])])
    details = SearchDetails.from_kwargs({'countries': 'fr'})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(country=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 0
75
76
def test_postcode_search_simple():
    """A postcode-only assignment yields a bare PostcodeSearch."""
    query = make_query([(1, TokenType.POSTCODE, [(34, '2367')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(postcode=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    postcode_search = results[0]

    assert isinstance(postcode_search, dbs.PostcodeSearch)
    assert postcode_search.postcodes.values == ['2367']
    assert not postcode_search.countries.values
    assert not postcode_search.lookups
    assert not postcode_search.rankings
91
92
def test_postcode_with_country():
    """Postcode plus country yields a PostcodeSearch carrying the country."""
    query = make_query([(1, TokenType.POSTCODE, [(34, '2367')])],
                       [(2, TokenType.COUNTRY, [(1, 'xx')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(postcode=TokenRange(0, 1),
                                 country=TokenRange(1, 2))

    results = list(builder.build(assignment))

    assert len(results) == 1
    postcode_search = results[0]

    assert isinstance(postcode_search, dbs.PostcodeSearch)
    assert postcode_search.postcodes.values == ['2367']
    assert postcode_search.countries.values == ['xx']
    assert not postcode_search.lookups
    assert not postcode_search.rankings
109
110
def test_postcode_with_address():
    """Postcode plus a partial-only address yields a PostcodeSearch with lookups.

    The address range only has a PARTIAL token, and the test expects
    lookups to be present while rankings stay empty.
    """
    q = make_query([(1, TokenType.POSTCODE, [(34, '2367')])],
                   [(2, TokenType.PARTIAL, [(100, 'word')])])
    builder = SearchBuilder(q, SearchDetails())

    searches = list(builder.build(TokenAssignment(postcode=TokenRange(0, 1),
                                                  address=[TokenRange(1, 2)])))

    assert len(searches) == 1
    search = searches[0]

    assert isinstance(search, dbs.PostcodeSearch)
    assert search.postcodes.values == ['2367']
    # Check the value list explicitly, as the other postcode tests do,
    # instead of relying on the truthiness of the container object.
    assert not search.countries.values
    assert search.lookups
    assert not search.rankings
127
128
def test_postcode_with_address_with_full_word():
    """Postcode plus an address with a full word yields one ranking.

    The address range carries both a PARTIAL and a WORD token; the test
    expects lookups to be present and exactly one ranking.
    """
    q = make_query([(1, TokenType.POSTCODE, [(34, '2367')])],
                   [(2, TokenType.PARTIAL, [(100, 'word')]),
                    (2, TokenType.WORD, [(1, 'full')])])
    builder = SearchBuilder(q, SearchDetails())

    searches = list(builder.build(TokenAssignment(postcode=TokenRange(0, 1),
                                                  address=[TokenRange(1, 2)])))

    assert len(searches) == 1
    search = searches[0]

    assert isinstance(search, dbs.PostcodeSearch)
    assert search.postcodes.values == ['2367']
    # Check the value list explicitly, as the other postcode tests do,
    # instead of relying on the truthiness of the container object.
    assert not search.countries.values
    assert search.lookups
    assert len(search.rankings) == 1
146
147
@pytest.mark.parametrize('kwargs', [{'viewbox': '0,0,1,1', 'bounded_viewbox': True},
                                    {'near': '10,10'}])
def test_near_item_only(kwargs):
    """A lone near-item with a bounded viewbox or 'near' yields a PoiSearch."""
    query = make_query([(1, TokenType.NEAR_ITEM, [(2, 'foo')])])
    details = SearchDetails.from_kwargs(kwargs)
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(near_item=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1

    poi_search = results[0]

    assert isinstance(poi_search, dbs.PoiSearch)
    assert poi_search.qualifiers.values == [('this', 'that')]
162
163
@pytest.mark.parametrize('kwargs', [{'viewbox': '0,0,1,1'},
                                    {}])
def test_near_item_skipped(kwargs):
    """A lone near-item with only a plain viewbox, or no details, builds nothing."""
    query = make_query([(1, TokenType.NEAR_ITEM, [(2, 'foo')])])
    details = SearchDetails.from_kwargs(kwargs)
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(near_item=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 0
173
174
def test_name_only_search():
    """A name-only assignment yields a PlaceSearch with one lookup and ranking."""
    query = make_query([(1, TokenType.PARTIAL, [(1, 'a')]),
                        (1, TokenType.WORD, [(100, 'a')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert not place_search.countries.values
    assert not place_search.housenumbers.values
    assert not place_search.qualifiers.values
    assert len(place_search.lookups) == 1
    assert len(place_search.rankings) == 1
192
193
def test_name_with_qualifier():
    """Name plus qualifier yields a PlaceSearch carrying the token's category."""
    query = make_query([(1, TokenType.PARTIAL, [(1, 'a')]),
                        (1, TokenType.WORD, [(100, 'a')])],
                       [(2, TokenType.QUALIFIER, [(55, 'hotel')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 qualifier=TokenRange(1, 2))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert not place_search.countries.values
    assert not place_search.housenumbers.values
    assert place_search.qualifiers.values == [('this', 'that')]
    assert len(place_search.lookups) == 1
    assert len(place_search.rankings) == 1
213
214
def test_name_with_housenumber_search():
    """Name plus housenumber yields a PlaceSearch with the housenumber set."""
    query = make_query([(1, TokenType.PARTIAL, [(1, 'a')]),
                        (1, TokenType.WORD, [(100, 'a')])],
                       [(2, TokenType.HOUSENUMBER, [(66, '66')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 housenumber=TokenRange(1, 2))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert not place_search.countries.values
    assert place_search.housenumbers.values == ['66']
    assert len(place_search.lookups) == 1
    assert len(place_search.rankings) == 1
233
234
def test_name_and_address():
    """Name plus two address ranges yields two lookups and three rankings."""
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    first_address_tokens = [(2, TokenType.PARTIAL, [(2, 'b')]),
                            (2, TokenType.WORD, [(101, 'b')])]
    second_address_tokens = [(3, TokenType.PARTIAL, [(3, 'c')]),
                             (3, TokenType.WORD, [(102, 'c')])]
    query = make_query(name_tokens, first_address_tokens, second_address_tokens)
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 2),
                                          TokenRange(2, 3)])

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert not place_search.countries.values
    assert not place_search.housenumbers.values
    assert len(place_search.lookups) == 2
    assert len(place_search.rankings) == 3
258
259
def test_name_and_complex_address():
    """Name plus a multi-node address range yields two lookups and two rankings.

    The second address range spans nodes 2-4, and one WORD token ('bc')
    starts in the first address range but ends at node 3.
    """
    name_tokens = [(1, TokenType.PARTIAL, [(1, 'a')]),
                   (1, TokenType.WORD, [(100, 'a')])]
    tokens_from_1 = [(2, TokenType.PARTIAL, [(2, 'b')]),
                     (3, TokenType.WORD, [(101, 'bc')])]
    tokens_from_2 = [(3, TokenType.PARTIAL, [(3, 'c')])]
    tokens_from_3 = [(4, TokenType.PARTIAL, [(4, 'd')]),
                     (4, TokenType.WORD, [(103, 'd')])]
    query = make_query(name_tokens, tokens_from_1, tokens_from_2, tokens_from_3)
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 2),
                                          TokenRange(2, 4)])

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert not place_search.countries.values
    assert not place_search.housenumbers.values
    assert len(place_search.lookups) == 2
    assert len(place_search.rankings) == 2
284
285
def test_name_only_near_search():
    """Near-item followed by a name yields a NearSearch wrapping a PlaceSearch."""
    query = make_query([(1, TokenType.NEAR_ITEM, [(88, 'g')])],
                       [(2, TokenType.PARTIAL, [(1, 'a')]),
                        (2, TokenType.WORD, [(100, 'a')])])
    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    near_search = results[0]

    assert isinstance(near_search, dbs.NearSearch)
    assert isinstance(near_search.search, dbs.PlaceSearch)
300
301
def test_name_only_search_with_category():
    """A 'categories' detail shows up as the qualifiers of the PlaceSearch."""
    query = make_query([(1, TokenType.PARTIAL, [(1, 'a')]),
                        (1, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert place_search.qualifiers.values == [('foo', 'bar')]
314
315
def test_name_with_near_item_search_with_category_mismatch():
    """A near-item whose category is not among 'categories' builds nothing."""
    query = make_query([(1, TokenType.NEAR_ITEM, [(88, 'g')])],
                       [(2, TokenType.PARTIAL, [(1, 'a')]),
                        (2, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 0
326
327
def test_name_with_near_item_search_with_category_match():
    """A near-item whose category is among 'categories' yields a NearSearch."""
    query = make_query([(1, TokenType.NEAR_ITEM, [(88, 'g')])],
                       [(2, TokenType.PARTIAL, [(1, 'a')]),
                        (2, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar'),
                                                        ('this', 'that')]})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 near_item=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    near_search = results[0]

    assert isinstance(near_search, dbs.NearSearch)
    assert isinstance(near_search.search, dbs.PlaceSearch)
343
344
def test_name_with_qualifier_search_with_category_mismatch():
    """A qualifier whose category is not among 'categories' builds nothing."""
    query = make_query([(1, TokenType.QUALIFIER, [(88, 'g')])],
                       [(2, TokenType.PARTIAL, [(1, 'a')]),
                        (2, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar')]})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 qualifier=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 0
355
356
def test_name_with_qualifier_search_with_category_match():
    """A qualifier matching one of 'categories' keeps only that category."""
    query = make_query([(1, TokenType.QUALIFIER, [(88, 'g')])],
                       [(2, TokenType.PARTIAL, [(1, 'a')]),
                        (2, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'categories': [('foo', 'bar'),
                                                        ('this', 'that')]})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(1, 2),
                                 qualifier=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert place_search.qualifiers.values == [('this', 'that')]
372
373
def test_name_only_search_with_countries():
    """A 'countries' detail is carried over into a name-only PlaceSearch."""
    query = make_query([(1, TokenType.PARTIAL, [(1, 'a')]),
                        (1, TokenType.WORD, [(100, 'a')])])
    details = SearchDetails.from_kwargs({'countries': 'de,en'})
    builder = SearchBuilder(query, details)
    assignment = TokenAssignment(name=TokenRange(0, 1))

    results = list(builder.build(assignment))

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert not place_search.postcodes.values
    assert set(place_search.countries.values) == {'de', 'en'}
    assert not place_search.housenumbers.values
388
389
def make_counted_searches(name_part, name_full, address_part, address_full,
                          num_address_parts=1):
    """Build the searches for one name node followed by address nodes.

    The four leading parameters are passed as the third positional
    argument of the respective ``MyToken`` (presumably the word count;
    confirm against ``Token``). Node 0-1 gets a PARTIAL and a WORD name
    token; each of the following ``num_address_parts`` nodes gets a
    PARTIAL and a WORD address token. The assignment uses range 0-1 as
    the name and one address range covering all address nodes.

    Returns the list of searches produced by ``SearchBuilder.build``.
    """
    query = QueryStruct([Phrase(PhraseType.NONE, '')])
    for _ in range(num_address_parts + 1):
        query.add_node(BreakType.WORD, PhraseType.NONE)
    query.add_node(BreakType.END, PhraseType.NONE)

    partial_name = MyToken(0.5, 1, name_part, 'name_part', True)
    query.add_token(TokenRange(0, 1), TokenType.PARTIAL, partial_name)
    full_name = MyToken(0, 101, name_full, 'name_full', True)
    query.add_token(TokenRange(0, 1), TokenType.WORD, full_name)

    for start in range(1, num_address_parts + 1):
        partial_address = MyToken(0.5, 2, address_part, 'address_part', True)
        query.add_token(TokenRange(start, start + 1), TokenType.PARTIAL,
                        partial_address)
        full_address = MyToken(0, 102, address_full, 'address_full', True)
        query.add_token(TokenRange(start, start + 1), TokenType.WORD,
                        full_address)

    builder = SearchBuilder(query, SearchDetails())
    assignment = TokenAssignment(name=TokenRange(0, 1),
                                 address=[TokenRange(1, 1 + num_address_parts)])

    return list(builder.build(assignment))
411
412
def test_infrequent_partials_in_name():
    """With all values 1, one search looks up the name and restricts by address."""
    results = make_counted_searches(1, 1, 1, 1)

    assert len(results) == 1
    place_search = results[0]

    assert isinstance(place_search, dbs.PlaceSearch)
    assert len(place_search.lookups) == 2
    assert len(place_search.rankings) == 2

    found = {(lookup.column, lookup.lookup_type.__name__)
             for lookup in place_search.lookups}
    assert found == {('name_vector', 'LookupAll'), ('nameaddress_vector', 'Restrict')}
425
426
def test_frequent_partials_in_name_and_address():
    """With 9999 for both partial tokens, two PlaceSearches are produced.

    Ordered by penalty, the first uses LookupAny on the name with an
    address restriction; the second uses LookupAll on both vectors.
    """
    results = make_counted_searches(9999, 1, 9999, 1)

    assert len(results) == 2

    assert all(isinstance(s, dbs.PlaceSearch) for s in results)
    by_penalty = sorted(results, key=lambda s: s.penalty)

    cheapest = {(lookup.column, lookup.lookup_type.__name__)
                for lookup in by_penalty[0].lookups}
    assert cheapest == {('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}

    costliest = {(lookup.column, lookup.lookup_type.__name__)
                 for lookup in by_penalty[1].lookups}
    assert costliest == {('nameaddress_vector', 'LookupAll'), ('name_vector', 'LookupAll')}
439
440
def test_too_frequent_partials_in_name_and_address():
    """With 20000/10000 for the partial tokens, only one PlaceSearch remains.

    That search uses LookupAny on the name with an address restriction.
    """
    results = make_counted_searches(20000, 1, 10000, 1)

    assert len(results) == 1

    assert all(isinstance(s, dbs.PlaceSearch) for s in results)
    by_penalty = sorted(results, key=lambda s: s.penalty)

    cheapest = {(lookup.column, lookup.lookup_type.__name__)
                for lookup in by_penalty[0].lookups}
    assert cheapest == {('name_vector', 'LookupAny'), ('nameaddress_vector', 'Restrict')}