]> git.openstreetmap.org Git - nominatim.git/blob - test/python/test_indexing.py
add more tests for legacy tokenizer
[nominatim.git] / test / python / test_indexing.py
1 """
2 Tests for running the indexing.
3 """
4 import itertools
5 import psycopg2
6 import pytest
7
8 from nominatim.indexer import indexer
9 from nominatim.tokenizer import factory
10
class IndexerTestDB:
    """ Reduced database layout used by the indexing tests.

        On construction the class creates cut-down versions of the
        placex, location_property_osmline and location_postcode tables,
        stub SQL functions and an update trigger that stamps
        indexed_date when a row switches to indexed_status 0. The
        remaining methods insert unindexed rows and count what is left
        to index.
    """

    def __init__(self, conn):
        """ Build the test schema on `conn`.

            `conn` is presumably a psycopg2 connection (confirm against
            the `temp_db_conn` fixture); it is switched to isolation
            level 0 before any statement is run.
        """
        self.conn = conn
        self.conn.set_isolation_level(0)

        # Independent id ranges for the three kinds of test rows.
        self.placex_id = itertools.count(100000)
        self.osmline_id = itertools.count(500000)
        self.postcode_id = itertools.count(700000)

        # Executed in exactly this order: extension, tables, functions.
        schema_statements = (
            'CREATE EXTENSION hstore',
            """CREATE TABLE placex (place_id BIGINT,
                                                class TEXT,
                                                type TEXT,
                                                rank_address SMALLINT,
                                                rank_search SMALLINT,
                                                indexed_status SMALLINT,
                                                indexed_date TIMESTAMP,
                                                partition SMALLINT,
                                                admin_level SMALLINT,
                                                address HSTORE,
                                                token_info JSONB,
                                                geometry_sector INTEGER)""",
            """CREATE TABLE location_property_osmline (
                               place_id BIGINT,
                               osm_id BIGINT,
                               address HSTORE,
                               token_info JSONB,
                               indexed_status SMALLINT,
                               indexed_date TIMESTAMP,
                               geometry_sector INTEGER)""",
            """CREATE TABLE location_postcode (
                               place_id BIGINT,
                               indexed_status SMALLINT,
                               indexed_date TIMESTAMP,
                               country_code varchar(2),
                               postcode TEXT)""",
            """CREATE OR REPLACE FUNCTION date_update() RETURNS TRIGGER
                           AS $$
                           BEGIN
                             IF NEW.indexed_status = 0 and OLD.indexed_status != 0 THEN
                               NEW.indexed_date = now();
                             END IF;
                             RETURN NEW;
                           END; $$ LANGUAGE plpgsql;""",
            """CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
                                                      OUT name HSTORE,
                                                      OUT address HSTORE,
                                                      OUT country_feature VARCHAR)
                           AS $$
                           BEGIN
                            address := p.address;
                            name := p.address;
                           END;
                           $$ LANGUAGE plpgsql STABLE;
                        """,
            """CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
                           RETURNS HSTORE AS $$
                           BEGIN
                             RETURN in_address;
                           END;
                           $$ LANGUAGE plpgsql STABLE;
                        """,
        )

        # One trigger per table so that indexed_date records when a row
        # was marked as indexed.
        trigger_template = """CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                               FOR EACH ROW EXECUTE PROCEDURE date_update()
                            """
        triggered_tables = ('placex', 'location_property_osmline', 'location_postcode')

        with self.conn.cursor() as cur:
            for statement in schema_statements:
                cur.execute(statement)

            for table in triggered_tables:
                cur.execute(trigger_template.format(table))

    def scalar(self, query):
        """ Execute `query` and return the first column of its first row.
        """
        with self.conn.cursor() as cur:
            cur.execute(query)
            first_row = cur.fetchone()
            return first_row[0]

    def add_place(self, cls='place', typ='locality',
                  rank_search=30, rank_address=30, sector=20):
        """ Insert a placex row with indexed_status 1 and return its
            newly allocated place_id.
        """
        place_id = next(self.placex_id)
        values = (place_id, cls, typ, rank_search, rank_address, sector)
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO placex
                              (place_id, class, type, rank_search, rank_address,
                               indexed_status, geometry_sector)
                              VALUES (%s, %s, %s, %s, %s, 1, %s)""",
                        values)
        return place_id

    def add_admin(self, **kwargs):
        """ Insert a placex row of class 'boundary' and type
            'administrative'. Other keyword arguments are forwarded to
            add_place(); any `cls` or `typ` given by the caller is
            overridden.
        """
        return self.add_place(**dict(kwargs, cls='boundary', typ='administrative'))

    def add_osmline(self, sector=20):
        """ Insert a location_property_osmline row with indexed_status 1,
            using the same fresh id as place_id and osm_id, and return it.
        """
        line_id = next(self.osmline_id)
        values = (line_id, line_id, sector)
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO location_property_osmline
                              (place_id, osm_id, indexed_status, geometry_sector)
                              VALUES (%s, %s, 1, %s)""",
                        values)
        return line_id

    def add_postcode(self, country, postcode):
        """ Insert a location_postcode row with indexed_status 1 for the
            given country code and postcode and return its place_id.
        """
        postcode_place_id = next(self.postcode_id)
        values = (postcode_place_id, country, postcode)
        with self.conn.cursor() as cur:
            cur.execute("""INSERT INTO location_postcode
                            (place_id, indexed_status, country_code, postcode)
                            VALUES (%s, 1, %s, %s)""",
                        values)
        return postcode_place_id

    def placex_unindexed(self):
        """ Return the number of placex rows with indexed_status > 0.
        """
        count_query = 'SELECT count(*) from placex where indexed_status > 0'
        return self.scalar(count_query)

    def osmline_unindexed(self):
        """ Return the number of location_property_osmline rows with
            indexed_status > 0.
        """
        count_query = 'SELECT count(*) from location_property_osmline where indexed_status > 0'
        return self.scalar(count_query)
124
125
@pytest.fixture
def test_db(temp_db_conn):
    """ Provide an IndexerTestDB whose schema has been created on the
        connection supplied by the `temp_db_conn` fixture.
    """
    database = IndexerTestDB(temp_db_conn)
    yield database
129
130
@pytest.fixture
def test_tokenizer(tokenizer_mock, def_config, tmp_path):
    """ Create a tokenizer through the factory, with the project
        directory of the default configuration pointed at `tmp_path`.

        `tokenizer_mock` is not used directly; presumably it is requested
        for its set-up side effects (confirm in conftest).
    """
    def_config.project_dir = tmp_path
    tokenizer = factory.create_tokenizer(def_config)
    return tokenizer
135
136
@pytest.mark.parametrize("threads", [1, 15])
def test_index_all_by_rank(test_db, threads, test_tokenizer):
    """ Index ranks 0 to 30 and check that nothing is left unindexed and
        that the indexed_date stamps reflect the expected processing order.
    """
    for rank in range(31):
        test_db.add_place(rank_address=rank, rank_search=rank)
    test_db.add_osmline()

    assert test_db.placex_unindexed() == 31
    assert test_db.osmline_unindexed() == 1

    dsn = 'dbname=test_nominatim_python_unittest'
    indexer.Indexer(dsn, test_tokenizer, threads).index_by_rank(0, 30)

    assert test_db.placex_unindexed() == 0
    assert test_db.osmline_unindexed() == 0

    # Every query counts the rows that violate one expectation, so each
    # of them has to come back with zero.
    violation_queries = (
        # indexed rows all carry a timestamp
        """SELECT count(*) from placex
                               WHERE indexed_status = 0 and indexed_date is null""",
        # ranks come in order of rank address
        """
        SELECT count(*) FROM placex p WHERE rank_address > 0
          AND indexed_date >= (SELECT min(indexed_date) FROM placex o
                               WHERE p.rank_address < o.rank_address)""",
        # placex rank < 30 objects come before interpolations
        """SELECT count(*) FROM placex WHERE rank_address < 30
             AND indexed_date > (SELECT min(indexed_date) FROM location_property_osmline)""",
        # placex rank = 30 objects come after interpolations
        """SELECT count(*) FROM placex WHERE rank_address = 30
             AND indexed_date < (SELECT max(indexed_date) FROM location_property_osmline)""",
        # rank 0 comes after rank 29 ...
        """SELECT count(*) FROM placex WHERE rank_address < 30
             AND indexed_date > (SELECT min(indexed_date) FROM placex WHERE rank_address = 0)""",
        # ... and before rank 30
        """SELECT count(*) FROM placex WHERE rank_address = 30
             AND indexed_date < (SELECT max(indexed_date) FROM placex WHERE rank_address = 0)""",
    )
    for query in violation_queries:
        assert test_db.scalar(query) == 0
174
175
@pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_without_30(test_db, threads, test_tokenizer):
    """ Index only ranks 4 to 15: the twelve matching places get indexed,
        everything else including the interpolation line stays untouched.
    """
    for rank in range(31):
        test_db.add_place(rank_address=rank, rank_search=rank)
    test_db.add_osmline()

    assert test_db.placex_unindexed() == 31
    assert test_db.osmline_unindexed() == 1

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, threads)
    worker.index_by_rank(4, 15)

    assert test_db.placex_unindexed() == 19
    assert test_db.osmline_unindexed() == 1

    # No place outside of the requested rank range may have been indexed.
    indexed_outside_range = test_db.scalar("""
                    SELECT count(*) FROM placex
                      WHERE indexed_status = 0 AND not rank_address between 4 and 15""")
    assert indexed_outside_range == 0
195
196
@pytest.mark.parametrize("threads", [1, 15])
def test_index_partial_with_30(test_db, threads, test_tokenizer):
    """ Index ranks 28 to 30: 27 places remain unindexed afterwards, the
        interpolation line is indexed and ranks 1 to 27 stay untouched.
    """
    for rank in range(31):
        test_db.add_place(rank_address=rank, rank_search=rank)
    test_db.add_osmline()

    assert test_db.placex_unindexed() == 31
    assert test_db.osmline_unindexed() == 1

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, threads)
    worker.index_by_rank(28, 30)

    assert test_db.placex_unindexed() == 27
    assert test_db.osmline_unindexed() == 0

    # None of the places with rank 1 to 27 may have been indexed.
    indexed_low_ranks = test_db.scalar("""
                    SELECT count(*) FROM placex
                      WHERE indexed_status = 0 AND rank_address between 1 and 27""")
    assert indexed_low_ranks == 0
215
@pytest.mark.parametrize("threads", [1, 15])
def test_index_boundaries(test_db, threads, test_tokenizer):
    """ Index boundaries only: the six administrative boundaries get
        indexed while ordinary places and the interpolation line do not.
    """
    for rank in range(4, 10):
        test_db.add_admin(rank_address=rank, rank_search=rank)
    for rank in range(31):
        test_db.add_place(rank_address=rank, rank_search=rank)
    test_db.add_osmline()

    assert test_db.placex_unindexed() == 37
    assert test_db.osmline_unindexed() == 1

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, threads)
    worker.index_boundaries(0, 30)

    assert test_db.placex_unindexed() == 31
    assert test_db.osmline_unindexed() == 1

    # Anything that was indexed has to be of class 'boundary'.
    indexed_non_boundaries = test_db.scalar("""
                    SELECT count(*) FROM placex
                      WHERE indexed_status = 0 AND class != 'boundary'""")
    assert indexed_non_boundaries == 0
236
237
@pytest.mark.parametrize("threads", [1, 15])
def test_index_postcodes(test_db, threads, test_tokenizer):
    """ Index postcodes of two countries and check that no postcode row
        is left with a non-zero indexed_status.
    """
    # 1000 postcodes per country, inserted country by country.
    postcode_ranges = (('de', range(1000)), ('us', range(32000, 33000)))
    for country, postcodes in postcode_ranges:
        for postcode in postcodes:
            test_db.add_postcode(country, postcode)

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, threads)
    worker.index_postcodes()

    remaining = test_db.scalar("""SELECT count(*) FROM location_postcode
                                  WHERE indexed_status != 0""")
    assert remaining == 0
250
251
@pytest.mark.parametrize("analyse", [True, False])
def test_index_full(test_db, analyse, test_tokenizer):
    """ Run a full indexing pass with four threads, with and without the
        `analyse` option, and check that places, interpolation lines and
        postcodes are all indexed afterwards.
    """
    for rank in range(4, 10):
        test_db.add_admin(rank_address=rank, rank_search=rank)
    for rank in range(31):
        test_db.add_place(rank_address=rank, rank_search=rank)
    test_db.add_osmline()
    for postcode in range(1000):
        test_db.add_postcode('de', postcode)

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, 4)
    worker.index_full(analyse=analyse)

    assert test_db.placex_unindexed() == 0
    assert test_db.osmline_unindexed() == 0

    remaining_postcodes = test_db.scalar("""SELECT count(*) FROM location_postcode
                                  WHERE indexed_status != 0""")
    assert remaining_postcodes == 0
269
270
@pytest.mark.parametrize("threads", [1, 15])
def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
    """ Index 1000 rank-30 places with WorkerPool.REOPEN_CONNECTIONS_AFTER
        patched down to 15 and check that all places end up indexed.
    """
    monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)

    for _ in range(1000):
        test_db.add_place(rank_address=30, rank_search=30)

    dsn = 'dbname=test_nominatim_python_unittest'
    worker = indexer.Indexer(dsn, test_tokenizer, threads)
    worker.index_by_rank(28, 30)

    assert test_db.placex_unindexed() == 0