test/bdd/steps/steps_db_ops.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 import logging
   8 from itertools import chain
   9
  10 import psycopg
  11 from psycopg import sql as pysql
  12
  13 from place_inserter import PlaceColumn
  14 from table_compare import NominatimID, DBRow
  15
  16 from nominatim_db.indexer import indexer
  17 from nominatim_db.tokenizer import factory as tokenizer_factory
  18
  19 def check_database_integrity(context):
  20     """ Check some generic constraints on the tables.
  21     """
  22     with context.db.cursor(row_factory=psycopg.rows.tuple_row) as cur:
  23         # place_addressline should not have duplicate (place_id, address_place_id)
  24         cur.execute("""SELECT count(*) FROM
  25                         (SELECT place_id, address_place_id, count(*) as c
  26                          FROM place_addressline GROUP BY place_id, address_place_id) x
  27                        WHERE c > 1""")
  28         assert cur.fetchone()[0] == 0, "Duplicates found in place_addressline"
  29
  30         # word table must not have empty word_tokens
  31         if context.nominatim.tokenizer != 'legacy':
  32             cur.execute("SELECT count(*) FROM word WHERE word_token = ''")
  33             assert cur.fetchone()[0] == 0, "Empty word tokens found in word table"
  34
  35
  36
  37 ################################ GIVEN ##################################
  38
  39 @given("the (?P<named>named )?places")
  40 def add_data_to_place_table(context, named):
  41     """ Add entries into the place table. 'named places' makes sure that
  42         the entries get a random name when none is explicitly given.
  43     """
  44     with context.db.cursor() as cur:
  45         cur.execute('ALTER TABLE place DISABLE TRIGGER place_before_insert')
  46         for row in context.table:
  47             PlaceColumn(context).add_row(row, named is not None).db_insert(cur)
  48         cur.execute('ALTER TABLE place ENABLE TRIGGER place_before_insert')
  49
  50 @given("the relations")
  51 def add_data_to_planet_relations(context):
  52     """ Add entries into the osm2pgsql relation middle table. This is needed
  53         for tests on data that looks up members.
  54     """
  55     with context.db.cursor() as cur:
  56         cur.execute("SELECT value FROM osm2pgsql_properties WHERE property = 'db_format'")
  57         row = cur.fetchone()
  58         if row is None or row['value'] == '1':
  59             for r in context.table:
  60                 last_node = 0
  61                 last_way = 0
  62                 parts = []
  63                 if r['members']:
  64                     members = []
  65                     for m in r['members'].split(','):
  66                         mid = NominatimID(m)
  67                         if mid.typ == 'N':
  68                             parts.insert(last_node, int(mid.oid))
  69                             last_node += 1
  70                             last_way += 1
  71                         elif mid.typ == 'W':
  72                             parts.insert(last_way, int(mid.oid))
  73                             last_way += 1
  74                         else:
  75                             parts.append(int(mid.oid))
  76
  77                         members.extend((mid.typ.lower() + mid.oid, mid.cls or ''))
  78                 else:
  79                     members = None
  80
  81                 tags = chain.from_iterable([(h[5:], r[h]) for h in r.headings if h.startswith("tags+")])
  82
  83                 cur.execute("""INSERT INTO planet_osm_rels (id, way_off, rel_off, parts, members, tags)
  84                                VALUES (%s, %s, %s, %s, %s, %s)""",
  85                             (r['id'], last_node, last_way, parts, members, list(tags)))
  86         else:
  87             for r in context.table:
  88                 if r['members']:
  89                     members = []
  90                     for m in r['members'].split(','):
  91                         mid = NominatimID(m)
  92                         members.append({'ref': mid.oid, 'role': mid.cls or '', 'type': mid.typ})
  93                 else:
  94                     members = []
  95
  96                 tags = {h[5:]: r[h] for h in r.headings if h.startswith("tags+")}
  97
  98                 cur.execute("""INSERT INTO planet_osm_rels (id, tags, members)
  99                                VALUES (%s, %s, %s)""",
 100                             (r['id'], psycopg.types.json.Json(tags),
 101                              psycopg.types.json.Json(members)))
 102
 103 @given("the ways")
 104 def add_data_to_planet_ways(context):
 105     """ Add entries into the osm2pgsql way middle table. This is necessary for
 106         tests on that that looks up node ids in this table.
 107     """
 108     with context.db.cursor() as cur:
 109         cur.execute("SELECT value FROM osm2pgsql_properties WHERE property = 'db_format'")
 110         row = cur.fetchone()
 111         json_tags = row is not None and row['value'] != '1'
 112         for r in context.table:
 113             if json_tags:
 114                 tags = psycopg.types.json.Json({h[5:]: r[h] for h in r.headings if h.startswith("tags+")})
 115             else:
 116                 tags = list(chain.from_iterable([(h[5:], r[h])
 117                                                  for h in r.headings if h.startswith("tags+")]))
 118             nodes = [ int(x.strip()) for x in r['nodes'].split(',') ]
 119
 120             cur.execute("INSERT INTO planet_osm_ways (id, nodes, tags) VALUES (%s, %s, %s)",
 121                         (r['id'], nodes, tags))
 122
 123 ################################ WHEN ##################################
 124
 125 @when("importing")
 126 def import_and_index_data_from_place_table(context):
 127     """ Import data previously set up in the place table.
 128     """
 129     context.nominatim.run_nominatim('import', '--continue', 'load-data',
 130                                               '--index-noanalyse', '-q',
 131                                               '--offline')
 132
 133     check_database_integrity(context)
 134
 135     # Remove the output of the input, when all was right. Otherwise it will be
 136     # output when there are errors that had nothing to do with the import
 137     # itself.
 138     context.log_capture.buffer.clear()
 139
 140 @when("updating places")
 141 def update_place_table(context):
 142     """ Update the place table with the given data. Also runs all triggers
 143         related to updates and reindexes the new data.
 144     """
 145     context.nominatim.run_nominatim('refresh', '--functions')
 146     with context.db.cursor() as cur:
 147         for row in context.table:
 148             col = PlaceColumn(context).add_row(row, False)
 149             col.db_delete(cur)
 150             col.db_insert(cur)
 151         cur.execute('SELECT flush_deleted_places()')
 152
 153     context.nominatim.reindex_placex(context.db)
 154     check_database_integrity(context)
 155
 156     # Remove the output of the input, when all was right. Otherwise it will be
 157     # output when there are errors that had nothing to do with the import
 158     # itself.
 159     context.log_capture.buffer.clear()
 160
 161
 162 @when("updating postcodes")
 163 def update_postcodes(context):
 164     """ Rerun the calculation of postcodes.
 165     """
 166     context.nominatim.run_nominatim('refresh', '--postcodes')
 167
 168 @when("marking for delete (?P<oids>.*)")
 169 def delete_places(context, oids):
 170     """ Remove entries from the place table. Multiple ids may be given
 171         separated by commas. Also runs all triggers
 172         related to updates and reindexes the new data.
 173     """
 174     context.nominatim.run_nominatim('refresh', '--functions')
 175     with context.db.cursor() as cur:
 176         cur.execute('TRUNCATE place_to_be_deleted')
 177         for oid in oids.split(','):
 178             NominatimID(oid).query_osm_id(cur, 'DELETE FROM place WHERE {}')
 179         cur.execute('SELECT flush_deleted_places()')
 180
 181     context.nominatim.reindex_placex(context.db)
 182
 183     # Remove the output of the input, when all was right. Otherwise it will be
 184     # output when there are errors that had nothing to do with the import
 185     # itself.
 186     context.log_capture.buffer.clear()
 187
 188 ################################ THEN ##################################
 189
 190 @then("(?P<table>placex|place) contains(?P<exact> exactly)?")
 191 def check_place_contents(context, table, exact):
 192     """ Check contents of place/placex tables. Each row represents a table row
 193         and all data must match. Data not present in the expected table, may
 194         be arbitrary. The rows are identified via the 'object' column which must
 195         have an identifier of the form '<NRW><osm id>[:<class>]'. When multiple
 196         rows match (for example because 'class' was left out and there are
 197         multiple entries for the given OSM object) then all must match. All
 198         expected rows are expected to be present with at least one database row.
 199         When 'exactly' is given, there must not be additional rows in the database.
 200     """
 201     with context.db.cursor() as cur:
 202         expected_content = set()
 203         for row in context.table:
 204             nid = NominatimID(row['object'])
 205             query = 'SELECT *, ST_AsText(geometry) as geomtxt, ST_GeometryType(geometry) as geometrytype'
 206             if table == 'placex':
 207                 query += ' ,ST_X(centroid) as cx, ST_Y(centroid) as cy'
 208             query += " FROM %s WHERE {}" % (table, )
 209             nid.query_osm_id(cur, query)
 210             assert cur.rowcount > 0, "No rows found for " + row['object']
 211
 212             for res in cur:
 213                 if exact:
 214                     expected_content.add((res['osm_type'], res['osm_id'], res['class']))
 215
 216                 DBRow(nid, res, context).assert_row(row, ['object'])
 217
 218         if exact:
 219             cur.execute(pysql.SQL('SELECT osm_type, osm_id, class from')
 220                         + pysql.Identifier(table))
 221             actual = set([(r['osm_type'], r['osm_id'], r['class']) for r in cur])
 222             assert expected_content == actual, \
 223                    f"Missing entries: {expected_content - actual}\n" \
 224                    f"Not expected in table: {actual - expected_content}"
 225
 226
 227 @then("(?P<table>placex|place) has no entry for (?P<oid>.*)")
 228 def check_place_has_entry(context, table, oid):
 229     """ Ensure that no database row for the given object exists. The ID
 230         must be of the form '<NRW><osm id>[:<class>]'.
 231     """
 232     with context.db.cursor() as cur:
 233         NominatimID(oid).query_osm_id(cur, "SELECT * FROM %s where {}" % table)
 234         assert cur.rowcount == 0, \
 235                "Found {} entries for ID {}".format(cur.rowcount, oid)
 236
 237
 238 @then("search_name contains(?P<exclude> not)?")
 239 def check_search_name_contents(context, exclude):
 240     """ Check contents of place/placex tables. Each row represents a table row
 241         and all data must match. Data not present in the expected table, may
 242         be arbitrary. The rows are identified via the 'object' column which must
 243         have an identifier of the form '<NRW><osm id>[:<class>]'. All
 244         expected rows are expected to be present with at least one database row.
 245     """
 246     tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
 247
 248     with tokenizer.name_analyzer() as analyzer:
 249         with context.db.cursor() as cur:
 250             for row in context.table:
 251                 nid = NominatimID(row['object'])
 252                 nid.row_by_place_id(cur, 'search_name',
 253                                     ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
 254                 assert cur.rowcount > 0, "No rows found for " + row['object']
 255
 256                 for res in cur:
 257                     db_row = DBRow(nid, res, context)
 258                     for name, value in zip(row.headings, row.cells):
 259                         if name in ('name_vector', 'nameaddress_vector'):
 260                             items = [x.strip() for x in value.split(',')]
 261                             tokens = analyzer.get_word_token_info(items)
 262
 263                             if not exclude:
 264                                 assert len(tokens) >= len(items), \
 265                                        "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
 266                             for word, token, wid in tokens:
 267                                 if exclude:
 268                                     assert wid not in res[name], \
 269                                            "Found term for {}/{}: {}".format(nid, name, wid)
 270                                 else:
 271                                     assert wid in res[name], \
 272                                            "Missing term for {}/{}: {}".format(nid, name, wid)
 273                         elif name != 'object':
 274                             assert db_row.contains(name, value), db_row.assert_msg(name, value)
 275
 276 @then("search_name has no entry for (?P<oid>.*)")
 277 def check_search_name_has_entry(context, oid):
 278     """ Check that there is noentry in the search_name table for the given
 279         objects. IDs are in format '<NRW><osm id>[:<class>]'.
 280     """
 281     with context.db.cursor() as cur:
 282         NominatimID(oid).row_by_place_id(cur, 'search_name')
 283
 284         assert cur.rowcount == 0, \
 285                "Found {} entries for ID {}".format(cur.rowcount, oid)
 286
 287 @then("location_postcode contains exactly")
 288 def check_location_postcode(context):
 289     """ Check full contents for location_postcode table. Each row represents a table row
 290         and all data must match. Data not present in the expected table, may
 291         be arbitrary. The rows are identified via 'country' and 'postcode' columns.
 292         All rows must be present as excepted and there must not be additional
 293         rows.
 294     """
 295     with context.db.cursor() as cur:
 296         cur.execute("SELECT *, ST_AsText(geometry) as geomtxt FROM location_postcode")
 297         assert cur.rowcount == len(list(context.table)), \
 298             "Postcode table has {} rows, expected {}.".format(cur.rowcount, len(list(context.table)))
 299
 300         results = {}
 301         for row in cur:
 302             key = (row['country_code'], row['postcode'])
 303             assert key not in results, "Postcode table has duplicate entry: {}".format(row)
 304             results[key] = DBRow((row['country_code'],row['postcode']), row, context)
 305
 306         for row in context.table:
 307             db_row = results.get((row['country'],row['postcode']))
 308             assert db_row is not None, \
 309                 f"Missing row for country '{row['country']}' postcode '{row['postcode']}'."
 310
 311             db_row.assert_row(row, ('country', 'postcode'))
 312
 313 @then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
 314 def check_word_table_for_postcodes(context, exclude, postcodes):
 315     """ Check that the tokenizer produces postcode tokens for the given
 316         postcodes. The postcodes are a comma-separated list of postcodes.
 317         Whitespace matters.
 318     """
 319     nctx = context.nominatim
 320     tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
 321     with tokenizer.name_analyzer() as ana:
 322         plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]
 323
 324     plist.sort()
 325
 326     with context.db.cursor() as cur:
 327         if nctx.tokenizer != 'legacy':
 328             cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
 329                         (plist,))
 330         else:
 331             cur.execute("""SELECT word FROM word WHERE word = any(%s)
 332                              and class = 'place' and type = 'postcode'""",
 333                         (plist,))
 334
 335         found = [row['word'] for row in cur]
 336         assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"
 337
 338     if exclude:
 339         assert len(found) == 0, f"Unexpected postcodes: {found}"
 340     else:
 341         assert set(found) == set(plist), \
 342         f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
 343
 344 @then("place_addressline contains")
 345 def check_place_addressline(context):
 346     """ Check the contents of the place_addressline table. Each row represents
 347         a table row and all data must match. Data not present in the expected
 348         table, may be arbitrary. The rows are identified via the 'object' column,
 349         representing the addressee and the 'address' column, representing the
 350         address item.
 351     """
 352     with context.db.cursor() as cur:
 353         for row in context.table:
 354             nid = NominatimID(row['object'])
 355             pid = nid.get_place_id(cur)
 356             apid = NominatimID(row['address']).get_place_id(cur)
 357             cur.execute(""" SELECT * FROM place_addressline
 358                             WHERE place_id = %s AND address_place_id = %s""",
 359                         (pid, apid))
 360             assert cur.rowcount > 0, \
 361                         "No rows found for place %s and address %s" % (row['object'], row['address'])
 362
 363             for res in cur:
 364                 DBRow(nid, res, context).assert_row(row, ('address', 'object'))
 365
 366 @then("place_addressline doesn't contain")
 367 def check_place_addressline_exclude(context):
 368     """ Check that the place_addressline doesn't contain any entries for the
 369         given addressee/address item pairs.
 370     """
 371     with context.db.cursor() as cur:
 372         for row in context.table:
 373             pid = NominatimID(row['object']).get_place_id(cur)
 374             apid = NominatimID(row['address']).get_place_id(cur, allow_empty=True)
 375             if apid is not None:
 376                 cur.execute(""" SELECT * FROM place_addressline
 377                                 WHERE place_id = %s AND address_place_id = %s""",
 378                             (pid, apid))
 379                 assert cur.rowcount == 0, \
 380                     "Row found for place %s and address %s" % (row['object'], row['address'])
 381
 382 @then("W(?P<oid>\d+) expands to(?P<neg> no)? interpolation")
 383 def check_location_property_osmline(context, oid, neg):
 384     """ Check that the given way is present in the interpolation table.
 385     """
 386     with context.db.cursor() as cur:
 387         cur.execute("""SELECT *, ST_AsText(linegeo) as geomtxt
 388                        FROM location_property_osmline
 389                        WHERE osm_id = %s AND startnumber IS NOT NULL""",
 390                     (oid, ))
 391
 392         if neg:
 393             assert cur.rowcount == 0, "Interpolation found for way {}.".format(oid)
 394             return
 395
 396         todo = list(range(len(list(context.table))))
 397         for res in cur:
 398             for i in todo:
 399                 row = context.table[i]
 400                 if (int(row['start']) == res['startnumber']
 401                     and int(row['end']) == res['endnumber']):
 402                     todo.remove(i)
 403                     break
 404             else:
 405                 assert False, "Unexpected row " + str(res)
 406
 407             DBRow(oid, res, context).assert_row(row, ('start', 'end'))
 408
 409         assert not todo, f"Unmatched lines in table: {list(context.table[i] for i in todo)}"
 410
 411 @then("location_property_osmline contains(?P<exact> exactly)?")
 412 def check_place_contents(context, exact):
 413     """ Check contents of the interpolation table. Each row represents a table row
 414         and all data must match. Data not present in the expected table, may
 415         be arbitrary. The rows are identified via the 'object' column which must
 416         have an identifier of the form '<osm id>[:<startnumber>]'. When multiple
 417         rows match (for example because 'startnumber' was left out and there are
 418         multiple entries for the given OSM object) then all must match. All
 419         expected rows are expected to be present with at least one database row.
 420         When 'exactly' is given, there must not be additional rows in the database.
 421     """
 422     with context.db.cursor() as cur:
 423         expected_content = set()
 424         for row in context.table:
 425             if ':' in row['object']:
 426                 nid, start = row['object'].split(':', 2)
 427                 start = int(start)
 428             else:
 429                 nid, start = row['object'], None
 430
 431             query = """SELECT *, ST_AsText(linegeo) as geomtxt,
 432                               ST_GeometryType(linegeo) as geometrytype
 433                        FROM location_property_osmline WHERE osm_id=%s"""
 434
 435             if ':' in row['object']:
 436                 query += ' and startnumber = %s'
 437                 params = [int(val) for val in row['object'].split(':', 2)]
 438             else:
 439                 params = (int(row['object']), )
 440
 441             cur.execute(query, params)
 442             assert cur.rowcount > 0, "No rows found for " + row['object']
 443
 444             for res in cur:
 445                 if exact:
 446                     expected_content.add((res['osm_id'], res['startnumber']))
 447
 448                 DBRow(nid, res, context).assert_row(row, ['object'])
 449
 450         if exact:
 451             cur.execute('SELECT osm_id, startnumber from location_property_osmline')
 452             actual = set([(r['osm_id'], r['startnumber']) for r in cur])
 453             assert expected_content == actual, \
 454                    f"Missing entries: {expected_content - actual}\n" \
 455                    f"Not expected in table: {actual - expected_content}"
 456