From f74228830d4e757ad59da125b020c08461a61a3e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 24 Feb 2022 11:35:21 +0100 Subject: [PATCH] bdd: run full import on tests This uncovered a couple of outdated/wrong tests which have been fixed, too. --- nominatim/tokenizer/icu_tokenizer.py | 45 +++++++++----- test/bdd/db/import/placex.feature | 64 ------------------- test/bdd/db/import/postcodes.feature | 68 ++++++++++++++++++++- test/bdd/db/import/rank_computation.feature | 2 - test/bdd/db/query/normalization.feature | 4 +- test/bdd/db/query/search_simple.feature | 2 +- test/bdd/db/update/simple.feature | 4 +- test/bdd/osm2pgsql/update/relation.feature | 18 ------ test/bdd/steps/steps_db_ops.py | 40 +++--------- test/bdd/steps/table_compare.py | 5 +- 10 files changed, 115 insertions(+), 137 deletions(-) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 98a1daed..9c25b6d7 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -390,17 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def add_country_names(self, country_code, names): - """ Add names for the given country to the search index. + """ Add default names for the given country to the search index. """ # Make sure any name preprocessing for country names applies. info = PlaceInfo({'name': names, 'country_code': country_code, 'rank_address': 4, 'class': 'boundary', 'type': 'administrative'}) self._add_country_full_names(country_code, - self.sanitizer.process_names(info)[0]) + self.sanitizer.process_names(info)[0], + internal=True) - def _add_country_full_names(self, country_code, names): + def _add_country_full_names(self, country_code, names, internal=False): """ Add names for the given country from an already sanitized name list. """ @@ -412,21 +413,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): with self.conn.cursor() as cur: # Get existing names - cur.execute("""SELECT word_token FROM word - WHERE type = 'C' and word = %s""", + cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal + FROM word + WHERE type = 'C' and word = %s""", (country_code, )) - existing_tokens = {t[0] for t in cur} - - # Only add those names that are not yet in the list. - new_tokens = word_tokens - existing_tokens - if new_tokens: - cur.execute("""INSERT INTO word (word_token, type, word) - (SELECT token, 'C', %s - FROM unnest(%s) as token) - """, (country_code, list(new_tokens))) + existing_tokens = {True: set(), False: set()} # internal/external names + for word in cur: + existing_tokens[word[1]].add(word[0]) # Delete names that no longer exist. - gone_tokens = existing_tokens - word_tokens + gone_tokens = existing_tokens[internal] - word_tokens + if internal: + gone_tokens.update(existing_tokens[False] & word_tokens) if gone_tokens: cur.execute("""DELETE FROM word USING unnest(%s) as token @@ -434,6 +432,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): and word_token = token""", (list(gone_tokens), country_code)) + # Only add those names that are not yet in the list. + new_tokens = word_tokens - existing_tokens[True] + if not internal: + new_tokens -= existing_tokens[False] + if new_tokens: + if internal: + sql = """INSERT INTO word (word_token, type, word, info) + (SELECT token, 'C', %s, '{"internal": "yes"}' + FROM unnest(%s) as token) + """ + else: + sql = """INSERT INTO word (word_token, type, word) + (SELECT token, 'C', %s + FROM unnest(%s) as token) + """ + cur.execute(sql, (country_code, list(new_tokens))) + def process_place(self, place): """ Determine tokenizer information about the given place. diff --git a/test/bdd/db/import/placex.feature b/test/bdd/db/import/placex.feature index 9b208775..e62a5e5b 100644 --- a/test/bdd/db/import/placex.feature +++ b/test/bdd/db/import/placex.feature @@ -61,70 +61,6 @@ Feature: Import into placex When importing Then placex has no entry for R1 - Scenario: search and address ranks for GB post codes correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | E45 2CD | country:gb | - | N2 | place | postcode | E45 2 | country:gb | - | N3 | place | postcode | Y45 | country:gb | - When importing - Then placex contains - | object | addr+postcode | country_code | rank_search | rank_address | - | N1 | E45 2CD | gb | 25 | 5 | - | N2 | E45 2 | gb | 23 | 5 | - | N3 | Y45 | gb | 21 | 5 | - - Scenario: wrongly formatted GB postcodes are down-ranked - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | EA452CD | country:gb | - | N2 | place | postcode | E45 23 | country:gb | - When importing - Then placex contains - | object | country_code | rank_search | rank_address | - | N1 | gb | 30 | 30 | - | N2 | gb | 30 | 30 | - - Scenario: search and address rank for DE postcodes correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 56427 | country:de | - | N2 | place | postcode | 5642 | country:de | - | N3 | place | postcode | 5642A | country:de | - | N4 | place | postcode | 564276 | country:de | - When importing - Then placex contains - | object | country_code | rank_search | rank_address | - | N1 | de | 21 | 11 | - | N2 | de | 30 | 30 | - | N3 | de | 30 | 30 | - | N4 | de | 30 | 30 | - - Scenario: search and address rank for other postcodes are correctly assigned - Given the places - | osm | class | type | postcode | geometry | - | N1 | place | postcode | 1 | country:ca | - | N2 | place | postcode | X3 | country:ca | - | N3 | place | postcode | 543 | country:ca | - | N4 | place | postcode | 54dc | country:ca | - | N5 | place | postcode | 12345 | country:ca | - | N6 | place | postcode | 55TT667 | country:ca | - | N7 | place | postcode | 123-65 | country:ca | - | N8 | place | postcode | 12 445 4 | country:ca | - | N9 | place | postcode | A1:bc10 | country:ca | - When importing - Then placex contains - | object | country_code | rank_search | rank_address | - | N1 | ca | 21 | 11 | - | N2 | ca | 21 | 11 | - | N3 | ca | 21 | 11 | - | N4 | ca | 21 | 11 | - | N5 | ca | 21 | 11 | - | N6 | ca | 21 | 11 | - | N7 | ca | 25 | 11 | - | N8 | ca | 25 | 11 | - | N9 | ca | 25 | 11 | - Scenario: search and address ranks for boundaries are correctly assigned Given the named places | osm | class | type | diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 4c839db0..37c30ef8 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -153,4 +153,70 @@ Feature: Import of postcodes When sending search query "E4 7EA" Then results contain | type | display_name | - | postcode | E4 7EA | + | postcode | E4 7EA | + + Scenario: search and address ranks for GB post codes correctly assigned + Given the places + | osm | class | type | postcode | geometry | + | N1 | place | postcode | E45 2CD | country:gb | + | N2 | place | postcode | E45 2 | country:gb | + | N3 | place | postcode | Y45 | country:gb | + When importing + Then location_postcode contains exactly + | postcode | country | rank_search | rank_address | + | E45 2CD | gb | 25 | 5 | + | E45 2 | gb | 23 | 5 | + | Y45 | gb | 21 | 5 | + + Scenario: wrongly formatted GB postcodes are down-ranked + Given the places + | osm | class | type | postcode | geometry | + | N1 | place | postcode | EA452CD | country:gb | + | N2 | place | postcode | E45 23 | country:gb | + When importing + Then location_postcode contains exactly + | postcode | country | rank_search | rank_address | + | EA452CD | gb | 30 | 30 | + | E45 23 | gb | 30 | 30 | + + Scenario: search and address rank for DE postcodes correctly assigned + Given the places + | osm | class | type | postcode | geometry | + | N1 | place | postcode | 56427 | country:de | + | N2 | place | postcode | 5642 | country:de | + | N3 | place | postcode | 5642A | country:de | + | N4 | place | postcode | 564276 | country:de | + When importing + Then location_postcode contains exactly + | postcode | country | rank_search | rank_address | + | 56427 | de | 21 | 11 | + | 5642 | de | 30 | 30 | + | 5642A | de | 30 | 30 | + | 564276 | de | 30 | 30 | + + Scenario: search and address rank for other postcodes are correctly assigned + Given the places + | osm | class | type | postcode | geometry | + | N1 | place | postcode | 1 | country:ca | + | N2 | place | postcode | X3 | country:ca | + | N3 | place | postcode | 543 | country:ca | + | N4 | place | postcode | 54dc | country:ca | + | N5 | place | postcode | 12345 | country:ca | + | N6 | place | postcode | 55TT667 | country:ca | + | N7 | place | postcode | 123-65 | country:ca | + | N8 | place | postcode | 12 445 4 | country:ca | + | N9 | place | postcode | A1:bc10 | country:ca | + When importing + Then location_postcode contains exactly + | postcode | country | rank_search | rank_address | + | 1 | ca | 21 | 11 | + | X3 | ca | 21 | 11 | + | 543 | ca | 21 | 11 | + | 54DC | ca | 21 | 11 | + | 12345 | ca | 21 | 11 | + | 55TT667 | ca | 21 | 11 | + | 123-65 | ca | 25 | 11 | + | 12 445 4 | ca | 25 | 11 | + | A1:BC10 | ca | 25 | 11 | + + diff --git a/test/bdd/db/import/rank_computation.feature b/test/bdd/db/import/rank_computation.feature index c8b5de5c..f0dcfe16 100644 --- a/test/bdd/db/import/rank_computation.feature +++ b/test/bdd/db/import/rank_computation.feature @@ -16,7 +16,6 @@ Feature: Rank assignment | N18 | place | city | 0 0 | | N19 | place | island | 0 0 | | N36 | place | house | 0 0 | - | N38 | place | houses | 0 0 | And the named places | osm | class | type | extra+capital | geometry | | N101 | place | city | yes | 0 0 | @@ -35,7 +34,6 @@ Feature: Rank assignment | N19 | 17 | 0 | | N101 | 15 | 16 | | N36 | 30 | 30 | - | N38 | 28 | 0 | Scenario: Ranks for boundaries are assigned according to admin level Given the named places diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature index 304496e2..162a59a4 100644 --- a/test/bdd/db/query/normalization.feature +++ b/test/bdd/db/query/normalization.feature @@ -164,8 +164,8 @@ Feature: Import and search of names Scenario: Unprintable characters in postcodes are ignored Given the named places - | osm | class | type | address | - | N234 | amenity | prison | 'postcode' : u'1234\u200e' | + | osm | class | type | address | geometry | + | N234 | amenity | prison | 'postcode' : u'1234\u200e' | country:de | When importing And sending search query "1234" Then result 0 has not attributes osm_type diff --git a/test/bdd/db/query/search_simple.feature b/test/bdd/db/query/search_simple.feature index bcd73eaf..3672bb89 100644 --- a/test/bdd/db/query/search_simple.feature +++ b/test/bdd/db/query/search_simple.feature @@ -25,7 +25,7 @@ Feature: Searching of simple objects | osm | class | type | postcode | geometry | | R1 | boundary | postal_code | 54321 | poly-area:1.0 | And sending search query "12345" - Then result 0 has not attributes osm_type + Then exactly 0 results are returned When sending search query "54321" Then results contain | ID | osm | diff --git a/test/bdd/db/update/simple.feature b/test/bdd/db/update/simple.feature index ccaacad0..ccc42c2a 100644 --- a/test/bdd/db/update/simple.feature +++ b/test/bdd/db/update/simple.feature @@ -63,9 +63,7 @@ Feature: Update of simple objects | osm | class | type | postcode | geometry | | N3 | place | postcode | 12345 | 1 -1 | When importing - Then placex contains - | object | class | type | - | N3 | place | postcode | + Then placex has no entry for N3 When updating places | osm | class | type | postcode | housenr | geometry | | N3 | place | house | 12345 | 13 | 1 -1 | diff --git a/test/bdd/osm2pgsql/update/relation.feature b/test/bdd/osm2pgsql/update/relation.feature index fd8b83e3..794ef5c5 100644 --- a/test/bdd/osm2pgsql/update/relation.feature +++ b/test/bdd/osm2pgsql/update/relation.feature @@ -139,21 +139,3 @@ Feature: Update of relations by osm2pgsql Then place contains | object | addr+country | name | | R1 | XX | 'name' : 'Foo' | - - Scenario: Country boundary names are extended when country_code known - When loading osm data - """ - n200 Tamenity=prison x0 y0 - n201 x0 y0.0001 - n202 x0.0001 y0.0001 - n203 x0.0001 y0 - """ - And updating osm data - """ - w1 Nn200,n201,n202,n203,n200 - r1 Ttype=boundary,boundary=administrative,name=Foo,country_code=ch,admin_level=2 Mw1@ - """ - Then place contains - | object | addr+country | name+name:de | name+name | - | R1 | ch | Schweiz | Foo | - diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index 4970f6df..8df5d617 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -93,29 +93,8 @@ def add_data_to_planet_ways(context): def import_and_index_data_from_place_table(context): """ Import data previously set up in the place table. """ - nctx = context.nominatim - - tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config()) - context.nominatim.copy_from_place(context.db) - - # XXX use tool function as soon as it is ported - with context.db.cursor() as cur: - with (context.nominatim.src_dir / 'lib-sql' / 'postcode_tables.sql').open('r') as fd: - cur.execute(fd.read()) - cur.execute(""" - INSERT INTO location_postcode - (place_id, indexed_status, country_code, postcode, geometry) - SELECT nextval('seq_place'), 1, country_code, - upper(trim (both ' ' from address->'postcode')) as pc, - ST_Centroid(ST_Collect(ST_Centroid(geometry))) - FROM placex - WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%' - AND geometry IS NOT null - GROUP BY country_code, pc""") - - # Call directly as the refresh function does not include postcodes. - indexer.LOG.setLevel(logging.ERROR) - indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False) + context.nominatim.run_nominatim('import', '--continue', 'load-data', + '--index-noanalyse', '-q') check_database_integrity(context) @@ -268,7 +247,7 @@ def check_location_postcode(context): for row in context.table: db_row = results.get((row['country'],row['postcode'])) assert db_row is not None, \ - "Missing row for country '{r['country']}' postcode '{r['postcode']}'.".format(r=row) + f"Missing row for country '{row['country']}' postcode '{row['postcode']}'." db_row.assert_row(row, ('country', 'postcode')) @@ -333,12 +312,13 @@ def check_place_addressline_exclude(context): with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: for row in context.table: pid = NominatimID(row['object']).get_place_id(cur) - apid = NominatimID(row['address']).get_place_id(cur) - cur.execute(""" SELECT * FROM place_addressline - WHERE place_id = %s AND address_place_id = %s""", - (pid, apid)) - assert cur.rowcount == 0, \ - "Row found for place %s and address %s" % (row['object'], row['address']) + apid = NominatimID(row['address']).get_place_id(cur, allow_empty=True) + if apid is not None: + cur.execute(""" SELECT * FROM place_addressline + WHERE place_id = %s AND address_place_id = %s""", + (pid, apid)) + assert cur.rowcount == 0, \ + "Row found for place %s and address %s" % (row['object'], row['address']) @then("W(?P\d+) expands to(?P no)? interpolation") def check_location_property_osmline(context, oid, neg): diff --git a/test/bdd/steps/table_compare.py b/test/bdd/steps/table_compare.py index 481a29a0..ca6c3020 100644 --- a/test/bdd/steps/table_compare.py +++ b/test/bdd/steps/table_compare.py @@ -62,11 +62,14 @@ class NominatimID: ','.join(['*'] + (extra_columns or [])), table) cur.execute(query, (pid, )) - def get_place_id(self, cur): + def get_place_id(self, cur, allow_empty=False): """ Look up the place id for the ID. Throws an assertion if the ID is not unique. """ self.query_osm_id(cur, "SELECT place_id FROM placex WHERE {}") + if cur.rowcount == 0 and allow_empty: + return None + assert cur.rowcount == 1, \ "Place ID {!s} not unique. Found {} entries.".format(self, cur.rowcount) -- 2.39.5