From a263e54b9463c89addeb0ac613d0586378de22f2 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 5 May 2021 10:00:34 +0200 Subject: [PATCH] enable BDD tests for different tokenizers The tokenizer to be used can be choosen with -DTOKENIZER. Adapt all tests, so that they work with legacy_icu tokenizer. Move lookup in word table to a function in the tokenizer. Special phrases are temporarily imported from the wiki until we have an implementation that can import from file. TIGER tests do not work yet. --- nominatim/tokenizer/legacy_icu_tokenizer.py | 46 +++++++++++++--- nominatim/tokenizer/legacy_tokenizer.py | 27 ++++++++++ test/bdd/api/search/queries.feature | 2 +- test/bdd/db/import/naming.feature | 2 - test/bdd/environment.py | 1 + test/bdd/steps/nominatim_environment.py | 17 ++++-- test/bdd/steps/steps_db_ops.py | 59 +++++++++------------ 7 files changed, 106 insertions(+), 48 deletions(-) diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 6fba6c8d..696004f5 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -35,7 +35,7 @@ def create(dsn, data_dir): class LegacyICUTokenizer: """ This tokenizer uses libICU to covert names and queries to ASCII. Otherwise it uses the same algorithms and data structures as the - normalization routines in Nominatm 3. + normalization routines in Nominatim 3. """ def __init__(self, dsn, data_dir): @@ -228,6 +228,35 @@ class LegacyICUNameAnalyzer: self.conn = None + def get_word_token_info(self, conn, words): + """ Return token information for the given list of words. + If a word starts with # it is assumed to be a full name + otherwise is a partial name. + + The function returns a list of tuples with + (original word, word token, word id). + + The function is used for testing and debugging only + and not necessarily efficient. + """ + tokens = {} + for word in words: + if word.startswith('#'): + tokens[word] = ' ' + self.make_standard_word(word[1:]) + else: + tokens[word] = self.make_standard_word(word) + + with conn.cursor() as cur: + cur.execute("""SELECT word_token, word_id + FROM word, (SELECT unnest(%s::TEXT[]) as term) t + WHERE word_token = t.term + and class is null and country_code is null""", + (list(tokens.values()), )) + ids = {r[0]: r[1] for r in cur} + + return [(k, v, ids[v]) for k, v in tokens.items()] + + def normalize(self, phrase): """ Normalize the given phrase, i.e. remove all properties that are irrelevant for search. @@ -236,7 +265,7 @@ class LegacyICUNameAnalyzer: @functools.lru_cache(maxsize=1024) def make_standard_word(self, name): - """ Create the normalised version of the name. + """ Create the normalised version of the input. """ norm = ' ' + self.transliterator.transliterate(name) + ' ' for full, abbr in self.abbreviations: @@ -333,24 +362,25 @@ class LegacyICUNameAnalyzer: """ full_names = set((self.make_standard_word(n) for n in names)) full_names.discard('') - self._add_normalised_country_names(country_code, full_names) + self._add_normalized_country_names(country_code, full_names) - def _add_normalised_country_names(self, country_code, names): + def _add_normalized_country_names(self, country_code, names): """ Add names for the given country to the search index. """ + word_tokens = set((' ' + name for name in names)) with self.conn.cursor() as cur: # Get existing names cur.execute("SELECT word_token FROM word WHERE country_code = %s", (country_code, )) - new_names = names.difference((t[0] for t in cur)) + word_tokens.difference_update((t[0] for t in cur)) - if new_names: + if word_tokens: cur.execute("""INSERT INTO word (word_id, word_token, country_code, search_name_count) (SELECT nextval('seq_word'), token, '{}', 0 FROM unnest(%s) as token) - """.format(country_code), (list(new_names),)) + """.format(country_code), (list(word_tokens),)) def process_place(self, place): @@ -371,7 +401,7 @@ class LegacyICUNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self._add_normalised_country_names(country_feature.lower(), + self._add_normalized_country_names(country_feature.lower(), full_names) address = place.get('address') diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 2f060b84..438a5aff 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -271,6 +271,33 @@ class LegacyNameAnalyzer: self.conn = None + @staticmethod + def get_word_token_info(conn, words): + """ Return token information for the given list of words. + If a word starts with # it is assumed to be a full name + otherwise is a partial name. + + The function returns a list of tuples with + (original word, word token, word id). + + The function is used for testing and debugging only + and not necessarily efficient. + """ + with conn.cursor() as cur: + cur.execute("""SELECT t.term, word_token, word_id + FROM word, (SELECT unnest(%s::TEXT[]) as term) t + WHERE word_token = (CASE + WHEN left(t.term, 1) = '#' THEN + ' ' || make_standard_name(substring(t.term from 2)) + ELSE + make_standard_name(t.term) + END) + and class is null and country_code is null""", + (words, )) + + return [(r[0], r[1], r[2]) for r in cur] + + def normalize(self, phrase): """ Normalize the given phrase, i.e. remove all properties that are irrelevant for search. diff --git a/test/bdd/api/search/queries.feature b/test/bdd/api/search/queries.feature index ea353f45..6d697ef9 100644 --- a/test/bdd/api/search/queries.feature +++ b/test/bdd/api/search/queries.feature @@ -163,7 +163,7 @@ Feature: Search queries Then exactly 0 results are returned Scenario: Ignore country searches when query is restricted to countries - When sending json search query "de" + When sending json search query "fr" | countrycodes | | li | Then exactly 0 results are returned diff --git a/test/bdd/db/import/naming.feature b/test/bdd/db/import/naming.feature index bb29d2a3..9e11eacd 100644 --- a/test/bdd/db/import/naming.feature +++ b/test/bdd/db/import/naming.feature @@ -55,6 +55,4 @@ Feature: Import and search of names | Вологда | | Αθήνα | | القاهرة | - | រាជធានីភ្នំពេញ | | 東京都 | - | ပုဗ္ဗသီရိ | diff --git a/test/bdd/environment.py b/test/bdd/environment.py index 30ea30a2..f179c8f1 100644 --- a/test/bdd/environment.py +++ b/test/bdd/environment.py @@ -20,6 +20,7 @@ userconfig = { 'API_TEST_DB' : 'test_api_nominatim', 'API_TEST_FILE' : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(), 'SERVER_MODULE_PATH' : None, + 'TOKENIZER' : None, # Test with a custom tokenizer 'PHPCOV' : False, # set to output directory to enable code coverage } diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py index f6e3a039..de02e346 100644 --- a/test/bdd/steps/nominatim_environment.py +++ b/test/bdd/steps/nominatim_environment.py @@ -28,6 +28,7 @@ class NominatimEnvironment: self.test_db = config['TEST_DB'] self.api_test_db = config['API_TEST_DB'] self.api_test_file = config['API_TEST_FILE'] + self.tokenizer = config['TOKENIZER'] self.server_module_path = config['SERVER_MODULE_PATH'] self.reuse_template = not config['REMOVE_TEMPLATE'] self.keep_scenario_db = config['KEEP_TEST_DB'] @@ -96,6 +97,8 @@ class NominatimEnvironment: self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve()) self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve()) self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve()) + if self.tokenizer is not None: + self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer if self.server_module_path: self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path @@ -189,11 +192,19 @@ class NominatimEnvironment: try: self.run_nominatim('import', '--osm-file', str(self.api_test_file)) - self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) + if self.tokenizer != 'legacy_icu': + self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve())) self.run_nominatim('freeze') - phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) - run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) + if self.tokenizer != 'legacy_icu': + phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve()) + run_script(['psql', '-d', self.api_test_db, '-f', phrase_file]) + else: + # XXX Temporary use the wiki while there is no CSV import + # available. + self.test_env['NOMINATIM_LANGUAGES'] = 'en' + self.run_nominatim('special-phrases', '--import-from-wiki') + del self.test_env['NOMINATIM_LANGUAGES'] except: self.db_drop_database(self.api_test_db) raise diff --git a/test/bdd/steps/steps_db_ops.py b/test/bdd/steps/steps_db_ops.py index 52a50a51..6d7bc188 100644 --- a/test/bdd/steps/steps_db_ops.py +++ b/test/bdd/steps/steps_db_ops.py @@ -199,44 +199,35 @@ def check_search_name_contents(context, exclude): have an identifier of the form '[:]'. All expected rows are expected to be present with at least one database row. """ - with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - for row in context.table: - nid = NominatimID(row['object']) - nid.row_by_place_id(cur, 'search_name', - ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy']) - assert cur.rowcount > 0, "No rows found for " + row['object'] + tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config()) + + with tokenizer.name_analyzer() as analyzer: + with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + for row in context.table: + nid = NominatimID(row['object']) + nid.row_by_place_id(cur, 'search_name', + ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy']) + assert cur.rowcount > 0, "No rows found for " + row['object'] + + for res in cur: + db_row = DBRow(nid, res, context) + for name, value in zip(row.headings, row.cells): + if name in ('name_vector', 'nameaddress_vector'): + items = [x.strip() for x in value.split(',')] + tokens = analyzer.get_word_token_info(context.db, items) - for res in cur: - db_row = DBRow(nid, res, context) - for name, value in zip(row.headings, row.cells): - if name in ('name_vector', 'nameaddress_vector'): - items = [x.strip() for x in value.split(',')] - with context.db.cursor() as subcur: - subcur.execute(""" SELECT word_id, word_token - FROM word, (SELECT unnest(%s::TEXT[]) as term) t - WHERE word_token = make_standard_name(t.term) - and class is null and country_code is null - and operator is null - UNION - SELECT word_id, word_token - FROM word, (SELECT unnest(%s::TEXT[]) as term) t - WHERE word_token = ' ' || make_standard_name(t.term) - and class is null and country_code is null - and operator is null - """, - (list(filter(lambda x: not x.startswith('#'), items)), - list(filter(lambda x: x.startswith('#'), items)))) if not exclude: - assert subcur.rowcount >= len(items), \ - "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount) - for wid in subcur: - present = wid[0] in res[name] + assert len(tokens) >= len(items), \ + "No word entry found for {}. Entries found: {!s}".format(value, len(tokens)) + for word, token, wid in tokens: if exclude: - assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1]) + assert wid not in res[name], \ + "Found term for {}/{}: {}".format(nid, name, wid) else: - assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1]) - elif name != 'object': - assert db_row.contains(name, value), db_row.assert_msg(name, value) + assert wid in res[name], \ + "Missing term for {}/{}: {}".format(nid, name, wid) + elif name != 'object': + assert db_row.contains(name, value), db_row.assert_msg(name, value) @then("search_name has no entry for (?P.*)") def check_search_name_has_entry(context, oid): -- 2.39.5