From: Sarah Hoffmann Date: Tue, 27 Apr 2021 09:37:18 +0000 (+0200) Subject: move default country name creation to tokenizer X-Git-Tag: v4.0.0~93^2~15 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/bef300305e3c445fe483b806b838ad8ce5b689f6 move default country name creation to tokenizer The new function is also used, when a country us updated. All SQL function related to country names have been removed. --- diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 5e80b414..2bbf3ed7 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -202,29 +202,6 @@ $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT, - lookup_country_code varchar(2)) - RETURNS INTEGER - AS $$ -DECLARE - lookup_token TEXT; - return_word_id INTEGER; -BEGIN - lookup_token := ' '||trim(lookup_word); - SELECT min(word_id) FROM word - WHERE word_token = lookup_token and country_code=lookup_country_code - INTO return_word_id; - IF return_word_id IS NULL THEN - return_word_id := nextval('seq_word'); - INSERT INTO word VALUES (return_word_id, lookup_token, null, - null, null, lookup_country_code, 0); - END IF; - RETURN return_word_id; -END; -$$ -LANGUAGE plpgsql; - - CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT, lookup_class text, lookup_type text) RETURNS INTEGER @@ -363,36 +340,6 @@ $$ LANGUAGE plpgsql STABLE STRICT; -CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2)) - RETURNS VOID - AS $$ -DECLARE - s TEXT; - w INTEGER; - words TEXT[]; - item RECORD; - j INTEGER; -BEGIN - FOR item IN SELECT (each(src)).* LOOP - - s := make_standard_name(item.value); - w := getorcreate_country(s, country_code); - - words := regexp_split_to_array(item.value, E'[,;()]'); - IF array_upper(words, 1) != 1 THEN - FOR j IN 1..array_upper(words, 1) LOOP - s := make_standard_name(words[j]); - IF s != '' THEN - w := getorcreate_country(s, country_code); - END IF; - END LOOP; - END IF; - END LOOP; -END; -$$ -LANGUAGE plpgsql; - - CREATE OR REPLACE FUNCTION make_keywords(src HSTORE) RETURNS INTEGER[] AS $$ diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 1ce9cf3e..0f19d097 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -133,7 +133,8 @@ class SetupAll: database_import.create_search_indices(conn, args.config, drop=args.no_updates) LOG.warning('Create search index for default country names.') - database_import.create_country_names(conn, args.config) + database_import.create_country_names(conn, tokenizer, + args.config.LANGUAGES) webdir = args.project_dir / 'website' LOG.warning('Setup website at %s', webdir) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 7d273064..d4068aea 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -223,6 +223,21 @@ class LegacyNameAnalyzer: FROM (SELECT distinct(postcode) as pc FROM location_postcode) x""") + + def add_country_names(self, country_code, names): + """ Add names for the given country to the search index. + """ + with self.conn.cursor() as cur: + cur.execute( + """INSERT INTO word (word_id, word_token, country_code) + (SELECT nextval('seq_word'), lookup_token, %s + FROM (SELECT ' ' || make_standard_name(n) as lookup_token + FROM unnest(%s)n) y + WHERE NOT EXISTS(SELECT * FROM word + WHERE word_token = lookup_token and country_code = %s)) + """, (country_code, names, country_code)) + + def process_place(self, place): """ Determine tokenizer information about the given place. @@ -231,7 +246,14 @@ class LegacyNameAnalyzer: """ token_info = _TokenInfo(self._cache) - token_info.add_names(self.conn, place.get('name'), place.get('country_feature')) + names = place.get('name') + + if names: + token_info.add_names(self.conn, names) + + country_feature = place.get('country_feature') + if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): + self.add_country_names(country_feature.lower(), list(names.values())) address = place.get('address') @@ -279,22 +301,14 @@ class _TokenInfo: self.data = {} - def add_names(self, conn, names, country_feature): + def add_names(self, conn, names): """ Add token information for the names of the place. """ - if not names: - return - with conn.cursor() as cur: # Create the token IDs for all names. self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text", (names, )) - # Add country tokens to word table if necessary. - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - cur.execute("SELECT create_country(%s, %s)", - (names, country_feature.lower())) - def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from the address. @@ -334,7 +348,8 @@ class _TokenInfo: """ def _get_place(name): with conn.cursor() as cur: - cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text, + cur.execute("""SELECT (addr_ids_from_name(%s) + || getorcreate_name_id(make_standard_name(%s), ''))::text, word_ids_from_name(%s)::text""", (name, name, name)) return cur.fetchone() diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 400ce7a5..664d3c6b 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -8,6 +8,7 @@ import subprocess from pathlib import Path import psutil +import psycopg2.extras from nominatim.db.connection import connect, get_pg_env from nominatim.db import utils as db_utils @@ -250,34 +251,37 @@ def create_search_indices(conn, config, drop=False): sql.run_sql_file(conn, 'indices.sql', drop=drop) -def create_country_names(conn, config): - """ Create search index for default country names. +def create_country_names(conn, tokenizer, languages=None): + """ Add default country names to search index. `languages` is a comma- + separated list of language codes as used in OSM. If `languages` is not + empty then only name translations for the given languages are added + to the index. """ + if languages: + languages = languages.split(',') + + def _include_key(key): + return key == 'name' or \ + (key.startswith('name:') \ + and (not languages or key[5:] in languages)) with conn.cursor() as cur: - cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""") - cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""") - cur.execute("""SELECT COUNT(*) FROM - (SELECT getorcreate_country(make_standard_name(country_code), - country_code) FROM country_name WHERE country_code is not null) AS x""") - cur.execute("""SELECT COUNT(*) FROM - (SELECT getorcreate_country(make_standard_name(name->'name'), country_code) - FROM country_name WHERE name ? 'name') AS x""") - sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v), - country_code) FROM (SELECT country_code, skeys(name) - AS k, svals(name) AS v FROM country_name) x WHERE k""" - - languages = config.LANGUAGES - - if languages: - sql_statement = "{} IN (".format(sql_statement) - delim = '' - for language in languages.split(','): - sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language) - delim = ', ' - sql_statement = '{})'.format(sql_statement) - else: - sql_statement = "{} LIKE 'name:%'".format(sql_statement) - sql_statement = "{}) v".format(sql_statement) - cur.execute(sql_statement) + psycopg2.extras.register_hstore(cur) + cur.execute("""SELECT country_code, name FROM country_name + WHERE country_code is not null""") + + with tokenizer.name_analyzer() as analyzer: + for code, name in cur: + names = [code] + if code == 'gb': + names.append('UK') + if code == 'us': + names.append('United States') + + # country names (only in languages as provided) + if name: + names.extend((v for k, v in name.items() if _include_key(k))) + + analyzer.add_country_names(code, names) + conn.commit() diff --git a/test/python/conftest.py b/test/python/conftest.py index f43f09d0..493620c4 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -121,9 +121,8 @@ def table_factory(temp_db_cursor): def mk_table(name, definition='id INT', content=None): temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition)) if content is not None: - if not isinstance(content, str): - content = '),('.join([str(x) for x in content]) - temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content)) + psycopg2.extras.execute_values( + temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content) return mk_table @@ -290,7 +289,7 @@ def osm2pgsql_options(temp_db): @pytest.fixture def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): - table_factory('country_name', 'partition INT', (0, 1, 2)) + table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, ))) cfg = Configuration(None, SRC_DIR.resolve() / 'settings') cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php', sql=tmp_path, data=SRC_DIR / 'data') @@ -299,9 +298,10 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory): @pytest.fixture -def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn): +def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path): """ Sets up the configuration so that the test dummy tokenizer will be - loaded. + loaded when the tokenizer factory is used. Also returns a factory + with which a new dummy tokenizer may be created. """ monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy') @@ -310,3 +310,8 @@ def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn): monkeypatch.setattr(importlib, "import_module", _import_dummy) properties.set_property(temp_db_conn, 'tokenizer', 'dummy') + + def _create_tokenizer(): + return dummy_tokenizer.DummyTokenizer(None, None) + + return _create_tokenizer diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index ceea4a7e..79197dfb 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -13,6 +13,7 @@ class DummyTokenizer: self.dsn = dsn self.data_dir = data_dir self.init_state = None + self.analyser_cache = {} def init_new_db(self, config): @@ -26,7 +27,7 @@ class DummyTokenizer: def name_analyzer(self): - return DummyNameAnalyzer() + return DummyNameAnalyzer(self.analyser_cache) class DummyNameAnalyzer: @@ -38,18 +39,20 @@ class DummyNameAnalyzer: self.close() + def __init__(self, cache): + self.analyser_cache = cache + cache['countries'] = [] + + def close(self): - """ Free all resources used by the analyzer. - """ pass def add_postcodes_from_db(self): pass - def process_place(self, place): - """ Determine tokenizer information about the given place. - Returns a JSON-serialisable structure that will be handed into - the database via the token_info field. - """ + def add_country_names(self, code, names): + self.analyser_cache['countries'].append((code, names)) + + def process_place(self, place): return {} diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index e31017e8..ceac7a24 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -143,7 +143,8 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory): 'location_property_tiger', 'location_property_osmline', 'location_postcode', 'search_name', 'location_road_23') for table in tables: - table_factory(table, content=(1, 2, 3)) + table_factory(table, content=((1, ), (2, ), (3, ))) + assert temp_db_cursor.table_rows(table) == 3 database_import.truncate_data_tables(temp_db_conn) @@ -168,31 +169,28 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta assert temp_db_cursor.table_rows('placex') == 30 assert temp_db_cursor.table_rows('location_property_osmline') == 1 -@pytest.mark.parametrize("languages", (False, True)) -def test_create_country_names(temp_db_conn, temp_db_cursor, def_config, - temp_db_with_extensions, monkeypatch, languages): - if languages: - monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en') - temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT) - RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL - """) - temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)') - temp_db_cursor.execute('CREATE TABLE word (code varchar(2))') - temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us', - '"name"=>"us","name:af"=>"us"')""") - temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT, - lookup_country_code varchar(2)) - RETURNS INTEGER - AS $$ - BEGIN - INSERT INTO word VALUES (lookup_country_code); - RETURN 5; - END; - $$ - LANGUAGE plpgsql; - """) - database_import.create_country_names(temp_db_conn, def_config) + +@pytest.mark.parametrize("languages", (None, ' fr,en')) +def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor, + table_factory, tokenizer_mock, languages): + + table_factory('country_name', 'country_code varchar(2), name hstore', + content=(('us', '"name"=>"us1","name:af"=>"us2"'), + ('fr', '"name"=>"Fra", "name:en"=>"Fren"'))) + + assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2 + + tokenizer = tokenizer_mock() + + database_import.create_country_names(temp_db_conn, tokenizer, languages) + + assert len(tokenizer.analyser_cache['countries']) == 2 + + result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']} + if languages: - assert temp_db_cursor.table_rows('word') == 4 + assert result_set == {'us' : set(('us', 'us1', 'United States')), + 'fr' : set(('fr', 'Fra', 'Fren'))} else: - assert temp_db_cursor.table_rows('word') == 5 + assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')), + 'fr' : set(('fr', 'Fra', 'Fren'))} diff --git a/test/python/test_tools_refresh_create_functions.py b/test/python/test_tools_refresh_create_functions.py index 53ea2b52..3f9bccbd 100644 --- a/test/python/test_tools_refresh_create_functions.py +++ b/test/python/test_tools_refresh_create_functions.py @@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config): return tmp_path @pytest.fixture -def conn(temp_db_conn, table_factory, monkeypatch): - monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.') - table_factory('country_name', 'partition INT', (0, 1, 2)) +def conn(sql_preprocessor, temp_db_conn): return temp_db_conn