From 388ebcbae2b895279727edafebfb8ed794f0c4cc Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 30 Apr 2021 17:28:34 +0200
Subject: [PATCH] move index creation for word table to tokenizer

This introduces a finalization routine for the tokenizer where it can
post-process the import if necessary.
---
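
The contract of the new hook is small: whatever object the tokenizer
factory returns only has to expose a finalize_import(config) method,
which the import code calls once after indexing has finished. A minimal
sketch of a conforming tokenizer, modelled on the DummyTokenizer
fixtures below (SketchTokenizer and the dsn value are made up for
illustration; only the method signature is fixed by this patch):

    # Sketch only: the import step needs just this one method.
    class SketchTokenizer:
        def __init__(self, dsn):
            self.dsn = dsn
            self.finalized = False

        def finalize_import(self, config):
            # Called once at the end of 'nominatim import', so
            # search-time indices (e.g. idx_word_word_id) can be
            # created here instead of during the bulk load.
            self.finalized = True

    tok = SketchTokenizer('dbname=nominatim')
    tok.finalize_import(config=None)  # SetupAll passes args.config here
    assert tok.finalized

The legacy tokenizer implements the hook by running the new
legacy_tokenizer_indices.sql file through the SQL preprocessor, which
recreates the idx_word_word_id index that previously lived in
lib-sql/indices.sql.
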
""" diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index d3f006de..6352a644 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -26,6 +26,10 @@ class DummyTokenizer: self.init_state = "loaded" + def finalize_import(self, _): + pass + + def name_analyzer(self): return DummyNameAnalyzer(self.analyser_cache) diff --git a/test/python/test_cli.py b/test/python/test_cli.py index e0d4fb86..a2869956 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -62,13 +62,19 @@ def tokenizer_mock(monkeypatch): class DummyTokenizer: def __init__(self, *args, **kwargs): self.update_sql_functions_called = False + self.finalize_import_called = False def update_sql_functions(self, *args): self.update_sql_functions_called = True + def finalize_import(self, *args): + self.finalize_import_called = True + tok = DummyTokenizer() monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db' , lambda *args: tok) + monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer' , + lambda *args: tok) return tok @@ -101,7 +107,7 @@ def test_import_bad_file(temp_db): assert 1 == call_nominatim('import', '--osm-file', '.') -def test_import_full(temp_db, mock_func_factory): +def test_import_full(temp_db, mock_func_factory, tokenizer_mock): mocks = [ mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'), mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), @@ -113,7 +119,6 @@ def test_import_full(temp_db, mock_func_factory): mock_func_factory(nominatim.tools.database_import, 'create_partition_tables'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), - mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'), mock_func_factory(nominatim.tools.refresh, 'load_address_levels_from_file'), mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), @@ -124,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory): cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions') assert 0 == call_nominatim('import', '--osm-file', __file__) + assert tokenizer_mock.finalize_import_called assert cf_mock.called > 1 @@ -131,13 +137,12 @@ def test_import_full(temp_db, mock_func_factory): assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) -def test_import_continue_load_data(temp_db, mock_func_factory): +def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock): mocks = [ mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), mock_func_factory(nominatim.tools.database_import, 'load_data'), mock_func_factory(nominatim.tools.database_import, 'create_search_indices'), mock_func_factory(nominatim.tools.database_import, 'create_country_names'), - mock_func_factory(nominatim.tokenizer.factory, 'create_tokenizer'), mock_func_factory(nominatim.tools.postcodes, 'import_postcodes'), mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'), mock_func_factory(nominatim.tools.refresh, 'setup_website'), @@ -145,17 +150,18 @@ def test_import_continue_load_data(temp_db, mock_func_factory): ] assert 0 == call_nominatim('import', '--continue', 'load-data') + assert tokenizer_mock.finalize_import_called for mock in mocks: assert mock.called == 1, "Mock '{}' not called".format(mock.func_name) -def test_import_continue_indexing(temp_db, mock_func_factory, 
@@ -145,17 +150,18 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'load-data')
+    assert tokenizer_mock.finalize_import_called
 
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
+def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
+                                  temp_db_conn, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_full'),
-        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
@@ -172,17 +178,18 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
     assert temp_db_conn.index_exists('idx_placex_pendingsector')
 
 
-def test_import_continue_postprocess(temp_db, mock_func_factory):
+def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
         mock_func_factory(nominatim.tools.refresh, 'setup_website'),
-        mock_func_factory(nominatim.tokenizer.factory, 'get_tokenizer_for_db'),
         mock_func_factory(nominatim.db.properties, 'set_property')
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
 
+    assert tokenizer_mock.finalize_import_called
+
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
-- 
2.39.5