From 296a66558ffb2ee2c43e653b00ff24097ee3525c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 21 Apr 2021 15:00:37 +0200 Subject: [PATCH 1/1] move module installation to legacy tokenizer --- nominatim/clicmd/setup.py | 6 -- nominatim/tokenizer/legacy_tokenizer.py | 63 ++++++++++++++++++- nominatim/tools/database_import.py | 45 -------------- test/python/test_cli.py | 1 - test/python/test_tokenizer_legacy.py | 74 ++++++++++++++++++++--- test/python/test_tools_database_import.py | 33 ---------- 6 files changed, 124 insertions(+), 98 deletions(-) diff --git a/nominatim/clicmd/setup.py b/nominatim/clicmd/setup.py index 68727972..066c2960 100644 --- a/nominatim/clicmd/setup.py +++ b/nominatim/clicmd/setup.py @@ -68,12 +68,6 @@ class SetupAll: args.no_partitions, rouser=args.config.DATABASE_WEBUSER) - LOG.warning('Installing database module') - with connect(args.config.get_libpq_dsn()) as conn: - database_import.install_module(args.module_dir, args.project_dir, - args.config.DATABASE_MODULE_PATH, - conn=conn) - LOG.warning('Importing OSM data file') database_import.import_osm_data(Path(args.osm_file), args.osm2pgsql_options(0, 1), diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index ab3e320e..bd7b7709 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -1,16 +1,61 @@ """ Tokenizer implementing normalisation as used before Nominatim 4. """ +import logging +import shutil + +import psycopg2 + from nominatim.db.connection import connect from nominatim.db import properties +from nominatim.errors import UsageError DBCFG_NORMALIZATION = "tokenizer_normalization" +LOG = logging.getLogger() + def create(dsn, data_dir): """ Create a new instance of the tokenizer provided by this module. """ return LegacyTokenizer(dsn, data_dir) + +def _install_module(src_dir, module_dir): + """ Copies the PostgreSQL normalisation module into the project + directory if necessary. 
For historical reasons the module is + saved in the '/module' subdirectory and not with the other tokenizer + data. + + The function detects when the installation is run from the + build directory. It doesn't touch the module in that case. + """ + if module_dir.exists() and src_dir.samefile(module_dir): + LOG.info('Running from build directory. Leaving database module as is.') + return + + if not module_dir.exists(): + module_dir.mkdir() + + destfile = module_dir / 'nominatim.so' + shutil.copy(str(src_dir / 'nominatim.so'), str(destfile)) + destfile.chmod(0o755) + + LOG.info('Database module installed at %s', str(destfile)) + + +def _check_module(module_dir, conn): + with conn.cursor() as cur: + try: + cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) + RETURNS text AS '{}/nominatim.so', 'transliteration' + LANGUAGE c IMMUTABLE STRICT; + DROP FUNCTION nominatim_test_import_func(text) + """.format(module_dir)) + except psycopg2.DatabaseError as err: + LOG.fatal("Error accessing database module: %s", err) + raise UsageError("Database module cannot be accessed.") from err + + class LegacyTokenizer: """ The legacy tokenizer uses a special PostgreSQL module to normalize names and queries. The tokenizer thus implements normalization through @@ -29,12 +74,24 @@ class LegacyTokenizer: This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ + # Find and optionally install the PostgreSQL normalization module. + if config.DATABASE_MODULE_PATH: + LOG.info("Using custom path for database module at '%s'", + config.DATABASE_MODULE_PATH) + module_dir = config.DATABASE_MODULE_PATH + else: + _install_module(config.lib_dir.module, config.project_dir / 'module') + module_dir = config.project_dir / 'module' + self.normalization = config.TERM_NORMALIZATION - # Stable configuration is saved in the database. 
with connect(self.dsn) as conn: - properties.set_property(conn, DBCFG_NORMALIZATION, - self.normalization) + _check_module(module_dir, conn) + + # Stable configuration is saved in the database. + properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization) + + conn.commit() def init_from_project(self): diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 3618ed28..324a82cf 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -5,11 +5,9 @@ import logging import os import selectors import subprocess -import shutil from pathlib import Path import psutil -import psycopg2 from nominatim.db.connection import connect, get_pg_env from nominatim.db import utils as db_utils @@ -89,49 +87,6 @@ def setup_extensions(conn): raise UsageError('PostGIS version is too old.') -def install_module(src_dir, project_dir, module_dir, conn=None): - """ Copy the normalization module from src_dir into the project - directory under the '/module' directory. If 'module_dir' is set, then - use the module from there instead and check that it is accessible - for Postgresql. - - The function detects when the installation is run from the - build directory. It doesn't touch the module in that case. - - If 'conn' is given, then the function also tests if the module - can be access via the given database. - """ - if not module_dir: - module_dir = project_dir / 'module' - - if not module_dir.exists() or not src_dir.samefile(module_dir): - - if not module_dir.exists(): - module_dir.mkdir() - - destfile = module_dir / 'nominatim.so' - shutil.copy(str(src_dir / 'nominatim.so'), str(destfile)) - destfile.chmod(0o755) - - LOG.info('Database module installed at %s', str(destfile)) - else: - LOG.info('Running from build directory. 
Leaving database module as is.') - else: - LOG.info("Using custom path for database module at '%s'", module_dir) - - if conn is not None: - with conn.cursor() as cur: - try: - cur.execute("""CREATE FUNCTION nominatim_test_import_func(text) - RETURNS text AS '{}/nominatim.so', 'transliteration' - LANGUAGE c IMMUTABLE STRICT; - DROP FUNCTION nominatim_test_import_func(text) - """.format(module_dir)) - except psycopg2.DatabaseError as err: - LOG.fatal("Error accessing database module: %s", err) - raise UsageError("Database module cannot be accessed.") from err - - def import_base_data(dsn, sql_dir, ignore_partitions=False): """ Create and populate the tables with basic static data that provides the background for geocoding. Data is assumed to not yet exist. diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 6b8bfc55..10a31542 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -88,7 +88,6 @@ def test_import_bad_file(temp_db): def test_import_full(temp_db, mock_func_factory): mocks = [ mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'), - mock_func_factory(nominatim.tools.database_import, 'install_module'), mock_func_factory(nominatim.tools.database_import, 'import_osm_data'), mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'), mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'), diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index cb6fb00b..44937904 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -5,24 +5,78 @@ import pytest from nominatim.tokenizer import legacy_tokenizer from nominatim.db import properties +from nominatim.errors import UsageError @pytest.fixture -def tokenizer(dsn, tmp_path, def_config, property_table): - tok = legacy_tokenizer.create(dsn, tmp_path) - tok.init_new_db(def_config) +def test_config(def_config, tmp_path): + def_config.project_dir = tmp_path / 
'project' + def_config.project_dir.mkdir() - return tok + module_dir = tmp_path / 'module_src' + module_dir.mkdir() + (module_dir / 'nominatim.so').write_text('TEST nomiantim.so') -def test_init_new(dsn, tmp_path, def_config, property_table, monkeypatch, temp_db_conn): + def_config.lib_dir.module = module_dir + + return def_config + + +@pytest.fixture +def tokenizer_factory(dsn, tmp_path, monkeypatch): + + def _maker(): + return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer') + + return _maker + +@pytest.fixture +def tokenizer_setup(tokenizer_factory, test_config, property_table, monkeypatch): + monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + tok = tokenizer_factory() + tok.init_new_db(test_config) + + +def test_init_new(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn): monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv') + monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) - tok = legacy_tokenizer.create(dsn, tmp_path) - tok.init_new_db(def_config) + tok = tokenizer_factory() + tok.init_new_db(test_config) assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv' + outfile = test_config.project_dir / 'module' / 'nominatim.so' + + assert outfile.exists() + assert outfile.read_text() == 'TEST nomiantim.so' + assert outfile.stat().st_mode == 33261 + + +def test_init_module_load_failed(tokenizer_factory, test_config, property_table, monkeypatch, temp_db_conn): + tok = tokenizer_factory() + + with pytest.raises(UsageError): + tok.init_new_db(test_config) + + +def test_init_module_custom(tokenizer_factory, test_config, property_table, + monkeypatch, tmp_path): + module_dir = (tmp_path / 'custom').resolve() + module_dir.mkdir() + (module_dir/ 'nominatim.so').write_text('CUSTOM nomiantim.so') + + monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir)) + monkeypatch.setattr(legacy_tokenizer, '_check_module' , lambda m, c: None) + + tok 
= tokenizer_factory() + tok.init_new_db(test_config) + + assert not (test_config.project_dir / 'module').exists() + + +def test_init_from_project(tokenizer_setup, tokenizer_factory): + tok = tokenizer_factory() -def test_init_from_project(tokenizer): - tokenizer.init_from_project() + tok.init_from_project() - assert tokenizer.normalization is not None + assert tok.normalization is not None diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index 280ca704..7ae19e2a 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch): database_import.setup_extensions(temp_db_conn) -def test_install_module(tmp_path): - src_dir = tmp_path / 'source' - src_dir.mkdir() - (src_dir / 'nominatim.so').write_text('TEST nomiantim.so') - - project_dir = tmp_path / 'project' - project_dir.mkdir() - - database_import.install_module(src_dir, project_dir, '') - - outfile = project_dir / 'module' / 'nominatim.so' - - assert outfile.exists() - assert outfile.read_text() == 'TEST nomiantim.so' - assert outfile.stat().st_mode == 33261 - - -def test_install_module_custom(tmp_path): - (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so') - - database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve())) - - assert not (tmp_path / 'module').exists() - - -def test_install_module_fail_access(temp_db_conn, tmp_path): - (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so') - - with pytest.raises(UsageError, match='.*module cannot be accessed.*'): - database_import.install_module(tmp_path, tmp_path, '', - conn=temp_db_conn) - - def test_import_base_data(src_dir, temp_db, temp_db_cursor): temp_db_cursor.execute('CREATE EXTENSION hstore') temp_db_cursor.execute('CREATE EXTENSION postgis') -- 2.39.5