From 32683f73c787464e16f2a146d4c08c4041087dd5 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 24 Feb 2021 17:21:45 +0100 Subject: [PATCH] move import-data option to native python This adds a new dependecy to the Python psutil package. --- .pylintrc | 11 +++++ CMakeLists.txt | 2 +- docs/admin/Installation.md | 3 +- lib-php/Shell.php | 2 +- lib-php/admin/setup.php | 17 ++++++- lib-php/setup/SetupClass.php | 60 ----------------------- nominatim/clicmd/args.py | 11 +++-- nominatim/clicmd/index.py | 12 +---- nominatim/clicmd/transition.py | 18 +++++++ nominatim/db/connection.py | 11 +++++ nominatim/tools/database_import.py | 47 ++++++++++++++++-- nominatim/tools/exec_utils.py | 9 ++++ test/python/conftest.py | 11 ++++- test/python/test_cli.py | 2 + test/python/test_db_connection.py | 17 +++++++ test/python/test_tools_database_import.py | 40 +++++++++++++++ test/python/test_tools_exec_utils.py | 17 +++++-- vagrant/Install-on-Centos-7.sh | 2 +- vagrant/Install-on-Centos-8.sh | 2 +- vagrant/Install-on-Ubuntu-18.sh | 2 +- vagrant/Install-on-Ubuntu-20.sh | 2 +- 21 files changed, 205 insertions(+), 93 deletions(-) create mode 100644 .pylintrc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..da6dbe03 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,11 @@ +[MASTER] + +extension-pkg-whitelist=osmium + +[MESSAGES CONTROL] + +[TYPECHECK] + +# closing added here because it sometimes triggers a false positive with +# 'with' statements. +ignored-classes=NominatimArgs,closing diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f8e4611..2b4c2976 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,7 +177,7 @@ if (BUILD_TESTS) if (PYLINT) message(STATUS "Using pylint binary ${PYLINT}") add_test(NAME pylint - COMMAND ${PYLINT} --extension-pkg-whitelist=osmium nominatim + COMMAND ${PYLINT} nominatim WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) else() message(WARNING "pylint not found. Python linting tests disabled.") diff --git a/docs/admin/Installation.md b/docs/admin/Installation.md index 05e57f93..4360c9a7 100644 --- a/docs/admin/Installation.md +++ b/docs/admin/Installation.md @@ -41,10 +41,11 @@ For running Nominatim: * [Python 3](https://www.python.org/) (3.5+) * [Psycopg2](https://www.psycopg.org) (2.7+) * [Python Dotenv](https://github.com/theskumar/python-dotenv) + * [psutil] (https://github.com/giampaolo/psutil) * [PHP](https://php.net) (7.0 or later) * PHP-pgsql * PHP-intl (bundled with PHP) - ( PHP-cgi (for running queries from the command line) + * PHP-cgi (for running queries from the command line) For running continuous updates: diff --git a/lib-php/Shell.php b/lib-php/Shell.php index 52a7d8fb..b43db135 100644 --- a/lib-php/Shell.php +++ b/lib-php/Shell.php @@ -48,7 +48,7 @@ class Shell return join(' ', $aEscaped); } - public function run($bExitOnFail = False) + public function run($bExitOnFail = false) { $sCmd = $this->escapedCmd(); // $aEnv does not need escaping, proc_open seems to handle it fine diff --git a/lib-php/admin/setup.php b/lib-php/admin/setup.php index 902c24e0..d3831322 100644 --- a/lib-php/admin/setup.php +++ b/lib-php/admin/setup.php @@ -86,12 +86,25 @@ if ($aCMDResult['create-db'] || $aCMDResult['all']) { if ($aCMDResult['setup-db'] || $aCMDResult['all']) { $bDidSomething = true; - (clone($oNominatimCmd))->addParams('transition', '--setup-db')->run(true); + $oCmd = (clone($oNominatimCmd))->addParams('transition', '--setup-db'); + + if ($aCMDResult['no-partitions'] ?? false) { + $oCmd->addParams('--no-partitions'); + } + + $oCmd->run(true); } if ($aCMDResult['import-data'] || $aCMDResult['all']) { $bDidSomething = true; - $oSetup->importData($aCMDResult['osm-file']); + $oCmd = (clone($oNominatimCmd)) + ->addParams('transition', '--import-data') + ->addParams('--osm-file', $aCMDResult['osm-file']); + if ($aCMDResult['drop'] ?? false) { + $oCmd->addParams('--drop'); + } + + $oCmd->run(true); } if ($aCMDResult['create-functions'] || $aCMDResult['all']) { diff --git a/lib-php/setup/SetupClass.php b/lib-php/setup/SetupClass.php index 1e7dccb4..e11ecd13 100755 --- a/lib-php/setup/SetupClass.php +++ b/lib-php/setup/SetupClass.php @@ -84,66 +84,6 @@ class SetupFunctions } } - public function importData($sOSMFile) - { - info('Import data'); - - if (!file_exists(getOsm2pgsqlBinary())) { - echo "Check NOMINATIM_OSM2PGSQL_BINARY in your local .env file.\n"; - echo "Normally you should not need to set this manually.\n"; - fail("osm2pgsql not found in '".getOsm2pgsqlBinary()."'"); - } - - $oCmd = new \Nominatim\Shell(getOsm2pgsqlBinary()); - $oCmd->addParams('--style', getImportStyle()); - - if (getSetting('FLATNODE_FILE')) { - $oCmd->addParams('--flat-nodes', getSetting('FLATNODE_FILE')); - } - if (getSetting('TABLESPACE_OSM_DATA')) { - $oCmd->addParams('--tablespace-slim-data', getSetting('TABLESPACE_OSM_DATA')); - } - if (getSetting('TABLESPACE_OSM_INDEX')) { - $oCmd->addParams('--tablespace-slim-index', getSetting('TABLESPACE_OSM_INDEX')); - } - if (getSetting('TABLESPACE_PLACE_DATA')) { - $oCmd->addParams('--tablespace-main-data', getSetting('TABLESPACE_PLACE_DATA')); - } - if (getSetting('TABLESPACE_PLACE_INDEX')) { - $oCmd->addParams('--tablespace-main-index', getSetting('TABLESPACE_PLACE_INDEX')); - } - $oCmd->addParams('--latlong', '--slim', '--create'); - $oCmd->addParams('--output', 'gazetteer'); - $oCmd->addParams('--hstore'); - $oCmd->addParams('--number-processes', 1); - $oCmd->addParams('--with-forward-dependencies', 'false'); - $oCmd->addParams('--log-progress', 'true'); - $oCmd->addParams('--cache', $this->iCacheMemory); - $oCmd->addParams('--port', $this->aDSNInfo['port']); - - if (isset($this->aDSNInfo['username'])) { - $oCmd->addParams('--username', $this->aDSNInfo['username']); - } - if (isset($this->aDSNInfo['password'])) { - $oCmd->addEnvPair('PGPASSWORD', $this->aDSNInfo['password']); - } - if (isset($this->aDSNInfo['hostspec'])) { - $oCmd->addParams('--host', $this->aDSNInfo['hostspec']); - } - $oCmd->addParams('--database', $this->aDSNInfo['database']); - $oCmd->addParams($sOSMFile); - $oCmd->run(); - - if (!$this->sIgnoreErrors && !$this->db()->getRow('select * from place limit 1')) { - fail('No Data'); - } - - if ($this->bDrop) { - $this->dropTable('planet_osm_nodes'); - $this->removeFlatnodeFile(); - } - } - public function createFunctions() { info('Create Functions'); diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py index f9b94ef2..47007579 100644 --- a/nominatim/clicmd/args.py +++ b/nominatim/clicmd/args.py @@ -3,7 +3,7 @@ Provides custom functions over command-line arguments. """ -class NominatimArgs: +class NominatimArgs: # pylint: disable=too-few-public-methods """ Customized namespace class for the nominatim command line tool to receive the command-line arguments. """ @@ -18,5 +18,10 @@ class NominatimArgs: osm2pgsql_style=self.config.get_import_style_file(), threads=self.threads or default_threads, dsn=self.config.get_libpq_dsn(), - flatnode_file=self.config.FLATNODE_FILE) - + flatnode_file=self.config.FLATNODE_FILE, + tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA, + slim_index=self.config.TABLESPACE_OSM_INDEX, + main_data=self.config.TABLESPACE_PLACE_DATA, + main_index=self.config.TABLESPACE_PLACE_INDEX + ) + ) diff --git a/nominatim/clicmd/index.py b/nominatim/clicmd/index.py index 96a69396..0225c5ed 100644 --- a/nominatim/clicmd/index.py +++ b/nominatim/clicmd/index.py @@ -1,7 +1,7 @@ """ Implementation of the 'index' subcommand. """ -import os +import psutil from ..db import status from ..db.connection import connect @@ -11,14 +11,6 @@ from ..db.connection import connect # Using non-top-level imports to avoid eventually unused imports. # pylint: disable=E0012,C0415 -def _num_system_cpus(): - try: - cpus = len(os.sched_getaffinity(0)) - except NotImplementedError: - cpus = None - - return cpus or os.cpu_count() - class UpdateIndex: """\ @@ -42,7 +34,7 @@ class UpdateIndex: from ..indexer.indexer import Indexer indexer = Indexer(args.config.get_libpq_dsn(), - args.threads or _num_system_cpus() or 1) + args.threads or psutil.cpu_count() or 1) if not args.no_boundaries: indexer.index_boundaries(args.minrank, args.maxrank) diff --git a/nominatim/clicmd/transition.py b/nominatim/clicmd/transition.py index 2f351be2..eb4e2d2f 100644 --- a/nominatim/clicmd/transition.py +++ b/nominatim/clicmd/transition.py @@ -6,8 +6,10 @@ through the PHP scripts but are now no longer directly accessible. This module will be removed as soon as the transition phase is over. """ import logging +from pathlib import Path from ..db.connection import connect +from ..errors import UsageError # Do not repeat documentation of subcommand classes. # pylint: disable=C0111 @@ -28,9 +30,17 @@ class AdminTransition: help='Create nominatim db') group.add_argument('--setup-db', action='store_true', help='Build a blank nominatim db') + group.add_argument('--import-data', action='store_true', + help='Import a osm file') group = parser.add_argument_group('Options') group.add_argument('--no-partitions', action='store_true', help='Do not partition search indices') + group.add_argument('--osm-file', metavar='FILE', + help='File to import') + group.add_argument('--drop', action='store_true', + help='Drop tables needed for updates, making the database readonly') + group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int, + help='Size of cache to be used by osm2pgsql (in MB)') @staticmethod def run(args): @@ -51,3 +61,11 @@ class AdminTransition: database_import.import_base_data(args.config.get_libpq_dsn(), args.data_dir, args.no_partitions) + + if args.import_data: + LOG.warning('Import data') + if not args.osm_file: + raise UsageError('Missing required --osm-file argument') + database_import.import_osm_data(Path(args.osm_file), + args.osm2pgsql_options(0, 1), + drop=args.drop) diff --git a/nominatim/db/connection.py b/nominatim/db/connection.py index 12f5f4e1..5aa05ced 100644 --- a/nominatim/db/connection.py +++ b/nominatim/db/connection.py @@ -75,6 +75,17 @@ class _Connection(psycopg2.extensions.connection): return True + def drop_table(self, name, if_exists=True): + """ Drop the table with the given name. + Set `if_exists` to False if a non-existant table should raise + an exception instead of just being ignored. + """ + with self.cursor() as cur: + cur.execute("""DROP TABLE {} "{}" + """.format('IF EXISTS' if if_exists else '', name)) + self.commit() + + def server_version_tuple(self): """ Return the server version as a tuple of (major, minor). Converts correctly for pre-10 and post-10 PostgreSQL versions. diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 89e01f66..00ec95c0 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -2,11 +2,16 @@ Functions for setting up and importing a new Nominatim database. """ import logging +import os import subprocess import shutil +from pathlib import Path + +import psutil from ..db.connection import connect, get_pg_env from ..db import utils as db_utils +from .exec_utils import run_osm2pgsql from ..errors import UsageError from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION @@ -28,7 +33,7 @@ def create_db(dsn, rouser=None): raise UsageError('Creating new database failed.') with connect(dsn) as conn: - postgres_version = conn.server_version_tuple() # pylint: disable=E1101 + postgres_version = conn.server_version_tuple() if postgres_version < POSTGRESQL_REQUIRED_VERSION: LOG.fatal('Minimum supported version of Postgresql is %d.%d. ' 'Found version %d.%d.', @@ -37,7 +42,7 @@ def create_db(dsn, rouser=None): raise UsageError('PostgreSQL server is too old.') if rouser is not None: - with conn.cursor() as cur: # pylint: disable=E1101 + with conn.cursor() as cur: cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s', (rouser, )) if cnt == 0: @@ -109,13 +114,45 @@ def check_module_dir_path(conn, path): def import_base_data(dsn, sql_dir, ignore_partitions=False): """ Create and populate the tables with basic static data that provides - the background for geocoding. + the background for geocoding. Data is assumed to not yet exist. """ db_utils.execute_file(dsn, sql_dir / 'country_name.sql') db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz') if ignore_partitions: with connect(dsn) as conn: - with conn.cursor() as cur: # pylint: disable=E1101 + with conn.cursor() as cur: cur.execute('UPDATE country_name SET partition = 0') - conn.commit() # pylint: disable=E1101 + conn.commit() + + +def import_osm_data(osm_file, options, drop=False): + """ Import the given OSM file. 'options' contains the list of + default settings for osm2pgsql. + """ + options['import_file'] = osm_file + options['append'] = False + options['threads'] = 1 + + if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0: + # Make some educated guesses about cache size based on the size + # of the import file and the available memory. + mem = psutil.virtual_memory() + fsize = os.stat(str(osm_file)).st_size + options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75, + fsize * 2) / 1024 / 1024) + 1 + + run_osm2pgsql(options) + + with connect(options['dsn']) as conn: + with conn.cursor() as cur: + cur.execute('SELECT * FROM place LIMIT 1') + if cur.rowcount == 0: + raise UsageError('No data imported by osm2pgsql.') + + if drop: + conn.drop_table('planet_osm_nodes') + + if drop: + if options['flatnode_file']: + Path(options['flatnode_file']).unlink() diff --git a/nominatim/tools/exec_utils.py b/nominatim/tools/exec_utils.py index b2ccc4a2..b476c123 100644 --- a/nominatim/tools/exec_utils.py +++ b/nominatim/tools/exec_utils.py @@ -110,10 +110,19 @@ def run_osm2pgsql(options): ] if options['append']: cmd.append('--append') + else: + cmd.append('--create') if options['flatnode_file']: cmd.extend(('--flat-nodes', options['flatnode_file'])) + for key, param in (('slim_data', '--tablespace-slim-data'), + ('slim_index', '--tablespace-slim-index'), + ('main_data', '--tablespace-main-data'), + ('main_index', '--tablespace-main-index')): + if options['tablespaces'][key]: + cmd.extend((param, options['tablespaces'][key])) + if options.get('disable_jit', False): env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0' diff --git a/test/python/conftest.py b/test/python/conftest.py index 6d4d3ac9..f0569ab8 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -198,4 +198,13 @@ def placex_table(temp_db_with_extensions, temp_db_conn): temp_db_conn.commit() - +@pytest.fixture +def osm2pgsql_options(temp_db): + return dict(osm2pgsql='echo', + osm2pgsql_cache=10, + osm2pgsql_style='style.file', + threads=1, + dsn='dbname=' + temp_db, + flatnode_file='', + tablespaces=dict(slim_data='', slim_index='', + main_data='', main_index='')) diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 2b4240b4..9170406d 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -40,6 +40,7 @@ def mock_run_legacy(monkeypatch): monkeypatch.setattr(nominatim.cli, 'run_legacy_script', mock) return mock + @pytest.fixture def mock_func_factory(monkeypatch): def get_mock(module, func): @@ -49,6 +50,7 @@ def mock_func_factory(monkeypatch): return get_mock + def test_cli_help(capsys): """ Running nominatim tool without arguments prints help. """ diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index dcbfb8bf..635465f5 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -2,6 +2,7 @@ Tests for specialised conenction and cursor classes. """ import pytest +import psycopg2 from nominatim.db.connection import connect, get_pg_env @@ -30,6 +31,22 @@ def test_connection_index_exists(db, temp_db_cursor): assert db.index_exists('some_index', table='bar') == False +def test_drop_table_existing(db, temp_db_cursor): + temp_db_cursor.execute('CREATE TABLE dummy (id INT)') + + assert db.table_exists('dummy') + db.drop_table('dummy') + assert not db.table_exists('dummy') + + +def test_drop_table_non_existsing(db): + db.drop_table('dfkjgjriogjigjgjrdghehtre') + + +def test_drop_table_non_existing_force(db): + with pytest.raises(psycopg2.ProgrammingError, match='.*does not exist.*'): + db.drop_table('dfkjgjriogjigjgjrdghehtre', if_exists=False) + def test_connection_server_version_tuple(db): ver = db.server_version_tuple() diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py index a4ab16f9..597fdfc1 100644 --- a/test/python/test_tools_database_import.py +++ b/test/python/test_tools_database_import.py @@ -4,6 +4,7 @@ Tests for functions to import a new database. import pytest import psycopg2 import sys +from pathlib import Path from nominatim.tools import database_import from nominatim.errors import UsageError @@ -94,3 +95,42 @@ def test_import_base_data_ignore_partitions(src_dir, temp_db, temp_db_cursor): assert temp_db_cursor.scalar('SELECT count(*) FROM country_name') > 0 assert temp_db_cursor.scalar('SELECT count(*) FROM country_name WHERE partition != 0') == 0 + + +def test_import_osm_data_simple(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + database_import.import_osm_data('file.pdf', osm2pgsql_options) + + +def test_import_osm_data_simple_no_data(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + + with pytest.raises(UsageError, match='No data.*'): + database_import.import_osm_data('file.pdf', osm2pgsql_options) + + +def test_import_osm_data_drop(temp_db_conn, temp_db_cursor, tmp_path, osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('CREATE TABLE planet_osm_nodes (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + flatfile = tmp_path / 'flatfile' + flatfile.write_text('touch') + + osm2pgsql_options['flatnode_file'] = str(flatfile.resolve()) + + database_import.import_osm_data('file.pdf', osm2pgsql_options, drop=True) + + assert not flatfile.exists() + assert not temp_db_conn.table_exists('planet_osm_nodes') + + +def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options): + temp_db_cursor.execute('CREATE TABLE place (id INT)') + temp_db_cursor.execute('INSERT INTO place values (1)') + + osm2pgsql_options['osm2pgsql_cache'] = 0 + + database_import.import_osm_data(Path(__file__), osm2pgsql_options) diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index 283f486a..8f60ac74 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -105,8 +105,15 @@ def test_run_api_with_extra_env(tmp_project_dir): ### run_osm2pgsql -def test_run_osm2pgsql(): - exec_utils.run_osm2pgsql(dict(osm2pgsql='echo', append=False, flatnode_file=None, - dsn='dbname=foobar', threads=1, osm2pgsql_cache=500, - osm2pgsql_style='./my.style', - import_file='foo.bar')) +def test_run_osm2pgsql(osm2pgsql_options): + osm2pgsql_options['append'] = False + osm2pgsql_options['import_file'] = 'foo.bar' + osm2pgsql_options['tablespaces']['osm_data'] = 'extra' + exec_utils.run_osm2pgsql(osm2pgsql_options) + + +def test_run_osm2pgsql_disable_jit(osm2pgsql_options): + osm2pgsql_options['append'] = True + osm2pgsql_options['import_file'] = 'foo.bar' + osm2pgsql_options['disable_jit'] = True + exec_utils.run_osm2pgsql(osm2pgsql_options) diff --git a/vagrant/Install-on-Centos-7.sh b/vagrant/Install-on-Centos-7.sh index eb16f873..e0ab7470 100755 --- a/vagrant/Install-on-Centos-7.sh +++ b/vagrant/Install-on-Centos-7.sh @@ -42,7 +42,7 @@ python3-pip python3-setuptools python3-devel \ expat-devel zlib-devel - pip3 install --user psycopg2 python-dotenv + pip3 install --user psycopg2 python-dotenv psutil # diff --git a/vagrant/Install-on-Centos-8.sh b/vagrant/Install-on-Centos-8.sh index 1cf93a1f..0018a452 100755 --- a/vagrant/Install-on-Centos-8.sh +++ b/vagrant/Install-on-Centos-8.sh @@ -35,7 +35,7 @@ python3-pip python3-setuptools python3-devel \ expat-devel zlib-devel - pip3 install --user psycopg2 python-dotenv + pip3 install --user psycopg2 python-dotenv psutil # diff --git a/vagrant/Install-on-Ubuntu-18.sh b/vagrant/Install-on-Ubuntu-18.sh index 527ded09..d8231908 100755 --- a/vagrant/Install-on-Ubuntu-18.sh +++ b/vagrant/Install-on-Ubuntu-18.sh @@ -30,7 +30,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: postgresql-server-dev-10 postgresql-10-postgis-2.4 \ postgresql-contrib-10 postgresql-10-postgis-scripts \ php php-pgsql php-intl python3-pip \ - python3-psycopg2 git + python3-psycopg2 python3-psutil git # The python-dotenv package that comes with Ubuntu 18.04 is too old, so # install the latest version from pip: diff --git a/vagrant/Install-on-Ubuntu-20.sh b/vagrant/Install-on-Ubuntu-20.sh index bf8120c4..6f03bc72 100755 --- a/vagrant/Install-on-Ubuntu-20.sh +++ b/vagrant/Install-on-Ubuntu-20.sh @@ -33,7 +33,7 @@ export DEBIAN_FRONTEND=noninteractive #DOCS: postgresql-server-dev-12 postgresql-12-postgis-3 \ postgresql-contrib-12 postgresql-12-postgis-3-scripts \ php php-pgsql php-intl python3-dotenv \ - python3-psycopg2 git + python3-psycopg2 python3-psutil git # # System Configuration -- 2.39.5