From: Sarah Hoffmann Date: Sat, 23 Jan 2021 16:25:14 +0000 (+0100) Subject: port address level computation to Python X-Git-Tag: v3.7.0~46^2~10 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/94fa7162be678dc74c1a5516e22916122ba66bbb port address level computation to Python Also adds simple tests for correct table creation. --- diff --git a/lib/admin/update.php b/lib/admin/update.php index 48609c3e..972a6fe5 100644 --- a/lib/admin/update.php +++ b/lib/admin/update.php @@ -4,7 +4,6 @@ require_once(CONST_LibDir.'/init-cmd.php'); require_once(CONST_LibDir.'/setup_functions.php'); require_once(CONST_LibDir.'/setup/SetupClass.php'); -require_once(CONST_LibDir.'/setup/AddressLevelParser.php'); ini_set('memory_limit', '800M'); @@ -275,10 +274,7 @@ if ($aResult['index']) { } if ($aResult['update-address-levels']) { - $sAddressLevelConfig = getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json'); - echo 'Updating address levels from '.$sAddressLevelConfig.".\n"; - $oAlParser = new \Nominatim\Setup\AddressLevelParser($sAddressLevelConfig); - $oAlParser->createTable($oDB, 'address_levels'); + (clone($oNominatimCmd))->addParams('refresh', '--address-levels')->run(); } if ($aResult['recompute-importance']) { diff --git a/lib/setup/AddressLevelParser.php b/lib/setup/AddressLevelParser.php deleted file mode 100644 index a399c955..00000000 --- a/lib/setup/AddressLevelParser.php +++ /dev/null @@ -1,98 +0,0 @@ -aLevels = json_decode($sJson, true); - if (!$this->aLevels) { - switch (json_last_error()) { - case JSON_ERROR_NONE: - break; - case JSON_ERROR_DEPTH: - fail('JSON error - Maximum stack depth exceeded'); - break; - case JSON_ERROR_STATE_MISMATCH: - fail('JSON error - Underflow or the modes mismatch'); - break; - case JSON_ERROR_CTRL_CHAR: - fail('JSON error - Unexpected control character found'); - break; - case JSON_ERROR_SYNTAX: - fail('JSON error - Syntax error, malformed JSON'); - break; - case JSON_ERROR_UTF8: - fail('JSON error - Malformed UTF-8 characters, possibly incorrectly encoded'); - break; - default: - fail('JSON error - Unknown error'); - break; - } - } - } - - /** - * Dump the description into a database table. - * - * @param object $oDB Database conneciton to use. - * @param string $sTable Name of table to create. - * - * @return null - * - * A new table is created. Any previously existing table is dropped. - * The table has the following columns: - * country, class, type, rank_search, rank_address. - */ - public function createTable($oDB, $sTable) - { - $oDB->exec('DROP TABLE IF EXISTS '.$sTable); - $sSql = 'CREATE TABLE '.$sTable; - $sSql .= '(country_code varchar(2), class TEXT, type TEXT,'; - $sSql .= ' rank_search SMALLINT, rank_address SMALLINT)'; - $oDB->exec($sSql); - - $sSql = 'CREATE UNIQUE INDEX ON '.$sTable.' (country_code, class, type)'; - $oDB->exec($sSql); - - $sSql = 'INSERT INTO '.$sTable.' VALUES '; - foreach ($this->aLevels as $aLevel) { - $aCountries = array(); - if (isset($aLevel['countries'])) { - foreach ($aLevel['countries'] as $sCountry) { - $aCountries[$sCountry] = $oDB->getDBQuoted($sCountry); - } - } else { - $aCountries['NULL'] = 'NULL'; - } - foreach ($aLevel['tags'] as $sKey => $aValues) { - foreach ($aValues as $sValue => $mRanks) { - $aFields = array( - $oDB->getDBQuoted($sKey), - $sValue ? $oDB->getDBQuoted($sValue) : 'NULL' - ); - if (is_array($mRanks)) { - $aFields[] = (string) $mRanks[0]; - $aFields[] = (string) $mRanks[1]; - } else { - $aFields[] = (string) $mRanks; - $aFields[] = (string) $mRanks; - } - $sLine = ','.join(',', $aFields).'),'; - - foreach ($aCountries as $sCountries) { - $sSql .= '('.$sCountries.$sLine; - } - } - } - } - $oDB->exec(rtrim($sSql, ',')); - } -} diff --git a/lib/setup/SetupClass.php b/lib/setup/SetupClass.php index d17fdca7..635949b9 100755 --- a/lib/setup/SetupClass.php +++ b/lib/setup/SetupClass.php @@ -2,7 +2,6 @@ namespace Nominatim\Setup; -require_once(CONST_LibDir.'/setup/AddressLevelParser.php'); require_once(CONST_LibDir.'/Shell.php'); class SetupFunctions @@ -19,6 +18,7 @@ class SetupFunctions protected $bNoPartitions; protected $bDrop; protected $oDB = null; + protected $oNominatimCmd; public function __construct(array $aCMDResult) { @@ -81,6 +81,14 @@ class SetupFunctions } $this->bDrop = isset($aCMDResult['drop']) && $aCMDResult['drop']; + + $this->oNominatimCmd = new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')); + if ($this->bQuiet) { + $this->oNominatimCmd->addParams('--quiet'); + } + if ($this->bVerbose) { + $this->oNominatimCmd->addParams('--verbose'); + } } public function createDB() @@ -256,8 +264,7 @@ class SetupFunctions $this->dropTable('search_name'); } - $oAlParser = new AddressLevelParser(getSettingConfig('ADDRESS_LEVEL_CONFIG', 'address-levels.json')); - $oAlParser->createTable($this->db(), 'address_levels'); + (clone($this->oNominatimCmd))->addParams('refresh', '--address-levels')->run(); } public function createTableTriggers() @@ -549,19 +556,10 @@ class SetupFunctions { $this->checkModulePresence(); // raises exception on failure - $oBaseCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL'))) - ->addParams('index'); - - if ($this->bQuiet) { - $oBaseCmd->addParams('-q'); - } - if ($this->bVerbose) { - $oBaseCmd->addParams('-v'); - } + $oBaseCmd = (clone $this->oNominatimCmd)->addParams('index'); info('Index ranks 0 - 4'); $oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4); - echo $oCmd->escapedCmd(); $iStatus = $oCmd->run(); if ($iStatus != 0) { diff --git a/nominatim/cli.py b/nominatim/cli.py index 4388902d..00eb905e 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -372,33 +372,38 @@ class UpdateRefresh: def run(args): import nominatim.tools.refresh - with psycopg2.connect(args.config.get_libpq_dsn()) as conn: - if args.postcodes: - LOG.warning("Update postcodes centroid") - nominatim.tools.refresh.update_postcodes(conn, args.data_dir) - if args.word_counts: - LOG.warning('Recompute frequency of full-word search terms') - nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) - if args.address_levels: - run_legacy_script('update.php', '--update-address-levels', - nominatim_env=args, throw_on_fail=True) - if args.functions: - params = ['setup.php', '--create-functions', '--create-partition-functions'] - if args.diffs: - params.append('--enable-diff-updates') - if args.enable_debug_statements: - params.append('--enable-debug-statements') - run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) - if args.wiki_data: - run_legacy_script('setup.php', '--import-wikipedia-articles', - nominatim_env=args, throw_on_fail=True) - # Attention: importance MUST come after wiki data import. - if args.importance: - run_legacy_script('update.php', '--recompute-importance', - nominatim_env=args, throw_on_fail=True) - if args.website: - run_legacy_script('setup.php', '--setup-website', - nominatim_env=args, throw_on_fail=True) + conn = psycopg2.connect(args.config.get_libpq_dsn()) + + if args.postcodes: + LOG.warning("Update postcodes centroid") + nominatim.tools.refresh.update_postcodes(conn, args.data_dir) + if args.word_counts: + LOG.warning('Recompute frequency of full-word search terms') + nominatim.tools.refresh.recompute_word_counts(conn, args.data_dir) + if args.address_levels: + cfg = Path(args.config.ADDRESS_LEVEL_CONFIG) + LOG.warning('Updating address levels from %s', cfg) + nominatim.tools.refresh.load_address_levels_from_file(conn, cfg) + if args.functions: + params = ['setup.php', '--create-functions', '--create-partition-functions'] + if args.diffs: + params.append('--enable-diff-updates') + if args.enable_debug_statements: + params.append('--enable-debug-statements') + run_legacy_script(*params, nominatim_env=args, throw_on_fail=True) + if args.wiki_data: + run_legacy_script('setup.php', '--import-wikipedia-articles', + nominatim_env=args, throw_on_fail=True) + # Attention: importance MUST come after wiki data import. + if args.importance: + run_legacy_script('update.php', '--recompute-importance', + nominatim_env=args, throw_on_fail=True) + if args.website: + run_legacy_script('setup.php', '--setup-website', + nominatim_env=args, throw_on_fail=True) + + conn.close() + return 0 diff --git a/nominatim/config.py b/nominatim/config.py index 458c828f..d4ba0d7a 100644 --- a/nominatim/config.py +++ b/nominatim/config.py @@ -24,6 +24,13 @@ class Configuration: if project_dir is not None: self._config.update(dotenv_values(str((project_dir / '.env').resolve()))) + # Add defaults for variables that are left empty to set the default. + # They may still be overwritten by environment variables. + if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']: + self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \ + str(config_dir / 'address-levels.json') + + def __getattr__(self, name): name = 'NOMINATIM_' + name diff --git a/nominatim/db/utils.py b/nominatim/db/utils.py index 1a39746e..abd72519 100644 --- a/nominatim/db/utils.py +++ b/nominatim/db/utils.py @@ -9,3 +9,4 @@ def execute_file(conn, fname): sql = fdesc.read() with conn.cursor() as cur: cur.execute(sql) + conn.commit() diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py index 859b5646..885caca5 100644 --- a/nominatim/tools/refresh.py +++ b/nominatim/tools/refresh.py @@ -1,6 +1,10 @@ """ Functions for bringing auxiliary data in the database up-to-date. """ +import json + +from psycopg2.extras import execute_values + from ..db.utils import execute_file def update_postcodes(conn, datadir): @@ -14,3 +18,54 @@ def recompute_word_counts(conn, datadir): """ Compute the frequency of full-word search terms. """ execute_file(conn, datadir / 'sql' / 'words_from_search_name.sql') + + +def _add_address_level_rows_from_entry(rows, entry): + """ Converts a single entry from the JSON format for address rank + descriptions into a flat format suitable for inserting into a + PostgreSQL table and adds these lines to `rows`. + """ + countries = entry.get('countries') or (None, ) + for key, values in entry['tags'].items(): + for value, ranks in values.items(): + if isinstance(ranks, list): + rank_search, rank_address = ranks + else: + rank_search = rank_address = ranks + if not value: + value = None + for country in countries: + rows.append((country, key, value, rank_search, rank_address)) + +def load_address_levels(conn, table, levels): + """ Replace the `address_levels` table with the contents of `levels'. + + A new table is created any previously existing table is dropped. + The table has the following columns: + country, class, type, rank_search, rank_address + """ + rows = [] + for entry in levels: + _add_address_level_rows_from_entry(rows, entry) + + with conn.cursor() as cur: + cur.execute('DROP TABLE IF EXISTS {}'.format(table)) + + cur.execute("""CREATE TABLE {} (country_code varchar(2), + class TEXT, + type TEXT, + rank_search SMALLINT, + rank_address SMALLINT)""".format(table)) + + execute_values(cur, "INSERT INTO {} VALUES %s".format(table), rows) + + cur.execute('CREATE UNIQUE INDEX ON {} (country_code, class, type)'.format(table)) + + conn.commit() + +def load_address_levels_from_file(conn, config_file): + """ Replace the `address_levels` table with the contents of the config + file. + """ + with config_file.open('r') as fdesc: + load_address_levels(conn, 'address_levels', json.load(fdesc)) diff --git a/test/python/conftest.py b/test/python/conftest.py index 1cc9ef9c..d92df5c5 100644 --- a/test/python/conftest.py +++ b/test/python/conftest.py @@ -2,13 +2,43 @@ import sys from pathlib import Path import psycopg2 +import psycopg2.extras import pytest +SRC_DIR = Path(__file__) / '..' / '..' / '..' + # always test against the source -sys.path.insert(0, str((Path(__file__) / '..' / '..' / '..').resolve())) +sys.path.insert(0, str(SRC_DIR.resolve())) + +from nominatim.config import Configuration + +class _TestingCursor(psycopg2.extras.DictCursor): + """ Extension to the DictCursor class that provides execution + short-cuts that simplify writing assertions. + """ + + def scalar(self, sql, params=None): + """ Execute a query with a single return value and return this value. + Raises an assertion when not exactly one row is returned. + """ + self.execute(sql, params) + assert self.rowcount == 1 + return self.fetchone()[0] + + def row_set(self, sql, params=None): + """ Execute a query and return the result as a set of tuples. + """ + self.execute(sql, params) + if self.rowcount == 1: + return set(tuple(self.fetchone())) + + return set((tuple(row) for row in self)) @pytest.fixture def temp_db(monkeypatch): + """ Create an empty database for the test. The database name is also + exported into NOMINATIM_DATABASE_DSN. + """ name = 'test_nominatim_python_unittest' with psycopg2.connect(database='postgres') as conn: conn.set_isolation_level(0) @@ -24,3 +54,29 @@ def temp_db(monkeypatch): conn.set_isolation_level(0) with conn.cursor() as cur: cur.execute('DROP DATABASE IF EXISTS {}'.format(name)) + + +@pytest.fixture +def temp_db_conn(temp_db): + """ Connection to the test database. + """ + conn = psycopg2.connect(database=temp_db) + yield conn + conn.close() + + +@pytest.fixture +def temp_db_cursor(temp_db): + """ Connection and cursor towards the test database. The connection will + be in auto-commit mode. + """ + conn = psycopg2.connect('dbname=' + temp_db) + conn.set_isolation_level(0) + with conn.cursor(cursor_factory=_TestingCursor) as cur: + yield cur + conn.close() + + +@pytest.fixture +def def_config(): + return Configuration(None, SRC_DIR.resolve() / 'settings') diff --git a/test/python/test_cli.py b/test/python/test_cli.py index 33c65ade..bd192260 100644 --- a/test/python/test_cli.py +++ b/test/python/test_cli.py @@ -84,10 +84,8 @@ def test_add_data_command(mock_run_legacy, name, oid): (['--boundaries-only'], 1, 0), (['--no-boundaries'], 0, 1), (['--boundaries-only', '--no-boundaries'], 0, 0)]) -def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): - with psycopg2.connect(database=temp_db) as conn: - with conn.cursor() as cur: - cur.execute("CREATE TABLE import_status (indexed bool)") +def test_index_command(monkeypatch, temp_db_cursor, params, do_bnds, do_ranks): + temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)") bnd_mock = MockParamCapture() monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', bnd_mock) rank_mock = MockParamCapture() @@ -100,7 +98,6 @@ def test_index_command(monkeypatch, temp_db, params, do_bnds, do_ranks): @pytest.mark.parametrize("command,params", [ - ('address-levels', ('update.php', '--update-address-levels')), ('functions', ('setup.php',)), ('wiki-data', ('setup.php', '--import-wikipedia-articles')), ('importance', ('update.php', '--recompute-importance')), @@ -116,6 +113,7 @@ def test_refresh_legacy_command(mock_run_legacy, command, params): @pytest.mark.parametrize("command,func", [ ('postcodes', 'update_postcodes'), ('word-counts', 'recompute_word_counts'), + ('address-levels', 'load_address_levels_from_file'), ]) def test_refresh_command(monkeypatch, command, func): func_mock = MockParamCapture() diff --git a/test/python/test_db_utils.py b/test/python/test_db_utils.py index 3210721e..e756f2c4 100644 --- a/test/python/test_db_utils.py +++ b/test/python/test_db_utils.py @@ -6,28 +6,25 @@ import pytest import nominatim.db.utils as db_utils -def test_execute_file_success(temp_db, tmp_path): +def test_execute_file_success(temp_db_conn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);') - with psycopg2.connect('dbname=' + temp_db) as conn: - db_utils.execute_file(conn, tmpfile) + db_utils.execute_file(temp_db_conn, tmpfile) - with conn.cursor() as cur: - cur.execute('SELECT * FROM test') + with temp_db_conn.cursor() as cur: + cur.execute('SELECT * FROM test') - assert cur.rowcount == 1 - assert cur.fetchone()[0] == 56 + assert cur.rowcount == 1 + assert cur.fetchone()[0] == 56 -def test_execute_file_bad_file(temp_db, tmp_path): - with psycopg2.connect('dbname=' + temp_db) as conn: - with pytest.raises(FileNotFoundError): - db_utils.execute_file(conn, tmp_path / 'test2.sql') +def test_execute_file_bad_file(temp_db_conn, tmp_path): + with pytest.raises(FileNotFoundError): + db_utils.execute_file(temp_db_conn, tmp_path / 'test2.sql') -def test_execute_file_bad_sql(temp_db, tmp_path): +def test_execute_file_bad_sql(temp_db_conn, tmp_path): tmpfile = tmp_path / 'test.sql' tmpfile.write_text('CREATE STABLE test (id INT)') - with psycopg2.connect('dbname=' + temp_db) as conn: - with pytest.raises(psycopg2.ProgrammingError): - db_utils.execute_file(conn, tmpfile) + with pytest.raises(psycopg2.ProgrammingError): + db_utils.execute_file(temp_db_conn, tmpfile) diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 2fe21e80..6b52a65e 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -82,10 +82,8 @@ class IndexerTestDB: @pytest.fixture -def test_db(temp_db): - conn = psycopg2.connect(database=temp_db) - yield IndexerTestDB(conn) - conn.close() +def test_db(temp_db_conn): + yield IndexerTestDB(temp_db_conn) @pytest.mark.parametrize("threads", [1, 15]) diff --git a/test/python/test_tools_exec_utils.py b/test/python/test_tools_exec_utils.py index d9f80740..a4eef61f 100644 --- a/test/python/test_tools_exec_utils.py +++ b/test/python/test_tools_exec_utils.py @@ -7,7 +7,6 @@ import tempfile import pytest -from nominatim.config import Configuration import nominatim.tools.exec_utils as exec_utils @pytest.fixture @@ -18,9 +17,9 @@ def tmp_phplib_dir(): yield Path(phpdir) @pytest.fixture -def nominatim_env(tmp_phplib_dir): +def nominatim_env(tmp_phplib_dir, def_config): class _NominatimEnv: - config = Configuration(None, Path(__file__) / '..' / '..' / '..' / 'settings') + config = def_config phplib_dir = tmp_phplib_dir data_dir = Path('data') project_dir = Path('.') diff --git a/test/python/test_tools_refresh_address_levels.py b/test/python/test_tools_refresh_address_levels.py new file mode 100644 index 00000000..87e34c61 --- /dev/null +++ b/test/python/test_tools_refresh_address_levels.py @@ -0,0 +1,85 @@ +""" +Tests for function for importing address ranks. +""" +import json +import pytest +from pathlib import Path + +from nominatim.tools.refresh import load_address_levels, load_address_levels_from_file + +def test_load_ranks_def_config(temp_db_conn, temp_db_cursor, def_config): + load_address_levels_from_file(temp_db_conn, Path(def_config.ADDRESS_LEVEL_CONFIG)) + + assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0 + +def test_load_ranks_from_file(temp_db_conn, temp_db_cursor, tmp_path): + test_file = tmp_path / 'test_levels.json' + test_file.write_text('[{"tags":{"place":{"sea":2}}}]') + + load_address_levels_from_file(temp_db_conn, test_file) + + assert temp_db_cursor.scalar('SELECT count(*) FROM address_levels') > 0 + + +def test_load_ranks_from_broken_file(temp_db_conn, tmp_path): + test_file = tmp_path / 'test_levels.json' + test_file.write_text('[{"tags":"place":{"sea":2}}}]') + + with pytest.raises(json.decoder.JSONDecodeError): + load_address_levels_from_file(temp_db_conn, test_file) + + +def test_load_ranks_country(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": {"place": {"village": 14}}}, + {"countries": ['de'], + "tags": {"place": {"village": 15}}}, + {"countries": ['uk', 'us' ], + "tags": {"place": {"village": 16}}} + ]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'village', 14, 14), + ('de', 'place', 'village', 15, 15), + ('uk', 'place', 'village', 16, 16), + ('us', 'place', 'village', 16, 16), + ]) + + +def test_load_ranks_default_value(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": {"boundary": {"": 28}}}, + {"countries": ['hu'], + "tags": {"boundary": {"": 29}}} + ]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'boundary', None, 28, 28), + ('hu', 'boundary', None, 29, 29), + ]) + + +def test_load_ranks_multiple_keys(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": + {"place": {"city": 14}, + "boundary": {"administrative2" : 4}} + }]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'city', 14, 14), + (None, 'boundary', 'administrative2', 4, 4), + ]) + + +def test_load_ranks_address(temp_db_conn, temp_db_cursor): + load_address_levels(temp_db_conn, 'levels', + [{"tags": + {"place": {"city": 14, + "town" : [14, 13]}} + }]) + + assert temp_db_cursor.row_set('SELECT * FROM levels') == \ + set([(None, 'place', 'city', 14, 14), + (None, 'place', 'town', 14, 13), + ])