From: Sarah Hoffmann Date: Mon, 17 Jul 2023 20:37:07 +0000 (+0200) Subject: port export function to Python X-Git-Tag: v4.3.0~48^2~6 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/0804cc0cff82d5ae0ff8c7d77f1653d1c8f3e908 port export function to Python Some of the parameters have been renoved as they don't make sense anymore. --- diff --git a/lib-php/admin/export.php b/lib-php/admin/export.php deleted file mode 100644 index 887b4beb..00000000 --- a/lib-php/admin/export.php +++ /dev/null @@ -1,190 +0,0 @@ - 1, - 'country' => 4, - 'state' => 8, - 'county' => 12, - 'city' => 16, - 'suburb' => 20, - 'street' => 26, - 'path' => 27 - ); - - $oDB = new Nominatim\DB(); - $oDB->connect(); - - if (isset($aCMDResult['output-type'])) { - if (!isset($aRankmap[$aCMDResult['output-type']])) { - fail('unknown output-type: '.$aCMDResult['output-type']); - } - $iOutputRank = $aRankmap[$aCMDResult['output-type']]; - } else { - $iOutputRank = $aRankmap['street']; - } - - - // Preferred language - $oParams = new Nominatim\ParameterParser(); - if (!isset($aCMDResult['language'])) { - $aCMDResult['language'] = 'xx'; - } - $aLangPrefOrder = $oParams->getPreferredLanguages($aCMDResult['language']); - $sLanguagePrefArraySQL = $oDB->getArraySQL($oDB->getDBQuotedList($aLangPrefOrder)); - - // output formatting: build up a lookup table that maps address ranks to columns - $aColumnMapping = array(); - $iNumCol = 0; - if (!isset($aCMDResult['output-format'])) { - $aCMDResult['output-format'] = 'street;suburb;city;county;state;country'; - } - foreach (preg_split('/\s*;\s*/', $aCMDResult['output-format']) as $sColumn) { - $bHasData = false; - foreach (preg_split('/\s*,\s*/', $sColumn) as $sRank) { - if ($sRank == 'postcode' || $sRank == 'placeid') { - $aColumnMapping[$sRank] = $iNumCol; - $bHasData = true; - } elseif (isset($aRankmap[$sRank])) { - $iRank = $aRankmap[$sRank]; - if ($iRank <= $iOutputRank) { - $aColumnMapping[(string)$iRank] = $iNumCol; - $bHasData = true; - } - } - } - if ($bHasData) { - $iNumCol++; - } - } - - // build the query for objects - $sPlacexSQL = 'select min(place_id) as place_id, '; - $sPlacexSQL .= 'array_agg(place_id) as place_ids, '; - $sPlacexSQL .= 'country_code as cc, '; - $sPlacexSQL .= 'postcode, '; - // get the address places excluding postcodes - $sPlacexSQL .= 'array(select address_place_id from place_addressline a'; - $sPlacexSQL .= ' where a.place_id = placex.place_id and isaddress'; - $sPlacexSQL .= ' and address_place_id != placex.place_id'; - $sPlacexSQL .= ' and not cached_rank_address in (5,11)'; - $sPlacexSQL .= ' and cached_rank_address > 2 order by cached_rank_address)'; - $sPlacexSQL .= ' as address'; - $sPlacexSQL .= ' from placex where name is not null and linked_place_id is null'; - - $sPlacexSQL .= ' and rank_address = '.$iOutputRank; - - if (isset($aCMDResult['restrict-to-country'])) { - $sPlacexSQL .= ' and country_code = '.$oDB->getDBQuoted($aCMDResult['restrict-to-country']); - } - - // restriction to parent place id - $sParentId = false; - $sOsmType = false; - - if (isset($aCMDResult['restrict-to-osm-node'])) { - $sOsmType = 'N'; - $sOsmId = $aCMDResult['restrict-to-osm-node']; - } - if (isset($aCMDResult['restrict-to-osm-way'])) { - $sOsmType = 'W'; - $sOsmId = $aCMDResult['restrict-to-osm-way']; - } - if (isset($aCMDResult['restrict-to-osm-relation'])) { - $sOsmType = 'R'; - $sOsmId = $aCMDResult['restrict-to-osm-relation']; - } - if ($sOsmType) { - $sSQL = 'select place_id from placex where osm_type = :osm_type and osm_id = :osm_id'; - $sParentId = $oDB->getOne($sSQL, array('osm_type' => $sOsmType, 'osm_id' => $sOsmId)); - if (!$sParentId) { - fail('Could not find place '.$sOsmType.' '.$sOsmId); - } - } - if ($sParentId) { - $sPlacexSQL .= ' and place_id in (select place_id from place_addressline where address_place_id = '.$sParentId.' and isaddress)'; - } - - $sPlacexSQL .= " group by name->'name', address, postcode, country_code, placex.place_id"; - - // Iterate over placeids - // to get further hierarchical information - //var_dump($sPlacexSQL); - $oResults = $oDB->getQueryStatement($sPlacexSQL); - $fOutstream = fopen('php://output', 'w'); - while ($aRow = $oResults->fetch()) { - $iPlaceID = $aRow['place_id']; - $sSQL = "select rank_address,get_name_by_language(name,$sLanguagePrefArraySQL) as localname from get_addressdata(:place_id, -1)"; - $sSQL .= ' WHERE isaddress'; - $sSQL .= ' order by rank_address desc,isaddress desc'; - $aAddressLines = $oDB->getAll($sSQL, array('place_id' => $iPlaceID)); - - $aOutput = array_fill(0, $iNumCol, ''); - // output address parts - foreach ($aAddressLines as $aAddress) { - if (isset($aColumnMapping[$aAddress['rank_address']])) { - $aOutput[$aColumnMapping[$aAddress['rank_address']]] = $aAddress['localname']; - } - } - // output postcode - if (isset($aColumnMapping['postcode'])) { - if ($aCMDResult['output-all-postcodes']) { - $sSQL = 'select array_agg(px.postcode) from placex px join place_addressline pa '; - $sSQL .= 'on px.place_id = pa.address_place_id '; - $sSQL .= 'where pa.cached_rank_address in (5,11) '; - $sSQL .= 'and pa.place_id in (select place_id from place_addressline where address_place_id in (:first_place_id)) '; - $sSQL .= 'group by postcode order by count(*) desc limit 1'; - $sRes = $oDB->getOne($sSQL, array('first_place_id' => substr($aRow['place_ids'], 1, -1))); - - $aOutput[$aColumnMapping['postcode']] = substr($sRes, 1, -1); - } else { - $aOutput[$aColumnMapping['postcode']] = $aRow['postcode']; - } - } - if (isset($aColumnMapping['placeid'])) { - $aOutput[$aColumnMapping['placeid']] = substr($aRow['place_ids'], 1, -1); - } - fputcsv($fOutstream, $aOutput); - } - fclose($fOutstream); diff --git a/nominatim/api/__init__.py b/nominatim/api/__init__.py index 794cd96c..9e3d6a1d 100644 --- a/nominatim/api/__init__.py +++ b/nominatim/api/__init__.py @@ -16,6 +16,7 @@ import from this file, not from the source files directly. from .core import (NominatimAPI as NominatimAPI, NominatimAPIAsync as NominatimAPIAsync) +from .connection import (SearchConnection as SearchConnection) from .status import (StatusResult as StatusResult) from .types import (PlaceID as PlaceID, OsmID as OsmID, diff --git a/nominatim/cli.py b/nominatim/cli.py index 836f9037..4a8fe3e4 100644 --- a/nominatim/cli.py +++ b/nominatim/cli.py @@ -2,13 +2,13 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2022 by the Nominatim developer community. +# Copyright (C) 2023 by the Nominatim developer community. # For a full list of authors see the git log. """ Command-line interface to the Nominatim functions for import, update, database administration and querying. """ -from typing import Optional, Any, List, Union +from typing import Optional, Any import importlib import logging import os @@ -17,7 +17,7 @@ import argparse from pathlib import Path from nominatim.config import Configuration -from nominatim.tools.exec_utils import run_legacy_script, run_php_server +from nominatim.tools.exec_utils import run_php_server from nominatim.errors import UsageError from nominatim import clicmd from nominatim import version @@ -140,60 +140,6 @@ class CommandlineParser: # # No need to document the functions each time. # pylint: disable=C0111 -class QueryExport: - """\ - Export addresses as CSV file from the database. - """ - - def add_args(self, parser: argparse.ArgumentParser) -> None: - group = parser.add_argument_group('Output arguments') - group.add_argument('--output-type', default='street', - choices=('continent', 'country', 'state', 'county', - 'city', 'suburb', 'street', 'path'), - help='Type of places to output (default: street)') - group.add_argument('--output-format', - default='street;suburb;city;county;state;country', - help=("Semicolon-separated list of address types " - "(see --output-type). Multiple ranks can be " - "merged into one column by simply using a " - "comma-separated list.")) - group.add_argument('--output-all-postcodes', action='store_true', - help=("List all postcodes for address instead of " - "just the most likely one")) - group.add_argument('--language', - help=("Preferred language for output " - "(use local name, if omitted)")) - group = parser.add_argument_group('Filter arguments') - group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE', - help='Export only objects within country') - group.add_argument('--restrict-to-osm-node', metavar='ID', type=int, - help='Export only children of this OSM node') - group.add_argument('--restrict-to-osm-way', metavar='ID', type=int, - help='Export only children of this OSM way') - group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int, - help='Export only children of this OSM relation') - - - def run(self, args: NominatimArgs) -> int: - params: List[Union[int, str]] = [ - '--output-type', args.output_type, - '--output-format', args.output_format] - if args.output_all_postcodes: - params.append('--output-all-postcodes') - if args.language: - params.extend(('--language', args.language)) - if args.restrict_to_country: - params.extend(('--restrict-to-country', args.restrict_to_country)) - if args.restrict_to_osm_node: - params.extend(('--restrict-to-osm-node', args.restrict_to_osm_node)) - if args.restrict_to_osm_way: - params.extend(('--restrict-to-osm-way', args.restrict_to_osm_way)) - if args.restrict_to_osm_relation: - params.extend(('--restrict-to-osm-relation', args.restrict_to_osm_relation)) - - return run_legacy_script('export.php', *params, config=args.config) - - class AdminServe: """\ Start a simple web server for serving the API. @@ -260,7 +206,7 @@ def get_set_parser() -> CommandlineParser: parser.add_subcommand('admin', clicmd.AdminFuncs()) - parser.add_subcommand('export', QueryExport()) + parser.add_subcommand('export', clicmd.QueryExport()) parser.add_subcommand('serve', AdminServe()) parser.add_subcommand('search', clicmd.APISearch()) diff --git a/nominatim/clicmd/__init__.py b/nominatim/clicmd/__init__.py index bdd9bafe..235dff0c 100644 --- a/nominatim/clicmd/__init__.py +++ b/nominatim/clicmd/__init__.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2022 by the Nominatim developer community. +# Copyright (C) 2023 by the Nominatim developer community. # For a full list of authors see the git log. """ Subcommand definitions for the command-line tool. @@ -24,3 +24,4 @@ from nominatim.clicmd.add_data import UpdateAddData as UpdateAddData from nominatim.clicmd.admin import AdminFuncs as AdminFuncs from nominatim.clicmd.freeze import SetupFreeze as SetupFreeze from nominatim.clicmd.special_phrases import ImportSpecialPhrases as ImportSpecialPhrases +from nominatim.clicmd.export import QueryExport as QueryExport diff --git a/nominatim/clicmd/args.py b/nominatim/clicmd/args.py index 10316165..e3150c3e 100644 --- a/nominatim/clicmd/args.py +++ b/nominatim/clicmd/args.py @@ -100,9 +100,6 @@ class NominatimArgs: output_all_postcodes: bool language: Optional[str] restrict_to_country: Optional[str] - restrict_to_osm_node: Optional[int] - restrict_to_osm_way: Optional[int] - restrict_to_osm_relation: Optional[int] # Arguments to 'refresh' postcodes: bool diff --git a/nominatim/clicmd/export.py b/nominatim/clicmd/export.py new file mode 100644 index 00000000..ddddc5d7 --- /dev/null +++ b/nominatim/clicmd/export.py @@ -0,0 +1,193 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Implementation of the 'export' subcommand. +""" +from typing import Optional, List, cast +import logging +import argparse +import asyncio +import csv +import sys + +import sqlalchemy as sa + +from nominatim.clicmd.args import NominatimArgs +import nominatim.api as napi +from nominatim.api.results import create_from_placex_row, ReverseResult, add_result_details +from nominatim.api.types import LookupDetails +from nominatim.errors import UsageError + +# Do not repeat documentation of subcommand classes. +# pylint: disable=C0111 +# Using non-top-level imports to avoid eventually unused imports. +# pylint: disable=E0012,C0415 +# Needed for SQLAlchemy +# pylint: disable=singleton-comparison + +LOG = logging.getLogger() + +RANK_RANGE_MAP = { + 'country': (4, 4), + 'state': (5, 9), + 'county': (10, 12), + 'city': (13, 16), + 'suburb': (17, 21), + 'street': (26, 26), + 'path': (27, 27) +} + +RANK_TO_OUTPUT_MAP = { + 4: 'country', + 5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state', + 10: 'county', 11: 'county', 12: 'county', + 13: 'city', 14: 'city', 15: 'city', 16: 'city', + 17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb', + 26: 'street', 27: 'path'} + +class QueryExport: + """\ + Export places as CSV file from the database. + """ + + def add_args(self, parser: argparse.ArgumentParser) -> None: + group = parser.add_argument_group('Output arguments') + group.add_argument('--output-type', default='street', + choices=('country', 'state', 'county', + 'city', 'suburb', 'street', 'path'), + help='Type of places to output (default: street)') + group.add_argument('--output-format', + default='street;suburb;city;county;state;country', + help=("Semicolon-separated list of address types " + "(see --output-type).")) + group.add_argument('--language', + help=("Preferred language for output " + "(use local name, if omitted)")) + group = parser.add_argument_group('Filter arguments') + group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE', + help='Export only objects within country') + group.add_argument('--restrict-to-osm-node', metavar='ID', type=int, + dest='node', + help='Export only children of this OSM node') + group.add_argument('--restrict-to-osm-way', metavar='ID', type=int, + dest='way', + help='Export only children of this OSM way') + group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int, + dest='relation', + help='Export only children of this OSM relation') + + + def run(self, args: NominatimArgs) -> int: + return asyncio.run(export(args)) + + +async def export(args: NominatimArgs) -> int: + """ The actual export as a asynchronous function. + """ + + api = napi.NominatimAPIAsync(args.project_dir) + + output_range = RANK_RANGE_MAP[args.output_type] + + writer = init_csv_writer(args.output_format) + + async with api.begin() as conn, api.begin() as detail_conn: + t = conn.t.placex + + sql = sa.select(t.c.place_id, t.c.osm_type, t.c.osm_id, t.c.name, + t.c.class_, t.c.type, t.c.admin_level, + t.c.address, t.c.extratags, + t.c.housenumber, t.c.postcode, t.c.country_code, + t.c.importance, t.c.wikipedia, t.c.indexed_date, + t.c.rank_address, t.c.rank_search, + t.c.centroid)\ + .where(t.c.linked_place_id == None)\ + .where(t.c.rank_address.between(*output_range)) + + parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation) + if parent_place_id: + taddr = conn.t.addressline + + sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\ + .where(taddr.c.address_place_id == parent_place_id)\ + .where(taddr.c.isaddress) + + if args.restrict_to_country: + sql = sql.where(t.c.country_code == args.restrict_to_country.lower()) + + results = [] + for row in await conn.execute(sql): + result = create_from_placex_row(row, ReverseResult) + if result is not None: + results.append(result) + + if len(results) == 1000: + await dump_results(detail_conn, results, writer, args.language) + results = [] + + if results: + await dump_results(detail_conn, results, writer, args.language) + + return 0 + + +def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]': + fields = output_format.split(';') + writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore') + writer.writeheader() + + return writer + + +async def dump_results(conn: napi.SearchConnection, + results: List[ReverseResult], + writer: 'csv.DictWriter[str]', + lang: Optional[str]) -> None: + await add_result_details(conn, results, + LookupDetails(address_details=True)) + + + locale = napi.Locales([lang] if lang else None) + + for result in results: + data = {'placeid': result.place_id, + 'postcode': result.postcode} + + result.localize(locale) + for line in (result.address_rows or []): + if line.isaddress and line.local_name\ + and line.rank_address in RANK_TO_OUTPUT_MAP: + data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name + + writer.writerow(data) + + +async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int], + way_id: Optional[int], + relation_id: Optional[int]) -> Optional[int]: + """ Get the place ID for the given OSM object. + """ + if node_id is not None: + osm_type, osm_id = 'N', node_id + elif way_id is not None: + osm_type, osm_id = 'W', way_id + elif relation_id is not None: + osm_type, osm_id = 'R', relation_id + else: + return None + + t = conn.t.placex + sql = sa.select(t.c.place_id).limit(1)\ + .where(t.c.osm_type == osm_type)\ + .where(t.c.osm_id == osm_id)\ + .where(t.c.rank_address > 0)\ + .order_by(t.c.rank_address) + + for result in await conn.execute(sql): + return cast(int, result[0]) + + raise UsageError(f'Cannot find a place {osm_type}{osm_id}.') diff --git a/test/python/cli/test_cli.py b/test/python/cli/test_cli.py index f1bb75a9..93e86108 100644 --- a/test/python/cli/test_cli.py +++ b/test/python/cli/test_cli.py @@ -100,35 +100,6 @@ def test_cli_serve_uvicorn_based(cli_call, engine, mock_func_factory): assert func.last_kwargs['host'] == '127.0.0.1' assert func.last_kwargs['port'] == 8088 -def test_cli_export_command(cli_call, mock_run_legacy): - assert cli_call('export', '--output-all-postcodes') == 0 - - assert mock_run_legacy.called == 1 - assert mock_run_legacy.last_args[0] == 'export.php' - - -@pytest.mark.parametrize("param,value", [('output-type', 'country'), - ('output-format', 'street;city'), - ('language', 'xf'), - ('restrict-to-country', 'us'), - ('restrict-to-osm-node', '536'), - ('restrict-to-osm-way', '727'), - ('restrict-to-osm-relation', '197532') - ]) -def test_export_parameters(src_dir, tmp_path, param, value, monkeypatch): - (tmp_path / 'admin').mkdir() - (tmp_path / 'admin' / 'export.php').write_text(f"""= 0 ? 0 : 10); - """) - - monkeypatch.setattr(nominatim.paths, 'PHPLIB_DIR', tmp_path) - - assert nominatim.cli.nominatim(module_dir='MODULE NOT AVAILABLE', - osm2pgsql_path='OSM2PGSQL NOT AVAILABLE', - phpcgi_path='/usr/bin/php-cgi', - cli_args=['export', '--' + param, value]) == 0 - - class TestCliWithDb: