git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2143 from lonvia/integrate-indexer-into-nominatim-tool
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Jan 2021 07:42:22 +0000 (08:42 +0100)
committer GitHub <noreply@github.com>
Tue, 19 Jan 2021 07:42:22 +0000 (08:42 +0100)
Integrate indexer into nominatim tool

13 files changed:
cmake/script.tmpl
cmake/tool.tmpl
lib/Shell.php
lib/admin/update.php
lib/setup/SetupClass.php
nominatim/cli.py
nominatim/config.py
nominatim/db/__init__.py [new file with mode: 0644]
nominatim/db/async_connection.py [moved from nominatim/indexer/db.py with 87% similarity]
nominatim/indexer/indexer.py [moved from nominatim/nominatim.py with 54% similarity, mode: 0644]
nominatim/indexer/progress.py
test/bdd/steps/nominatim_environment.py
test/python/test_config.py

index 30b8717bac8af7d67918d157f7d7f4ff59b2bf47..aa25a1248418d064916454880eff55f94a71adf5 100755 (executable)
@@ -8,5 +8,6 @@ require('@CMAKE_SOURCE_DIR@/lib/dotenv_loader.php');
 @define('CONST_DataDir', '@CMAKE_SOURCE_DIR@');
 
 loadDotEnv();
+$_SERVER['NOMINATIM_NOMINATIM_TOOL'] = '@CMAKE_BINARY_DIR@/nominatim';
 
 require_once('@CMAKE_SOURCE_DIR@/lib/admin/@script_source@');
index 40f2b8ea7920e910641577002a2aac1d051fadfd..43646792d19d8824753a331f39b3111fa1d6f357 100755 (executable)
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
 import sys
+import os
 
 sys.path.insert(1, '@CMAKE_SOURCE_DIR@')
 
+os.environ['NOMINATIM_NOMINATIM_TOOL'] = __file__
+
 from nominatim import cli
 
 exit(cli.nominatim(module_dir='@CMAKE_BINARY_DIR@/module',
index 59c4473bde94d43b2569331fac3814211d0b10ed..72f90735e9763e798cb354155e8b077b37666f7e 100644 (file)
@@ -7,7 +7,7 @@ class Shell
     public function __construct($sBaseCmd, ...$aParams)
     {
         if (!$sBaseCmd) {
-            throw new Exception('Command missing in new() call');
+            throw new \Exception('Command missing in new() call');
         }
         $this->baseCmd = $sBaseCmd;
         $this->aParams = array();
index 50f611d71aa9ab7c9294cc649f7394eb030703ff..fe9658b54f42ee458d25fcfb75be6ae38a1782fd 100644 (file)
@@ -105,25 +105,14 @@ if ($fPostgresVersion >= 11.0) {
 }
 
 
-$oIndexCmd = (new \Nominatim\Shell(CONST_DataDir.'/nominatim/nominatim.py'))
-             ->addParams('--database', $aDSNInfo['database'])
-             ->addParams('--port', $aDSNInfo['port'])
-             ->addParams('--threads', $aResult['index-instances']);
-if (!$aResult['quiet']) {
-    $oIndexCmd->addParams('--verbose');
+$oIndexCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')))
+             ->addParams('index');
+if ($aResult['quiet']) {
+    $oIndexCmd->addParams('--quiet');
 }
 if ($aResult['verbose']) {
     $oIndexCmd->addParams('--verbose');
 }
-if (isset($aDSNInfo['hostspec']) && $aDSNInfo['hostspec']) {
-    $oIndexCmd->addParams('--host', $aDSNInfo['hostspec']);
-}
-if (isset($aDSNInfo['username']) && $aDSNInfo['username']) {
-    $oIndexCmd->addParams('--username', $aDSNInfo['username']);
-}
-if (isset($aDSNInfo['password']) && $aDSNInfo['password']) {
-    $oIndexCmd->addEnvPair('PGPASSWORD', $aDSNInfo['password']);
-}
 
 $sPyosmiumBin = getSetting('PYOSMIUM_BINARY');
 $sBaseURL = getSetting('REPLICATION_URL');
@@ -288,15 +277,9 @@ if ($aResult['recompute-word-counts']) {
 }
 
 if ($aResult['index']) {
-    $oCmd = (clone $oIndexCmd)
-            ->addParams('--minrank', $aResult['index-rank'], '-b');
-    $oCmd->run();
-
     $oCmd = (clone $oIndexCmd)
             ->addParams('--minrank', $aResult['index-rank']);
     $oCmd->run();
-
-    $oDB->exec('update import_status set indexed = true');
 }
 
 if ($aResult['update-address-levels']) {
@@ -438,15 +421,6 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
         if (!$aResult['no-index']) {
             $fCMDStartTime = time();
 
-            $oThisIndexCmd = clone($oIndexCmd);
-            $oThisIndexCmd->addParams('-b');
-            echo $oThisIndexCmd->escapedCmd()."\n";
-            $iErrorLevel = $oThisIndexCmd->run();
-            if ($iErrorLevel) {
-                echo "Error: $iErrorLevel\n";
-                exit($iErrorLevel);
-            }
-
             $oThisIndexCmd = clone($oIndexCmd);
             echo $oThisIndexCmd->escapedCmd()."\n";
             $iErrorLevel = $oThisIndexCmd->run();
@@ -463,9 +437,6 @@ if ($aResult['import-osmosis'] || $aResult['import-osmosis-all']) {
             var_Dump($sSQL);
             $oDB->exec($sSQL);
             echo date('Y-m-d H:i:s')." Completed index step for $sBatchEnd in ".round((time()-$fCMDStartTime)/60, 2)." minutes\n";
-
-            $sSQL = 'update import_status set indexed = true';
-            $oDB->exec($sSQL);
         } else {
             if ($aResult['import-osmosis-all']) {
                 echo "Error: --no-index cannot be used with continuous imports (--import-osmosis-all).\n";
index 77b14a8a4c27260a3a5bde6c303a43f86b16af83..d17fdca7c8b0763324c60fd2e985775cf7db524e 100755 (executable)
@@ -549,26 +549,15 @@ class SetupFunctions
     {
         $this->checkModulePresence(); // raises exception on failure
 
-        $oBaseCmd = (new \Nominatim\Shell(CONST_DataDir.'/nominatim/nominatim.py'))
-                    ->addParams('--database', $this->aDSNInfo['database'])
-                    ->addParams('--port', $this->aDSNInfo['port'])
-                    ->addParams('--threads', $this->iInstances);
+        $oBaseCmd = (new \Nominatim\Shell(getSetting('NOMINATIM_TOOL')))
+                    ->addParams('index');
 
-        if (!$this->bQuiet) {
-            $oBaseCmd->addParams('-v');
+        if ($this->bQuiet) {
+            $oBaseCmd->addParams('-q');
         }
         if ($this->bVerbose) {
             $oBaseCmd->addParams('-v');
         }
-        if (isset($this->aDSNInfo['hostspec'])) {
-            $oBaseCmd->addParams('--host', $this->aDSNInfo['hostspec']);
-        }
-        if (isset($this->aDSNInfo['username'])) {
-            $oBaseCmd->addParams('--user', $this->aDSNInfo['username']);
-        }
-        if (isset($this->aDSNInfo['password'])) {
-            $oBaseCmd->addEnvPair('PGPASSWORD', $this->aDSNInfo['password']);
-        }
 
         info('Index ranks 0 - 4');
         $oCmd = (clone $oBaseCmd)->addParams('--maxrank', 4);
@@ -581,14 +570,14 @@ class SetupFunctions
         if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE');
 
         info('Index administrative boundaries');
-        $oCmd = (clone $oBaseCmd)->addParams('-b');
+        $oCmd = (clone $oBaseCmd)->addParams('--boundaries-only');
         $iStatus = $oCmd->run();
         if ($iStatus != 0) {
             fail('error status ' . $iStatus . ' running nominatim!');
         }
 
         info('Index ranks 5 - 25');
-        $oCmd = (clone $oBaseCmd)->addParams('--minrank', 5, '--maxrank', 25);
+        $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 5, '--maxrank', 25);
         $iStatus = $oCmd->run();
         if ($iStatus != 0) {
             fail('error status ' . $iStatus . ' running nominatim!');
@@ -597,7 +586,7 @@ class SetupFunctions
         if (!$bIndexNoanalyse) $this->pgsqlRunScript('ANALYSE');
 
         info('Index ranks 26 - 30');
-        $oCmd = (clone $oBaseCmd)->addParams('--minrank', 26);
+        $oCmd = (clone $oBaseCmd)->addParams('--no-boundaries', '--minrank', 26);
         $iStatus = $oCmd->run();
         if ($iStatus != 0) {
             fail('error status ' . $iStatus . ' running nominatim!');
index 8d4071db973a117f5ba5c501b70dafed6bdd033a..65ea90bbc3534b4421558e883ecaf31796e5b0b5 100644 (file)
@@ -11,6 +11,17 @@ from pathlib import Path
 from .config import Configuration
 from .admin.exec_utils import run_legacy_script
 
+from .indexer.indexer import Indexer
+
+def _num_system_cpus():
+    try:
+        cpus = len(os.sched_getaffinity(0))
+    except NotImplementedError:
+        cpus = None
+
+    return cpus or os.cpu_count()
+
+
 class CommandlineParser:
     """ Wraps some of the common functions for parsing the command line
         and setting up subcommands.
@@ -67,7 +78,7 @@ class CommandlineParser:
         args.project_dir = Path(args.project_dir)
 
         logging.basicConfig(stream=sys.stderr,
-                            format='%(asctime)s %(levelname)s: %(message)s',
+                            format='%(asctime)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S',
                             level=max(4 - args.verbose, 1) * 10)
 
@@ -297,11 +308,30 @@ class UpdateIndex:
 
     @staticmethod
     def add_args(parser):
-        pass
+        group = parser.add_argument_group('Filter arguments')
+        group.add_argument('--boundaries-only', action='store_true',
+                           help="""Index only administrative boundaries.""")
+        group.add_argument('--no-boundaries', action='store_true',
+                           help="""Index everything except administrative boundaries.""")
+        group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0,
+                           help='Minimum/starting rank')
+        group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
+                           help='Maximum/finishing rank')
 
     @staticmethod
     def run(args):
-        return run_legacy_script('update.php', '--index', nominatim_env=args)
+        indexer = Indexer(args.config.get_libpq_dsn(),
+                          args.threads or _num_system_cpus() or 1)
+
+        if not args.no_boundaries:
+            indexer.index_boundaries(args.minrank, args.maxrank)
+        if not args.boundaries_only:
+            indexer.index_by_rank(args.minrank, args.maxrank)
+
+        if not args.no_boundaries and not args.boundaries_only:
+            indexer.update_status_table()
+
+        return 0
 
 
 class UpdateRefresh:
index 911c7ddf127f68f438ab582c75fa38d5ed469dd8..458c828f58fce8adeda02fd550823641366ceb2c 100644 (file)
@@ -29,6 +29,18 @@ class Configuration:
 
         return os.environ.get(name) or self._config[name]
 
+    def get_libpq_dsn(self):
+        """ Get configured database DSN converted into the key/value format
+            understood by libpq and psycopg.
+        """
+        dsn = self.DATABASE_DSN
+
+        if dsn.startswith('pgsql:'):
+            # Old PHP DSN format. Convert before returning.
+            return dsn[6:].replace(';', ' ')
+
+        return dsn
+
     def get_os_env(self):
         """ Return a copy of the OS environment with the Nominatim configuration
             merged in.
diff --git a/nominatim/db/__init__.py b/nominatim/db/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
similarity index 87%
rename from nominatim/indexer/db.py
rename to nominatim/db/async_connection.py
index 85b844312ba271a638be8ef084dbe34a9274889d..45e83664663ba835419db49304209b9da6491d35 100644 (file)
@@ -11,26 +11,14 @@ from psycopg2.extras import wait_select
 
 LOG = logging.getLogger()
 
-def make_connection(options, asynchronous=False):
-    """ Create a psycopg2 connection from the given options.
-    """
-    params = {'dbname' : options.dbname,
-              'user' : options.user,
-              'password' : options.password,
-              'host' : options.host,
-              'port' : options.port,
-              'async' : asynchronous}
-
-    return psycopg2.connect(**params)
-
 class DBConnection:
     """ A single non-blocking database connection.
     """
 
-    def __init__(self, options):
+    def __init__(self, dsn):
         self.current_query = None
         self.current_params = None
-        self.options = options
+        self.dsn = dsn
 
         self.conn = None
         self.cursor = None
@@ -46,7 +34,9 @@ class DBConnection:
             self.cursor.close()
             self.conn.close()
 
-        self.conn = make_connection(self.options, asynchronous=True)
+        # Use a dict to hand in the parameters because async is a reserved
+        # word in Python3.
+        self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
         self.wait()
 
         self.cursor = self.conn.cursor()
old mode 100755 (executable)
new mode 100644 (file)
similarity index 54%
rename from nominatim/nominatim.py
rename to nominatim/indexer/indexer.py
index 8cac583..094d127
@@ -1,35 +1,14 @@
-#! /usr/bin/env python3
-#-----------------------------------------------------------------------------
-# nominatim - [description]
-#-----------------------------------------------------------------------------
-#
-# Indexing tool for the Nominatim database.
-#
-# Based on C version by Brian Quinion
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-#-----------------------------------------------------------------------------
+"""
+Main work horse for indexing (computing addresses) the database.
+"""
 # pylint: disable=C0111
-from argparse import ArgumentParser, RawDescriptionHelpFormatter
 import logging
-import sys
-import getpass
 import select
 
-from indexer.progress import ProgressLogger # pylint: disable=E0401
-from indexer.db import DBConnection, make_connection # pylint: disable=E0401
+import psycopg2
+
+from .progress import ProgressLogger
+from ..db.async_connection import DBConnection
 
 LOG = logging.getLogger()
 
@@ -117,34 +96,40 @@ class Indexer:
     """ Main indexing routine.
     """
 
-    def __init__(self, opts):
-        self.minrank = max(1, opts.minrank)
-        self.maxrank = min(30, opts.maxrank)
-        self.conn = make_connection(opts)
-        self.threads = [DBConnection(opts) for _ in range(opts.threads)]
+    def __init__(self, dsn, num_threads):
+        self.conn = psycopg2.connect(dsn)
+        self.threads = [DBConnection(dsn) for _ in range(num_threads)]
 
-    def index_boundaries(self):
+    def index_boundaries(self, minrank, maxrank):
         LOG.warning("Starting indexing boundaries using %s threads",
                     len(self.threads))
 
-        for rank in range(max(self.minrank, 5), min(self.maxrank, 26)):
+        for rank in range(max(minrank, 5), min(maxrank, 26)):
             self.index(BoundaryRunner(rank))
 
-    def index_by_rank(self):
+    def index_by_rank(self, minrank, maxrank):
         """ Run classic indexing by rank.
         """
+        maxrank = min(maxrank, 30)
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
-                    self.minrank, self.maxrank, len(self.threads))
+                    minrank, maxrank, len(self.threads))
 
-        for rank in range(max(1, self.minrank), self.maxrank):
+        for rank in range(max(1, minrank), maxrank):
             self.index(RankRunner(rank))
 
-        if self.maxrank == 30:
+        if maxrank == 30:
             self.index(RankRunner(0))
             self.index(InterpolationRunner(), 20)
-            self.index(RankRunner(self.maxrank), 20)
+            self.index(RankRunner(30), 20)
         else:
-            self.index(RankRunner(self.maxrank))
+            self.index(RankRunner(maxrank))
+
+    def update_status_table(self):
+        """ Update the status in the status table to 'indexed'.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute('UPDATE import_status SET indexed = true')
+        self.conn.commit()
 
     def index(self, obj, batch=1):
         """ Index a single rank or table. `obj` describes the SQL to use
@@ -212,60 +197,3 @@ class Indexer:
                 ready, _, _ = select.select(self.threads, [], [])
 
         assert False, "Unreachable code"
-
-
-def nominatim_arg_parser():
-    """ Setup the command-line parser for the tool.
-    """
-    parser = ArgumentParser(description="Indexing tool for Nominatim.",
-                            formatter_class=RawDescriptionHelpFormatter)
-
-    parser.add_argument('-d', '--database',
-                        dest='dbname', action='store', default='nominatim',
-                        help='Name of the PostgreSQL database to connect to.')
-    parser.add_argument('-U', '--username',
-                        dest='user', action='store',
-                        help='PostgreSQL user name.')
-    parser.add_argument('-W', '--password',
-                        dest='password_prompt', action='store_true',
-                        help='Force password prompt.')
-    parser.add_argument('-H', '--host',
-                        dest='host', action='store',
-                        help='PostgreSQL server hostname or socket location.')
-    parser.add_argument('-P', '--port',
-                        dest='port', action='store',
-                        help='PostgreSQL server port')
-    parser.add_argument('-b', '--boundary-only',
-                        dest='boundary_only', action='store_true',
-                        help='Only index administrative boundaries (ignores min/maxrank).')
-    parser.add_argument('-r', '--minrank',
-                        dest='minrank', type=int, metavar='RANK', default=0,
-                        help='Minimum/starting rank.')
-    parser.add_argument('-R', '--maxrank',
-                        dest='maxrank', type=int, metavar='RANK', default=30,
-                        help='Maximum/finishing rank.')
-    parser.add_argument('-t', '--threads',
-                        dest='threads', type=int, metavar='NUM', default=1,
-                        help='Number of threads to create for indexing.')
-    parser.add_argument('-v', '--verbose',
-                        dest='loglevel', action='count', default=0,
-                        help='Increase verbosity')
-
-    return parser
-
-if __name__ == '__main__':
-    logging.basicConfig(stream=sys.stderr, format='%(levelname)s: %(message)s')
-
-    OPTIONS = nominatim_arg_parser().parse_args(sys.argv[1:])
-
-    LOG.setLevel(max(3 - OPTIONS.loglevel, 0) * 10)
-
-    OPTIONS.password = None
-    if OPTIONS.password_prompt:
-        PASSWORD = getpass.getpass("Database password: ")
-        OPTIONS.password = PASSWORD
-
-    if OPTIONS.boundary_only:
-        Indexer(OPTIONS).index_boundaries()
-    else:
-        Indexer(OPTIONS).index_by_rank()
index 99120673faa67680216ac5fc48d6c8f93da62d03..c9d8816be989fb99675341a512c6806efcf06465 100644 (file)
@@ -26,7 +26,7 @@ class ProgressLogger:
         self.done_places = 0
         self.rank_start_time = datetime.now()
         self.log_interval = log_interval
-        self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.INFO) else total + 1
+        self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1
 
     def add(self, num=1):
         """ Mark `num` places as processed. Print a log message if the
@@ -47,9 +47,9 @@ class ProgressLogger:
         places_per_sec = self.done_places / done_time
         eta = (self.total_places - self.done_places) / places_per_sec
 
-        LOG.info("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
-                 self.done_places, int(done_time),
-                 places_per_sec, self.name, eta)
+        LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
+                    self.done_places, int(done_time),
+                    places_per_sec, self.name, eta)
 
         self.next_info += int(places_per_sec) * self.log_interval
 
index 68d7b2f43544ef1626d29e7ca66962b1bea682ff..0ee921375e8544594e65ddb2b213ceffe8cb1a59 100644 (file)
@@ -91,6 +91,7 @@ class NominatimEnvironment:
         self.test_env['NOMINATIM_BINDIR'] = self.src_dir / 'utils'
         self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.build_dir / 'module'
         self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = self.build_dir / 'osm2pgsql' / 'osm2pgsql'
+        self.test_env['NOMINATIM_NOMINATIM_TOOL'] = self.build_dir / 'nominatim'
 
         if self.server_module_path:
             self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
index 03e4a800afa07eafe46d415ba51c0b5b415d891c..e5d18f91f392218b3ea9dc5b360439e6f936a9a6 100644 (file)
@@ -54,3 +54,22 @@ def test_get_os_env_prefer_os_environ():
     assert config.get_os_env()['NOMINATIM_DATABASE_WEBUSER'] == 'nobody'
 
     del os.environ['NOMINATIM_DATABASE_WEBUSER']
+
+def test_get_libpq_dsn_convert_default():
+    config = Configuration(None, DEFCFG_DIR)
+
+    assert config.get_libpq_dsn() == 'dbname=nominatim'
+
+def test_get_libpq_dsn_convert_php():
+    config = Configuration(None, DEFCFG_DIR)
+
+    os.environ['NOMINATIM_DATABASE_DSN'] = 'pgsql:dbname=gis;password=foo;host=localhost'
+
+    assert config.get_libpq_dsn() == 'dbname=gis password=foo host=localhost'
+
+def test_get_libpq_dsn_convert_libpq():
+    config = Configuration(None, DEFCFG_DIR)
+
+    os.environ['NOMINATIM_DATABASE_DSN'] = 'host=localhost dbname=gis password=foo'
+
+    assert config.get_libpq_dsn() == 'host=localhost dbname=gis password=foo'