X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/f7e4aa51d386bacfe6bc58073f21dabbc288c849..96e6bbe3a1f2e2d41064e4ca19c0e22de42896c1:/nominatim/indexer/indexer.py?ds=sidebyside diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index faf91c45..b7673aba 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database. """ import logging import select +import time + +import psycopg2.extras from nominatim.indexer.progress import ProgressLogger from nominatim.indexer import runners @@ -11,6 +14,73 @@ from nominatim.db.connection import connect LOG = logging.getLogger() + +class PlaceFetcher: + """ Asynchronous connection that fetches place details for processing. + """ + def __init__(self, dsn, setup_conn): + self.wait_time = 0 + self.current_ids = None + self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor) + + with setup_conn.cursor() as cur: + # need to fetch those manually because register_hstore cannot + # fetch them on an asynchronous connection below. + hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid") + hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid") + + psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid, + array_oid=hstore_array_oid) + + def close(self): + """ Close the underlying asynchronous connection. + """ + if self.conn: + self.conn.close() + self.conn = None + + + def fetch_next_batch(self, cur, runner): + """ Send a request for the next batch of places. + If details for the places are required, they will be fetched + asynchronously. + + Returns true if there is still data available. 
+ """ + ids = cur.fetchmany(100) + + if not ids: + self.current_ids = None + return False + + if hasattr(runner, 'get_place_details'): + runner.get_place_details(self.conn, ids) + self.current_ids = [] + else: + self.current_ids = ids + + return True + + def get_batch(self): + """ Get the next batch of data, previously requested with + `fetch_next_batch`. + """ + if self.current_ids is not None and not self.current_ids: + tstart = time.time() + self.conn.wait() + self.wait_time += time.time() - tstart + self.current_ids = self.conn.cursor.fetchall() + + return self.current_ids + + def __enter__(self): + return self + + + def __exit__(self, exc_type, exc_value, traceback): + self.conn.wait() + self.close() + class WorkerPool: """ A pool of asynchronous database connections. @@ -21,6 +91,7 @@ class WorkerPool: def __init__(self, dsn, pool_size): self.threads = [DBConnection(dsn) for _ in range(pool_size)] self.free_workers = self._yield_free_worker() + self.wait_time = 0 def finish_all(self): @@ -64,7 +135,9 @@ class WorkerPool: ready = self.threads command_stat = 0 else: + tstart = time.time() _, ready, _ = select.select([], self.threads, []) + self.wait_time += time.time() - tstart def __enter__(self): @@ -72,6 +145,7 @@ class WorkerPool: def __exit__(self, exc_type, exc_value, traceback): + self.finish_all() self.close() @@ -79,8 +153,9 @@ class Indexer: """ Main indexing routine. 
""" - def __init__(self, dsn, num_threads): + def __init__(self, dsn, tokenizer, num_threads): self.dsn = dsn + self.tokenizer = tokenizer self.num_threads = num_threads @@ -123,8 +198,9 @@ class Indexer: LOG.warning("Starting indexing boundaries using %s threads", self.num_threads) - for rank in range(max(minrank, 4), min(maxrank, 26)): - self._index(runners.BoundaryRunner(rank)) + with self.tokenizer.name_analyzer() as analyzer: + for rank in range(max(minrank, 4), min(maxrank, 26)): + self._index(runners.BoundaryRunner(rank, analyzer)) def index_by_rank(self, minrank, maxrank): """ Index all entries of placex in the given rank range (inclusive) @@ -137,15 +213,16 @@ class Indexer: LOG.warning("Starting indexing rank (%i to %i) using %i threads", minrank, maxrank, self.num_threads) - for rank in range(max(1, minrank), maxrank): - self._index(runners.RankRunner(rank)) + with self.tokenizer.name_analyzer() as analyzer: + for rank in range(max(1, minrank), maxrank): + self._index(runners.RankRunner(rank, analyzer)) - if maxrank == 30: - self._index(runners.RankRunner(0)) - self._index(runners.InterpolationRunner(), 20) - self._index(runners.RankRunner(30), 20) - else: - self._index(runners.RankRunner(maxrank)) + if maxrank == 30: + self._index(runners.RankRunner(0, analyzer)) + self._index(runners.InterpolationRunner(analyzer), 20) + self._index(runners.RankRunner(30, analyzer), 20) + else: + self._index(runners.RankRunner(maxrank, analyzer)) def index_postcodes(self): @@ -173,6 +250,7 @@ class Indexer: LOG.warning("Starting %s (using batch size %s)", runner.name(), batch) with connect(self.dsn) as conn: + psycopg2.extras.register_hstore(conn) with conn.cursor() as cur: total_tuples = cur.scalar(runner.sql_count_objects()) LOG.debug("Total number of rows: %i", total_tuples) @@ -185,19 +263,24 @@ class Indexer: with conn.cursor(name='places') as cur: cur.execute(runner.sql_get_objects()) - with WorkerPool(self.dsn, self.num_threads) as pool: - while True: - places = 
[p[0] for p in cur.fetchmany(batch)] - if not places: - break + with PlaceFetcher(self.dsn, conn) as fetcher: + with WorkerPool(self.dsn, self.num_threads) as pool: + has_more = fetcher.fetch_next_batch(cur, runner) + while has_more: + places = fetcher.get_batch() - LOG.debug("Processing places: %s", str(places)) - worker = pool.next_free_worker() + # asynchronously get the next batch + has_more = fetcher.fetch_next_batch(cur, runner) - worker.perform(runner.sql_index_place(places)) - progress.add(len(places)) + # And insert the current batch + for idx in range(0, len(places), batch): + part = places[idx:idx+batch] + LOG.debug("Processing places: %s", str(part)) + runner.index_places(pool.next_free_worker(), part) + progress.add(len(part)) - pool.finish_all() + LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs", + fetcher.wait_time, pool.wait_time) conn.commit()