From 20891abe1c0f1e07a160d13a9bc044e05da8ee8a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 30 Apr 2021 16:17:28 +0200 Subject: [PATCH] indexer: fetch extra place data asynchronously The indexer now fetches any extra data besides the place_id asynchronously while processing the places from the last batch. This also means that more places are now fetched at once. --- nominatim/indexer/indexer.py | 138 ++++++++++++++++++++++------------- nominatim/indexer/runners.py | 12 ++- 2 files changed, 96 insertions(+), 54 deletions(-) diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index d685e83a..b7673aba 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -14,6 +14,73 @@ from nominatim.db.connection import connect LOG = logging.getLogger() + +class PlaceFetcher: + """ Asynchronous connection that fetches place details for processing. + """ + def __init__(self, dsn, setup_conn): + self.wait_time = 0 + self.current_ids = None + self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor) + + with setup_conn.cursor() as cur: + # need to fetch those manually because register_hstore cannot + # fetch them on an asynchronous connection below. + hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid") + hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid") + + psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid, + array_oid=hstore_array_oid) + + def close(self): + """ Close the underlying asynchronous connection. + """ + if self.conn: + self.conn.close() + self.conn = None + + + def fetch_next_batch(self, cur, runner): + """ Send a request for the next batch of places. + If details for the places are required, they will be fetched + asynchronously. + + Returns true if there is still data available. + """ + ids = cur.fetchmany(100) + + if not ids: + self.current_ids = None + return False + + if hasattr(runner, 'get_place_details'): + runner.get_place_details(self.conn, ids) + self.current_ids = [] + else: + self.current_ids = ids + + return True + + def get_batch(self): + """ Get the next batch of data, previously requested with + `fetch_next_batch`. + """ + if self.current_ids is not None and not self.current_ids: + tstart = time.time() + self.conn.wait() + self.wait_time += time.time() - tstart + self.current_ids = self.conn.cursor.fetchall() + + return self.current_ids + + def __enter__(self): + return self + + + def __exit__(self, exc_type, exc_value, traceback): + self.conn.wait() + self.close() + class WorkerPool: """ A pool of asynchronous database connections. @@ -24,6 +91,7 @@ class WorkerPool: def __init__(self, dsn, pool_size): self.threads = [DBConnection(dsn) for _ in range(pool_size)] self.free_workers = self._yield_free_worker() + self.wait_time = 0 def finish_all(self): @@ -67,7 +135,9 @@ class WorkerPool: ready = self.threads command_stat = 0 else: + tstart = time.time() _, ready, _ = select.select([], self.threads, []) + self.wait_time += time.time() - tstart def __enter__(self): @@ -75,6 +145,7 @@ class WorkerPool: def __exit__(self, exc_type, exc_value, traceback): + self.finish_all() self.close() @@ -184,70 +255,33 @@ class Indexer: total_tuples = cur.scalar(runner.sql_count_objects()) LOG.debug("Total number of rows: %i", total_tuples) - # need to fetch those manually because register_hstore cannot - # fetch them on an asynchronous connection below. - hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid") - hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid") - conn.commit() progress = ProgressLogger(runner.name(), total_tuples) - fetcher_wait = 0 - pool_wait = 0 - if total_tuples > 0: with conn.cursor(name='places') as cur: cur.execute(runner.sql_get_objects()) - fetcher = DBConnection(self.dsn, cursor_factory=psycopg2.extras.DictCursor) - psycopg2.extras.register_hstore(fetcher.conn, - oid=hstore_oid, - array_oid=hstore_array_oid) - - with WorkerPool(self.dsn, self.num_threads) as pool: - places = self._fetch_next_batch(cur, fetcher, runner) - while places is not None: - if not places: - t0 = time.time() - fetcher.wait() - fetcher_wait += time.time() - t0 - places = fetcher.cursor.fetchall() + with PlaceFetcher(self.dsn, conn) as fetcher: + with WorkerPool(self.dsn, self.num_threads) as pool: + has_more = fetcher.fetch_next_batch(cur, runner) + while has_more: + places = fetcher.get_batch() - # asynchronously get the next batch - next_places = self._fetch_next_batch(cur, fetcher, runner) + # asynchronously get the next batch + has_more = fetcher.fetch_next_batch(cur, runner) - # And insert the curent batch - for idx in range(0, len(places), batch): - t0 = time.time() - worker = pool.next_free_worker() - pool_wait += time.time() - t0 - part = places[idx:idx+batch] - LOG.debug("Processing places: %s", str(part)) - runner.index_places(worker, part) - progress.add(len(part)) + # And insert the curent batch + for idx in range(0, len(places), batch): + part = places[idx:idx+batch] + LOG.debug("Processing places: %s", str(part)) + runner.index_places(pool.next_free_worker(), part) + progress.add(len(part)) - places = next_places - - pool.finish_all() - - fetcher.wait() - fetcher.close() + LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs", + fetcher.wait_time, pool.wait_time) conn.commit() progress.done() - LOG.warning("Wait time: fetcher: {}s, pool: {}s".format(fetcher_wait, pool_wait)) - - - def _fetch_next_batch(self, cur, fetcher, runner): - ids = cur.fetchmany(100) - - if not ids: - return None - - if not hasattr(runner, 'get_place_details'): - return ids - - runner.get_place_details(fetcher, ids) - return [] diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 459f8004..aa607faa 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -28,7 +28,8 @@ class AbstractPlacexRunner: """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places)) - def get_place_details(self, worker, ids): + @staticmethod + def get_place_details(worker, ids): worker.perform("""SELECT place_id, (placex_prepare_update(placex)).* FROM placex WHERE place_id IN %s""", (tuple((p[0] for p in ids)), )) @@ -103,12 +104,19 @@ class InterpolationRunner: @staticmethod def sql_get_objects(): - return """SELECT place_id, get_interpolation_address(address, osm_id) as address + return """SELECT place_id FROM location_property_osmline WHERE indexed_status > 0 ORDER BY geometry_sector""" + @staticmethod + def get_place_details(worker, ids): + worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address + FROM location_property_osmline WHERE place_id IN %s""", + (tuple((p[0] for p in ids)), )) + + @staticmethod @functools.lru_cache(maxsize=1) def _index_sql(num_places): -- 2.39.5