X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/26a81654a87ffe772da683cc7a1f29e47c00c7eb..6ce6f62b8ef7cec8b5950293516845e319dd5f06:/nominatim/indexer/indexer.py diff --git a/nominatim/indexer/indexer.py b/nominatim/indexer/indexer.py index aa1fb8ef..d685e83a 100644 --- a/nominatim/indexer/indexer.py +++ b/nominatim/indexer/indexer.py @@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database. """ import logging import select +import time + +import psycopg2.extras from nominatim.indexer.progress import ProgressLogger from nominatim.indexer import runners @@ -11,25 +14,78 @@ from nominatim.db.connection import connect LOG = logging.getLogger() +class WorkerPool: + """ A pool of asynchronous database connections. -class Indexer: - """ Main indexing routine. + The pool may be used as a context manager. """ + REOPEN_CONNECTIONS_AFTER = 100000 - def __init__(self, dsn, num_threads): - self.dsn = dsn - self.num_threads = num_threads - self.threads = [] + def __init__(self, dsn, pool_size): + self.threads = [DBConnection(dsn) for _ in range(pool_size)] + self.free_workers = self._yield_free_worker() - def _setup_connections(self): - self.threads = [DBConnection(self.dsn) for _ in range(self.num_threads)] + def finish_all(self): + """ Wait for all connections to finish. + """ + for thread in self.threads: + while not thread.is_done(): + thread.wait() + self.free_workers = self._yield_free_worker() - def _close_connections(self): + def close(self): + """ Close all connections and clear the pool. + """ for thread in self.threads: thread.close() self.threads = [] + self.free_workers = None + + + def next_free_worker(self): + """ Get the next free connection. 
+ """ + return next(self.free_workers) + + + def _yield_free_worker(self): + ready = self.threads + command_stat = 0 + while True: + for thread in ready: + if thread.is_done(): + command_stat += 1 + yield thread + + if command_stat > self.REOPEN_CONNECTIONS_AFTER: + for thread in self.threads: + while not thread.is_done(): + thread.wait() + thread.connect() + ready = self.threads + command_stat = 0 + else: + _, ready, _ = select.select([], self.threads, []) + + + def __enter__(self): + return self + + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + +class Indexer: + """ Main indexing routine. + """ + + def __init__(self, dsn, tokenizer, num_threads): + self.dsn = dsn + self.tokenizer = tokenizer + self.num_threads = num_threads def index_full(self, analyse=True): @@ -42,27 +98,27 @@ class Indexer: conn.autocommit = True if analyse: - def _analyse(): + def _analyze(): with conn.cursor() as cur: - cur.execute('ANALYSE') + cur.execute('ANALYZE') else: - def _analyse(): + def _analyze(): pass self.index_by_rank(0, 4) - _analyse() + _analyze() self.index_boundaries(0, 30) - _analyse() + _analyze() self.index_by_rank(5, 25) - _analyse() + _analyze() self.index_by_rank(26, 30) - _analyse() + _analyze() self.index_postcodes() - _analyse() + _analyze() def index_boundaries(self, minrank, maxrank): @@ -71,13 +127,9 @@ class Indexer: LOG.warning("Starting indexing boundaries using %s threads", self.num_threads) - self._setup_connections() - - try: + with self.tokenizer.name_analyzer() as analyzer: for rank in range(max(minrank, 4), min(maxrank, 26)): - self._index(runners.BoundaryRunner(rank)) - finally: - self._close_connections() + self._index(runners.BoundaryRunner(rank, analyzer)) def index_by_rank(self, minrank, maxrank): """ Index all entries of placex in the given rank range (inclusive) @@ -90,20 +142,16 @@ class Indexer: LOG.warning("Starting indexing rank (%i to %i) using %i threads", minrank, maxrank, self.num_threads) - 
self._setup_connections() - - try: + with self.tokenizer.name_analyzer() as analyzer: for rank in range(max(1, minrank), maxrank): - self._index(runners.RankRunner(rank)) + self._index(runners.RankRunner(rank, analyzer)) if maxrank == 30: - self._index(runners.RankRunner(0)) - self._index(runners.InterpolationRunner(), 20) - self._index(runners.RankRunner(30), 20) + self._index(runners.RankRunner(0, analyzer)) + self._index(runners.InterpolationRunner(analyzer), 20) + self._index(runners.RankRunner(30, analyzer), 20) else: - self._index(runners.RankRunner(maxrank)) - finally: - self._close_connections() + self._index(runners.RankRunner(maxrank, analyzer)) def index_postcodes(self): @@ -111,12 +159,8 @@ class Indexer: """ LOG.warning("Starting indexing postcodes using %s threads", self.num_threads) - self._setup_connections() + self._index(runners.PostcodeRunner(), 20) - try: - self._index(runners.PostcodeRunner(), 20) - finally: - self._close_connections() def update_status_table(self): """ Update the status in the status table to 'indexed'. @@ -135,60 +179,75 @@ class Indexer: LOG.warning("Starting %s (using batch size %s)", runner.name(), batch) with connect(self.dsn) as conn: + psycopg2.extras.register_hstore(conn) with conn.cursor() as cur: total_tuples = cur.scalar(runner.sql_count_objects()) LOG.debug("Total number of rows: %i", total_tuples) + # need to fetch those manually because register_hstore cannot + # fetch them on an asynchronous connection below. 
+ hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid") + hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid") + conn.commit() progress = ProgressLogger(runner.name(), total_tuples) + fetcher_wait = 0 + pool_wait = 0 + if total_tuples > 0: with conn.cursor(name='places') as cur: cur.execute(runner.sql_get_objects()) - next_thread = self.find_free_thread() - while True: - places = [p[0] for p in cur.fetchmany(batch)] - if not places: - break + fetcher = DBConnection(self.dsn, cursor_factory=psycopg2.extras.DictCursor) + psycopg2.extras.register_hstore(fetcher.conn, + oid=hstore_oid, + array_oid=hstore_array_oid) - LOG.debug("Processing places: %s", str(places)) - thread = next(next_thread) + with WorkerPool(self.dsn, self.num_threads) as pool: + places = self._fetch_next_batch(cur, fetcher, runner) + while places is not None: + if not places: + t0 = time.time() + fetcher.wait() + fetcher_wait += time.time() - t0 + places = fetcher.cursor.fetchall() - thread.perform(runner.sql_index_place(places)) - progress.add(len(places)) + # asynchronously get the next batch + next_places = self._fetch_next_batch(cur, fetcher, runner) - conn.commit() + # And insert the current batch + for idx in range(0, len(places), batch): + t0 = time.time() + worker = pool.next_free_worker() + pool_wait += time.time() - t0 + part = places[idx:idx+batch] + LOG.debug("Processing places: %s", str(part)) + runner.index_places(worker, part) + progress.add(len(part)) - for thread in self.threads: - thread.wait() + places = next_places + + pool.finish_all() + + fetcher.wait() + fetcher.close() + + conn.commit() progress.done() + LOG.warning("Wait time: fetcher: {}s, pool: {}s".format(fetcher_wait, pool_wait)) - def find_free_thread(self): - """ Generator that returns the next connection that is free for - sending a query. 
- """ - ready = self.threads - command_stat = 0 - while True: - for thread in ready: - if thread.is_done(): - command_stat += 1 - yield thread + def _fetch_next_batch(self, cur, fetcher, runner): + ids = cur.fetchmany(100) - # refresh the connections occasionaly to avoid potential - # memory leaks in Postgresql. - if command_stat > 100000: - for thread in self.threads: - while not thread.is_done(): - thread.wait() - thread.connect() - command_stat = 0 - ready = self.threads - else: - ready, _, _ = select.select(self.threads, [], []) + if not ids: + return None + + if not hasattr(runner, 'get_place_details'): + return ids - assert False, "Unreachable code" + runner.get_place_details(fetcher, ids) + return []