nominatim/tools/tiger_data.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Functions for importing tiger data and handling tarball and directory files
   9 """
  10 import csv
  11 import io
  12 import logging
  13 import os
  14 import tarfile
  15
  16 from nominatim.db.connection import connect
  17 from nominatim.db.async_connection import WorkerPool
  18 from nominatim.db.sql_preprocessor import SQLPreprocessor
  19 from nominatim.errors import UsageError
  20 from nominatim.indexer.place_info import PlaceInfo
  21
  22 LOG = logging.getLogger()
  23
  24 class TigerInput:
  25     """ Context manager that goes through Tiger input files which may
  26         either be in a directory or gzipped together in a tar file.
  27     """
  28
  29     def __init__(self, data_dir):
  30         self.tar_handle = None
  31         self.files = []
  32
  33         if data_dir.endswith('.tar.gz'):
  34             try:
  35                 self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with
  36             except tarfile.ReadError as err:
  37                 LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
  38                 raise UsageError("Cannot open Tiger data file.") from err
  39
  40             self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')]
  41             LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir)
  42         else:
  43             files = os.listdir(data_dir)
  44             self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')]
  45             LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir)
  46
  47         if not self.files:
  48             LOG.warning("Tiger data import selected but no files found at %s", data_dir)
  49
  50
  51     def __enter__(self):
  52         return self
  53
  54
  55     def __exit__(self, exc_type, exc_val, exc_tb):
  56         if self.tar_handle:
  57             self.tar_handle.close()
  58             self.tar_handle = None
  59
  60
  61     def next_file(self):
  62         """ Return a file handle to the next file to be processed.
  63             Raises an IndexError if there is no file left.
  64         """
  65         fname = self.files.pop(0)
  66
  67         if self.tar_handle is not None:
  68             return io.TextIOWrapper(self.tar_handle.extractfile(fname))
  69
  70         return open(fname, encoding='utf-8')
  71
  72
  73     def __len__(self):
  74         return len(self.files)
  75
  76
  77 def handle_threaded_sql_statements(pool, fd, analyzer):
  78     """ Handles sql statement with multiplexing
  79     """
  80     lines = 0
  81     # Using pool of database connections to execute sql statements
  82
  83     sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)"
  84
  85     for row in csv.DictReader(fd, delimiter=';'):
  86         try:
  87             address = dict(street=row['street'], postcode=row['postcode'])
  88             args = ('SRID=4326;' + row['geometry'],
  89                     int(row['from']), int(row['to']), row['interpolation'],
  90                     PlaceInfo({'address': address}).analyze(analyzer),
  91                     analyzer.normalize_postcode(row['postcode']))
  92         except ValueError:
  93             continue
  94         pool.next_free_worker().perform(sql, args=args)
  95
  96         lines += 1
  97         if lines == 1000:
  98             print('.', end='', flush=True)
  99             lines = 0
 100
 101
 102 def add_tiger_data(data_dir, config, threads, tokenizer):
 103     """ Import tiger data from directory or tar file `data dir`.
 104     """
 105     dsn = config.get_libpq_dsn()
 106
 107     with TigerInput(data_dir) as tar:
 108         if not tar:
 109             return
 110
 111         with connect(dsn) as conn:
 112             sql = SQLPreprocessor(conn, config)
 113             sql.run_sql_file(conn, 'tiger_import_start.sql')
 114
 115         # Reading files and then for each file line handling
 116         # sql_query in <threads - 1> chunks.
 117         place_threads = max(1, threads - 1)
 118
 119         with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
 120             with tokenizer.name_analyzer() as analyzer:
 121                 while tar:
 122                     with tar.next_file() as fd:
 123                         handle_threaded_sql_statements(pool, fd, analyzer)
 124
 125         print('\n')
 126
 127     LOG.warning("Creating indexes on Tiger data")
 128     with connect(dsn) as conn:
 129         sql = SQLPreprocessor(conn, config)
 130         sql.run_sql_file(conn, 'tiger_import_finish.sql')