1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
"""
Implementation of the 'import' subcommand.
"""
import logging
from pathlib import Path

import psutil

from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.version import version_str
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
# pylint: disable=C0415

# Module-wide logger (root logger, per the convention used in this file).
LOG = logging.getLogger()
"""
Create a new Nominatim database from an OSM file.

This sub-command sets up a new Nominatim database from scratch starting
with creating a new database in PostgreSQL. The user running this command
needs superuser rights on the database.
"""
# NOTE(review): the enclosing 'def add_args(parser):' header (and any
# decorator) appears to be missing from this extract; the statements
# below define the CLI options of the 'import' subcommand.

# Required: exactly one of --osm-file / --continue must be supplied.
group_name = parser.add_argument_group('Required arguments')
group = group_name.add_mutually_exclusive_group(required=True)
group.add_argument('--osm-file', metavar='FILE', action='append',
                   help='OSM file to be imported'
                        ' (repeat for importing multiple files)')
group.add_argument('--continue', dest='continue_at',
                   choices=['load-data', 'indexing', 'db-postprocess'],
                   help='Continue an import that was interrupted')
# Optional tuning of what gets imported and how.
group = parser.add_argument_group('Optional arguments')
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                   help='Size of cache to be used by osm2pgsql (in MB)')
group.add_argument('--reverse-only', action='store_true',
                   help='Do not create tables and indexes for searching')
group.add_argument('--no-partitions', action='store_true',
                   help=("Do not partition search indices "
                         "(speeds up import of single country extracts)"))
group.add_argument('--no-updates', action='store_true',
                   help="Do not keep tables that are only needed for "
                        "updating the database later")
group.add_argument('--offline', action='store_true',
                   help="Do not attempt to load any additional data from the internet")
# Options that trade safety for speed; expert use only.
group = parser.add_argument_group('Expert options')
group.add_argument('--ignore-errors', action='store_true',
                   help='Continue import even when errors in SQL are present')
group.add_argument('--index-noanalyse', action='store_true',
                   help='Do not perform analyse operations during index (expert only)')
def run(args): # pylint: disable=too-many-statements
    """ Carry out a full import: database creation, OSM data import,
        tokenization, indexing and final post-processing.

        NOTE(review): several continuation lines of multi-line calls look
        truncated in this extract (setup_country_tables,
        import_wikipedia_articles); verify the full argument lists against
        the original source.
    """
    # Function-local imports on purpose (see pylint C0415 note at module top).
    from ..data import country_info
    from ..tools import database_import, refresh, postcodes, freeze
    from ..indexer.indexer import Indexer

    country_info.setup_country_config(args.config)

    # Stage 1: only on a fresh import - create the database and load OSM data.
    if args.continue_at is None:
        files = args.get_osm_file_list()

        LOG.warning('Creating database')
        database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                rouser=args.config.DATABASE_WEBUSER)

        LOG.warning('Setting up country tables')
        # NOTE(review): the remaining arguments of this call appear to be
        # missing from this extract (the opening parenthesis is never closed).
        country_info.setup_country_tables(args.config.get_libpq_dsn(),

        LOG.warning('Importing OSM data file')
        database_import.import_osm_data(files,
                                        args.osm2pgsql_options(0, 1),
                                        ignore_errors=args.ignore_errors)

        SetupAll._setup_tables(args.config, args.reverse_only)

        LOG.warning('Importing wikipedia importance data')
        data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
        # NOTE(review): closing arguments/condition of this call appear
        # truncated. A missing importance dump is not fatal - defaults apply.
        if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
            LOG.error('Wikipedia importance dump file not found. '
                      'Will be using default importances.')

    # Stage 2: (re)load data into the placex table.
    if args.continue_at is None or args.continue_at == 'load-data':
        LOG.warning('Initialise tables')
        with connect(args.config.get_libpq_dsn()) as conn:
            database_import.truncate_data_tables(conn)

        LOG.warning('Load data into placex table')
        database_import.load_data(args.config.get_libpq_dsn(),
                                  args.threads or psutil.cpu_count() or 1)

    LOG.warning("Setting up tokenizer")
    tokenizer = SetupAll._get_tokenizer(args.continue_at, args.config)

    if args.continue_at is None or args.continue_at == 'load-data':
        LOG.warning('Calculate postcodes')
        postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                   args.project_dir, tokenizer)

    # Stage 3: indexing. When resuming mid-indexing, first recreate the
    # helper index that speeds up finding still-unindexed places.
    if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
        if args.continue_at is not None and args.continue_at != 'load-data':
            with connect(args.config.get_libpq_dsn()) as conn:
                SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
        LOG.warning('Indexing places')
        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                          args.threads or psutil.cpu_count() or 1)
        indexer.index_full(analyse=not args.index_noanalyse)

    # Stage 4: post-processing - search indices, country names, freezing.
    LOG.warning('Post-process tables')
    with connect(args.config.get_libpq_dsn()) as conn:
        database_import.create_search_indices(conn, args.config,
                                              drop=args.no_updates)
        LOG.warning('Create search index for default country names.')
        country_info.create_country_names(conn, tokenizer,
                                          args.config.get_str_list('LANGUAGES'))
        # NOTE(review): this call presumably should only run for --no-updates
        # imports; a guarding 'if' looks dropped in this extract - confirm.
        freeze.drop_update_tables(conn)
    tokenizer.finalize_import(args.config)

    LOG.warning('Recompute word counts')
    tokenizer.update_statistics()

    # Generate the PHP website scripts in the project directory.
    webdir = args.project_dir / 'website'
    LOG.warning('Setup website at %s', webdir)
    with connect(args.config.get_libpq_dsn()) as conn:
        refresh.setup_website(webdir, args.config, conn)

    SetupAll._finalize_database(args.config.get_libpq_dsn(), args.offline)
def _setup_tables(config, reverse_only):
    """ Set up the basic database layout: tables, indexes and functions.

        config: project configuration (provides the libpq DSN).
        reverse_only: when True, skip tables only needed for searching.
    """
    from ..tools import database_import, refresh

    with connect(config.get_libpq_dsn()) as conn:
        # SQL functions are (re)created after each structural step,
        # three passes in total.
        LOG.warning('Create functions (1st pass)')
        refresh.create_functions(conn, config, False, False)
        LOG.warning('Create tables')
        database_import.create_tables(conn, config, reverse_only=reverse_only)
        refresh.load_address_levels_from_config(conn, config)
        LOG.warning('Create functions (2nd pass)')
        refresh.create_functions(conn, config, False, False)
        LOG.warning('Create table triggers')
        database_import.create_table_triggers(conn, config)
        LOG.warning('Create partition tables')
        database_import.create_partition_tables(conn, config)
        LOG.warning('Create functions (3rd pass)')
        refresh.create_functions(conn, config, False, False)
def _get_tokenizer(continue_at, config):
    """ Return the tokenizer to use for this import.

        For a fresh import (or a restart at the data-loading stage) a
        brand-new tokenizer is created; for all later stages the tokenizer
        that is already set up for the database is loaded instead.
    """
    from ..tokenizer import factory as tokenizer_factory

    if continue_at in (None, 'load-data'):
        # Fresh start (or restart before tokenization): (re)initialise.
        return tokenizer_factory.create_tokenizer(config)

    # Continuing a later stage: reuse the existing tokenizer setup.
    return tokenizer_factory.get_tokenizer_for_db(config)
def _create_pending_index(conn, tablespace):
    """ Add a supporting index for finding places still to be indexed.

        This index is normally created at the end of the import process
        for later updates. When indexing was partially done, then this
        index can greatly improve speed going through already indexed data.

        conn: open database connection.
        tablespace: name of the tablespace for the index; an empty string
                    means the default tablespace.
    """
    # Defect fixed: the guard below had no body (dangling 'if'); bail out
    # early when the index is already there.
    if conn.index_exists('idx_placex_pendingsector'):
        return

    with conn.cursor() as cur:
        LOG.warning('Creating support index')
        # Defect fixed: only emit the TABLESPACE clause when a tablespace
        # was actually configured; otherwise keep the placeholder empty.
        if tablespace:
            tablespace = 'TABLESPACE ' + tablespace
        # Partial index: only rows still waiting to be indexed are covered.
        # (Also fixed: the f-string/statement was left unterminated.)
        cur.execute(f"""CREATE INDEX idx_placex_pendingsector
                        ON placex USING BTREE (rank_address,geometry_sector)
                        {tablespace} WHERE indexed_status > 0
                     """)
    conn.commit()
def _finalize_database(dsn, offline):
    """ Determine the database date and set the status accordingly.

        dsn: libpq DSN of the database to finalise.
        offline: when True, skip computing the database date (it needs
                 access to external data); only the version property is set.
    """
    with connect(dsn) as conn:
        # Defect fixed: the 'offline' parameter was ignored and the
        # 'except' clause had no matching 'try'.
        if not offline:
            try:
                dbdate = status.compute_database_date(conn)
                status.set_status(conn, dbdate)
                LOG.info('Database is at %s.', dbdate)
            except Exception as exc: # pylint: disable=broad-except
                # Best effort only: a missing date must not abort the import.
                LOG.error('Cannot determine date of database: %s', exc)

        properties.set_property(conn, 'database_version', version_str())