# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'refresh' subcommand.
"""
from typing import Tuple, Optional
import argparse
import logging
from pathlib import Path

import asyncio

from ..config import Configuration
from ..db.connection import connect, table_exists
from ..tokenizer.base import AbstractTokenizer
from .args import NominatimArgs
21 # Do not repeat documentation of subcommand classes.
22 # pylint: disable=C0111
23 # Using non-top-level imports to avoid eventually unused imports.
24 # pylint: disable=E0012,C0415
26 LOG = logging.getLogger()
28 def _parse_osm_object(obj: str) -> Tuple[str, int]:
29 """ Parse the given argument into a tuple of OSM type and ID.
30 Raises an ArgumentError if the format is not recognized.
32 if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
33 raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
35 return (obj[0].upper(), int(obj[1:]))
class UpdateRefresh:
    """\
    Recompute auxiliary data used by the indexing process.

    This sub-commands updates various static data and functions in the database.
    It usually needs to be run after changing various aspects of the
    configuration. The configuration documentation will mention the exact
    command to use in such case.

    Warning: the 'update' command must not be run in parallel with other update
    commands like 'replication' or 'add-data'.
    """
50 def __init__(self) -> None:
51 self.tokenizer: Optional[AbstractTokenizer] = None
53 def add_args(self, parser: argparse.ArgumentParser) -> None:
54 group = parser.add_argument_group('Data arguments')
55 group.add_argument('--postcodes', action='store_true',
56 help='Update postcode centroid table')
57 group.add_argument('--word-tokens', action='store_true',
58 help='Clean up search terms')
59 group.add_argument('--word-counts', action='store_true',
60 help='Compute frequency of full-word search terms')
61 group.add_argument('--address-levels', action='store_true',
62 help='Reimport address level configuration')
63 group.add_argument('--functions', action='store_true',
64 help='Update the PL/pgSQL functions in the database')
65 group.add_argument('--wiki-data', action='store_true',
66 help='Update Wikipedia/data importance numbers')
67 group.add_argument('--secondary-importance', action='store_true',
68 help='Update secondary importance raster data')
69 group.add_argument('--importance', action='store_true',
70 help='Recompute place importances (expensive!)')
71 group.add_argument('--website', action='store_true',
72 help='Refresh the directory that serves the scripts for the web API')
73 group.add_argument('--data-object', action='append',
74 type=_parse_osm_object, metavar='OBJECT',
75 help='Mark the given OSM object as requiring an update'
76 ' (format: [NWR]<id>)')
77 group.add_argument('--data-area', action='append',
78 type=_parse_osm_object, metavar='OBJECT',
79 help='Mark the area around the given OSM object as requiring an update'
80 ' (format: [NWR]<id>)')
82 group = parser.add_argument_group('Arguments for function refresh')
83 group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
84 help='Do not enable code for propagating updates')
85 group.add_argument('--enable-debug-statements', action='store_true',
86 help='Enable debug warning statements in functions')
89 def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
90 from ..tools import refresh, postcodes
91 from ..indexer.indexer import Indexer
93 need_function_refresh = args.functions
96 if postcodes.can_compute(args.config.get_libpq_dsn()):
97 LOG.warning("Update postcodes centroid")
98 tokenizer = self._get_tokenizer(args.config)
99 postcodes.update_postcodes(args.config.get_libpq_dsn(),
100 args.project_dir, tokenizer)
101 indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
103 asyncio.run(indexer.index_postcodes())
105 LOG.error("The place table doesn't exist. "
106 "Postcode updates on a frozen database is not possible.")
109 LOG.warning('Updating word tokens')
110 tokenizer = self._get_tokenizer(args.config)
111 tokenizer.update_word_tokens()
114 LOG.warning('Recompute word statistics')
115 self._get_tokenizer(args.config).update_statistics(args.config,
116 threads=args.threads or 1)
118 if args.address_levels:
119 LOG.warning('Updating address levels')
120 with connect(args.config.get_libpq_dsn()) as conn:
121 refresh.load_address_levels_from_config(conn, args.config)
123 # Attention: must come BEFORE functions
124 if args.secondary_importance:
125 with connect(args.config.get_libpq_dsn()) as conn:
126 # If the table did not exist before, then the importance code
127 # needs to be enabled.
128 if not table_exists(conn, 'secondary_importance'):
129 args.functions = True
131 LOG.warning('Import secondary importance raster data from %s', args.project_dir)
132 if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
133 args.project_dir) > 0:
134 LOG.fatal('FATAL: Cannot update secondary importance raster data')
136 need_function_refresh = True
139 data_path = Path(args.config.WIKIPEDIA_DATA_PATH
141 LOG.warning('Import wikipedia article importance from %s', data_path)
142 if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
144 LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
146 need_function_refresh = True
148 if need_function_refresh:
149 LOG.warning('Create functions')
150 with connect(args.config.get_libpq_dsn()) as conn:
151 refresh.create_functions(conn, args.config,
152 args.diffs, args.enable_debug_statements)
153 self._get_tokenizer(args.config).update_sql_functions(args.config)
155 # Attention: importance MUST come after wiki data import and after functions.
157 LOG.warning('Update importance values for database')
158 with connect(args.config.get_libpq_dsn()) as conn:
159 refresh.recompute_importance(conn)
162 webdir = args.project_dir / 'website'
163 LOG.warning('Setting up website directory at %s', webdir)
164 # This is a little bit hacky: call the tokenizer setup, so that
165 # the tokenizer directory gets repopulated as well, in case it
167 self._get_tokenizer(args.config)
168 with connect(args.config.get_libpq_dsn()) as conn:
169 refresh.setup_website(webdir, args.config, conn)
171 if args.data_object or args.data_area:
172 with connect(args.config.get_libpq_dsn()) as conn:
173 for obj in args.data_object or []:
174 refresh.invalidate_osm_object(*obj, conn, recursive=False)
175 for obj in args.data_area or []:
176 refresh.invalidate_osm_object(*obj, conn, recursive=True)
182 def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
183 if self.tokenizer is None:
184 from ..tokenizer import factory as tokenizer_factory
186 self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
188 return self.tokenizer