nominatim/clicmd/refresh.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Implementation of 'refresh' subcommand.
   9 """
  10 from typing import Tuple, Optional
  11 import argparse
  12 import logging
  13 from pathlib import Path
  14
  15 from nominatim.config import Configuration
  16 from nominatim.db.connection import connect
  17 from nominatim.tokenizer.base import AbstractTokenizer
  18 from nominatim.clicmd.args import NominatimArgs
  19
  20 # Do not repeat documentation of subcommand classes.
  21 # pylint: disable=C0111
# Using non-top-level imports to avoid importing modules that may not be used.
  23 # pylint: disable=E0012,C0415
  24
# Module-wide logger. getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  26
  27 def _parse_osm_object(obj: str) -> Tuple[str, int]:
  28     """ Parse the given argument into a tuple of OSM type and ID.
  29         Raises an ArgumentError if the format is not recognized.
  30     """
  31     if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
  32         raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
  33
  34     return (obj[0].upper(), int(obj[1:]))
  35
  36
  37 class UpdateRefresh:
  38     """\
  39     Recompute auxiliary data used by the indexing process.
  40
  41     This sub-commands updates various static data and functions in the database.
  42     It usually needs to be run after changing various aspects of the
  43     configuration. The configuration documentation will mention the exact
  44     command to use in such case.
  45
  46     Warning: the 'update' command must not be run in parallel with other update
  47              commands like 'replication' or 'add-data'.
  48     """
  49     def __init__(self) -> None:
  50         self.tokenizer: Optional[AbstractTokenizer] = None
  51
  52     def add_args(self, parser: argparse.ArgumentParser) -> None:
  53         group = parser.add_argument_group('Data arguments')
  54         group.add_argument('--postcodes', action='store_true',
  55                            help='Update postcode centroid table')
  56         group.add_argument('--word-tokens', action='store_true',
  57                            help='Clean up search terms')
  58         group.add_argument('--word-counts', action='store_true',
  59                            help='Compute frequency of full-word search terms')
  60         group.add_argument('--address-levels', action='store_true',
  61                            help='Reimport address level configuration')
  62         group.add_argument('--functions', action='store_true',
  63                            help='Update the PL/pgSQL functions in the database')
  64         group.add_argument('--wiki-data', action='store_true',
  65                            help='Update Wikipedia/data importance numbers')
  66         group.add_argument('--importance', action='store_true',
  67                            help='Recompute place importances (expensive!)')
  68         group.add_argument('--website', action='store_true',
  69                            help='Refresh the directory that serves the scripts for the web API')
  70         group.add_argument('--data-object', action='append',
  71                            type=_parse_osm_object, metavar='OBJECT',
  72                            help='Mark the given OSM object as requiring an update'
  73                                 ' (format: [NWR]<id>)')
  74         group.add_argument('--data-area', action='append',
  75                            type=_parse_osm_object, metavar='OBJECT',
  76                            help='Mark the area around the given OSM object as requiring an update'
  77                                 ' (format: [NWR]<id>)')
  78
  79         group = parser.add_argument_group('Arguments for function refresh')
  80         group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
  81                            help='Do not enable code for propagating updates')
  82         group.add_argument('--enable-debug-statements', action='store_true',
  83                            help='Enable debug warning statements in functions')
  84
  85
  86     def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches
  87         from ..tools import refresh, postcodes
  88         from ..indexer.indexer import Indexer
  89
  90
  91         if args.postcodes:
  92             if postcodes.can_compute(args.config.get_libpq_dsn()):
  93                 LOG.warning("Update postcodes centroid")
  94                 tokenizer = self._get_tokenizer(args.config)
  95                 postcodes.update_postcodes(args.config.get_libpq_dsn(),
  96                                            args.project_dir, tokenizer)
  97                 indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
  98                                   args.threads or 1)
  99                 indexer.index_postcodes()
 100             else:
 101                 LOG.error("The place table doesn't exist. "
 102                           "Postcode updates on a frozen database is not possible.")
 103
 104         if args.word_tokens:
 105             LOG.warning('Updating word tokens')
 106             tokenizer = self._get_tokenizer(args.config)
 107             tokenizer.update_word_tokens()
 108
 109         if args.word_counts:
 110             LOG.warning('Recompute word statistics')
 111             self._get_tokenizer(args.config).update_statistics()
 112
 113         if args.address_levels:
 114             LOG.warning('Updating address levels')
 115             with connect(args.config.get_libpq_dsn()) as conn:
 116                 refresh.load_address_levels_from_config(conn, args.config)
 117
 118         if args.functions:
 119             LOG.warning('Create functions')
 120             with connect(args.config.get_libpq_dsn()) as conn:
 121                 refresh.create_functions(conn, args.config,
 122                                          args.diffs, args.enable_debug_statements)
 123                 self._get_tokenizer(args.config).update_sql_functions(args.config)
 124
 125         if args.wiki_data:
 126             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
 127                              or args.project_dir)
 128             LOG.warning('Import wikipdia article importance from %s', data_path)
 129             if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
 130                                                  data_path) > 0:
 131                 LOG.fatal('FATAL: Wikipedia importance dump file not found')
 132                 return 1
 133
 134         # Attention: importance MUST come after wiki data import.
 135         if args.importance:
 136             LOG.warning('Update importance values for database')
 137             with connect(args.config.get_libpq_dsn()) as conn:
 138                 refresh.recompute_importance(conn)
 139
 140         if args.website:
 141             webdir = args.project_dir / 'website'
 142             LOG.warning('Setting up website directory at %s', webdir)
 143             # This is a little bit hacky: call the tokenizer setup, so that
 144             # the tokenizer directory gets repopulated as well, in case it
 145             # wasn't there yet.
 146             self._get_tokenizer(args.config)
 147             with connect(args.config.get_libpq_dsn()) as conn:
 148                 refresh.setup_website(webdir, args.config, conn)
 149
 150         if args.data_object or args.data_area:
 151             with connect(args.config.get_libpq_dsn()) as conn:
 152                 for obj in args.data_object or []:
 153                     refresh.invalidate_osm_object(*obj, conn, recursive=False)
 154                 for obj in args.data_area or []:
 155                     refresh.invalidate_osm_object(*obj, conn, recursive=True)
 156                 conn.commit()
 157
 158         return 0
 159
 160
 161     def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
 162         if self.tokenizer is None:
 163             from ..tokenizer import factory as tokenizer_factory
 164
 165             self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
 166
 167         return self.tokenizer