src/nominatim_db/tools/refresh.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Functions for bringing auxiliary data in the database up-to-date.
   9 """
  10 from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
  11 import csv
  12 import gzip
  13 import logging
  14 from textwrap import dedent
  15 from pathlib import Path
  16
  17 from psycopg2 import sql as pysql
  18
  19 from ..config import Configuration
  20 from ..db.connection import Connection, connect
  21 from ..db.utils import execute_file, CopyBuffer
  22 from ..db.sql_preprocessor import SQLPreprocessor
  23 from ..version import NOMINATIM_VERSION
  24
  25 LOG = logging.getLogger()
  26
  27 OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
  28
  29 def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
  30                                        entry: Mapping[str, Any]) -> None:
  31     """ Converts a single entry from the JSON format for address rank
  32         descriptions into a flat format suitable for inserting into a
  33         PostgreSQL table and adds these lines to `rows`.
  34     """
  35     countries = entry.get('countries') or (None, )
  36     for key, values in entry['tags'].items():
  37         for value, ranks in values.items():
  38             if isinstance(ranks, list):
  39                 rank_search, rank_address = ranks
  40             else:
  41                 rank_search = rank_address = ranks
  42             if not value:
  43                 value = None
  44             for country in countries:
  45                 rows.append((country, key, value, rank_search, rank_address))
  46
  47
  48 def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
  49     """ Replace the `address_levels` table with the contents of `levels'.
  50
  51         A new table is created any previously existing table is dropped.
  52         The table has the following columns:
  53             country, class, type, rank_search, rank_address
  54     """
  55     rows: List[Tuple[Any, ...]]  = []
  56     for entry in levels:
  57         _add_address_level_rows_from_entry(rows, entry)
  58
  59     with conn.cursor() as cur:
  60         cur.drop_table(table)
  61
  62         cur.execute(pysql.SQL("""CREATE TABLE {} (
  63                                         country_code varchar(2),
  64                                         class TEXT,
  65                                         type TEXT,
  66                                         rank_search SMALLINT,
  67                                         rank_address SMALLINT)
  68                               """).format(pysql.Identifier(table)))
  69
  70         cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
  71                            .format(pysql.Identifier(table)), rows)
  72
  73         cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
  74                     .format(pysql.Identifier(table)))
  75
  76     conn.commit()
  77
  78
  79 def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
  80     """ Replace the `address_levels` table with the content as
  81         defined in the given configuration. Uses the parameter
  82         NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
  83         configuration file.
  84     """
  85     cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
  86     load_address_levels(conn, 'address_levels', cfg)
  87
  88
  89 def create_functions(conn: Connection, config: Configuration,
  90                      enable_diff_updates: bool = True,
  91                      enable_debug: bool = False) -> None:
  92     """ (Re)create the PL/pgSQL functions.
  93     """
  94     sql = SQLPreprocessor(conn, config)
  95
  96     sql.run_sql_file(conn, 'functions.sql',
  97                      disable_diff_updates=not enable_diff_updates,
  98                      debug=enable_debug)
  99
 100
 101
 102 WEBSITE_SCRIPTS = (
 103     'deletable.php',
 104     'details.php',
 105     'lookup.php',
 106     'polygons.php',
 107     'reverse.php',
 108     'search.php',
 109     'status.php'
 110 )
 111
 112 # constants needed by PHP scripts: PHP name, config name, type
 113 PHP_CONST_DEFS = (
 114     ('Database_DSN', 'DATABASE_DSN', str),
 115     ('Default_Language', 'DEFAULT_LANGUAGE', str),
 116     ('Log_DB', 'LOG_DB', bool),
 117     ('Log_File', 'LOG_FILE', Path),
 118     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
 119     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
 120     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
 121     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
 122     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
 123     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
 124     ('MapIcon_URL', 'MAPICON_URL', str),
 125     ('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
 126 )
 127
 128
 129 def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
 130     """ Replaces the wikipedia importance tables with new data.
 131         The import is run in a single transaction so that the new data
 132         is replace seamlessly.
 133
 134         Returns 0 if all was well and 1 if the importance file could not
 135         be found. Throws an exception if there was an error reading the file.
 136     """
 137     if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
 138        or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
 139                                 ignore_errors) == 0:
 140         return 0
 141
 142     return 1
 143
 144
 145 def import_importance_csv(dsn: str, data_file: Path) -> int:
 146     """ Replace wikipedia importance table with data from a
 147         single CSV file.
 148
 149         The file must be a gzipped CSV and have the following columns:
 150         language, title, importance, wikidata_id
 151
 152         Other columns may be present but will be ignored.
 153     """
 154     if not data_file.exists():
 155         return 1
 156
 157     # Only import the first occurence of a wikidata ID.
 158     # This keeps indexes and table small.
 159     wd_done = set()
 160
 161     with connect(dsn) as conn:
 162         with conn.cursor() as cur:
 163             cur.drop_table('wikipedia_article')
 164             cur.drop_table('wikipedia_redirect')
 165             cur.drop_table('wikimedia_importance')
 166             cur.execute("""CREATE TABLE wikimedia_importance (
 167                              language TEXT NOT NULL,
 168                              title TEXT NOT NULL,
 169                              importance double precision NOT NULL,
 170                              wikidata TEXT
 171                            ) """)
 172
 173         with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
 174             for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
 175                 wd_id = int(row['wikidata_id'][1:])
 176                 buf.add(row['language'], row['title'], row['importance'],
 177                         None if wd_id in wd_done else row['wikidata_id'])
 178                 wd_done.add(wd_id)
 179
 180                 if buf.size() > 10000000:
 181                     with conn.cursor() as cur:
 182                         buf.copy_out(cur, 'wikimedia_importance',
 183                                      columns=['language', 'title', 'importance',
 184                                               'wikidata'])
 185
 186             with conn.cursor() as cur:
 187                 buf.copy_out(cur, 'wikimedia_importance',
 188                              columns=['language', 'title', 'importance', 'wikidata'])
 189
 190         with conn.cursor() as cur:
 191             cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
 192                            ON wikimedia_importance (title)""")
 193             cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
 194                            ON wikimedia_importance (wikidata)
 195                            WHERE wikidata is not null""")
 196
 197         conn.commit()
 198
 199     return 0
 200
 201
 202 def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
 203     """ Replace wikipedia importance table with data from an SQL file.
 204     """
 205     if not data_file.exists():
 206         return 1
 207
 208     pre_code = """BEGIN;
 209                   DROP TABLE IF EXISTS "wikipedia_article";
 210                   DROP TABLE IF EXISTS "wikipedia_redirect";
 211                   DROP TABLE IF EXISTS "wikipedia_importance";
 212                """
 213     post_code = "COMMIT"
 214     execute_file(dsn, data_file, ignore_errors=ignore_errors,
 215                  pre_code=pre_code, post_code=post_code)
 216
 217     return 0
 218
 219
 220 def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
 221     """ Replaces the secondary importance raster data table with new data.
 222
 223         Returns 0 if all was well and 1 if the raster SQL file could not
 224         be found. Throws an exception if there was an error reading the file.
 225     """
 226     datafile = data_path / 'secondary_importance.sql.gz'
 227     if not datafile.exists():
 228         return 1
 229
 230     with connect(dsn) as conn:
 231         postgis_version = conn.postgis_version_tuple()
 232         if postgis_version[0] < 3:
 233             LOG.error('PostGIS version is too old for using OSM raster data.')
 234             return 2
 235
 236     execute_file(dsn, datafile, ignore_errors=ignore_errors)
 237
 238     return 0
 239
 240 def recompute_importance(conn: Connection) -> None:
 241     """ Recompute wikipedia links and importance for all entries in placex.
 242         This is a long-running operations that must not be executed in
 243         parallel with updates.
 244     """
 245     with conn.cursor() as cur:
 246         cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
 247         cur.execute("""
 248             UPDATE placex SET (wikipedia, importance) =
 249                (SELECT wikipedia, importance
 250                 FROM compute_importance(extratags, country_code, rank_search, centroid))
 251             """)
 252         cur.execute("""
 253             UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
 254              FROM placex d
 255              WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
 256                    and (s.wikipedia is null or s.importance < d.importance);
 257             """)
 258
 259         cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
 260     conn.commit()
 261
 262
 263 def _quote_php_variable(var_type: Type[Any], config: Configuration,
 264                         conf_name: str) -> str:
 265     if var_type == bool:
 266         return 'true' if config.get_bool(conf_name) else 'false'
 267
 268     if var_type == int:
 269         return cast(str, getattr(config, conf_name))
 270
 271     if not getattr(config, conf_name):
 272         return 'false'
 273
 274     if var_type == Path:
 275         value = str(config.get_path(conf_name) or '')
 276     else:
 277         value = getattr(config, conf_name)
 278
 279     quoted = value.replace("'", "\\'")
 280     return f"'{quoted}'"
 281
 282
 283 def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
 284     """ Create the website script stubs.
 285     """
 286     if config.lib_dir.php is None:
 287         LOG.info("Python frontend does not require website setup. Skipping.")
 288         return
 289
 290     if not basedir.exists():
 291         LOG.info('Creating website directory.')
 292         basedir.mkdir()
 293
 294     assert config.project_dir is not None
 295     basedata = dedent(f"""\
 296                       <?php
 297
 298                       @define('CONST_Debug', $_GET['debug'] ?? false);
 299                       @define('CONST_LibDir', '{config.lib_dir.php}');
 300                       @define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
 301                       @define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
 302
 303                       """)
 304
 305     for php_name, conf_name, var_type in PHP_CONST_DEFS:
 306         varout = _quote_php_variable(var_type, config, conf_name)
 307
 308         basedata += f"@define('CONST_{php_name}', {varout});\n"
 309
 310     template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
 311
 312     search_name_table_exists = bool(conn and conn.table_exists('search_name'))
 313
 314     for script in WEBSITE_SCRIPTS:
 315         if not search_name_table_exists and script == 'search.php':
 316             out = template.format('reverse-only-search.php')
 317         else:
 318             out = template.format(script)
 319
 320         (basedir / script).write_text(basedata + out, 'utf-8')
 321
 322
 323 def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
 324                           recursive: bool = True) -> None:
 325     """ Mark the given OSM object for reindexing. When 'recursive' is set
 326         to True (the default), then all dependent objects are marked for
 327         reindexing as well.
 328
 329         'osm_type' must be on of 'N' (node), 'W' (way) or 'R' (relation).
 330         If the given object does not exist, then nothing happens.
 331     """
 332     assert osm_type in ('N', 'R', 'W')
 333
 334     LOG.warning("Invalidating OSM %s %s%s.",
 335                 OSM_TYPE[osm_type], osm_id,
 336                 ' and its dependent places' if recursive else '')
 337
 338     with conn.cursor() as cur:
 339         if recursive:
 340             sql = """SELECT place_force_update(place_id)
 341                      FROM placex WHERE osm_type = %s and osm_id = %s"""
 342         else:
 343             sql = """UPDATE placex SET indexed_status = 2
 344                      WHERE osm_type = %s and osm_id = %s"""
 345
 346         cur.execute(sql, (osm_type, osm_id))