src/nominatim_db/tools/refresh.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Functions for bringing auxiliary data in the database up-to-date.
   9 """
  10 from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
  11 import csv
  12 import gzip
  13 import logging
  14 from textwrap import dedent
  15 from pathlib import Path
  16
  17 from psycopg2 import sql as pysql
  18
  19 from ..config import Configuration
  20 from ..db.connection import Connection, connect, postgis_version_tuple,\
  21                             drop_tables, table_exists
  22 from ..db.utils import execute_file, CopyBuffer
  23 from ..db.sql_preprocessor import SQLPreprocessor
  24 from ..version import NOMINATIM_VERSION
  25
  26 LOG = logging.getLogger()
  27
  28 OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
  29
  30 def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
  31                                        entry: Mapping[str, Any]) -> None:
  32     """ Converts a single entry from the JSON format for address rank
  33         descriptions into a flat format suitable for inserting into a
  34         PostgreSQL table and adds these lines to `rows`.
  35     """
  36     countries = entry.get('countries') or (None, )
  37     for key, values in entry['tags'].items():
  38         for value, ranks in values.items():
  39             if isinstance(ranks, list):
  40                 rank_search, rank_address = ranks
  41             else:
  42                 rank_search = rank_address = ranks
  43             if not value:
  44                 value = None
  45             for country in countries:
  46                 rows.append((country, key, value, rank_search, rank_address))
  47
  48
  49 def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
  50     """ Replace the `address_levels` table with the contents of `levels'.
  51
  52         A new table is created any previously existing table is dropped.
  53         The table has the following columns:
  54             country, class, type, rank_search, rank_address
  55     """
  56     rows: List[Tuple[Any, ...]]  = []
  57     for entry in levels:
  58         _add_address_level_rows_from_entry(rows, entry)
  59
  60     drop_tables(conn, table)
  61
  62     with conn.cursor() as cur:
  63         cur.execute(pysql.SQL("""CREATE TABLE {} (
  64                                         country_code varchar(2),
  65                                         class TEXT,
  66                                         type TEXT,
  67                                         rank_search SMALLINT,
  68                                         rank_address SMALLINT)
  69                               """).format(pysql.Identifier(table)))
  70
  71         cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
  72                            .format(pysql.Identifier(table)), rows)
  73
  74         cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
  75                     .format(pysql.Identifier(table)))
  76
  77     conn.commit()
  78
  79
  80 def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
  81     """ Replace the `address_levels` table with the content as
  82         defined in the given configuration. Uses the parameter
  83         NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
  84         configuration file.
  85     """
  86     cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
  87     load_address_levels(conn, 'address_levels', cfg)
  88
  89
  90 def create_functions(conn: Connection, config: Configuration,
  91                      enable_diff_updates: bool = True,
  92                      enable_debug: bool = False) -> None:
  93     """ (Re)create the PL/pgSQL functions.
  94     """
  95     sql = SQLPreprocessor(conn, config)
  96
  97     sql.run_sql_file(conn, 'functions.sql',
  98                      disable_diff_updates=not enable_diff_updates,
  99                      debug=enable_debug)
 100
 101
 102
 103 WEBSITE_SCRIPTS = (
 104     'deletable.php',
 105     'details.php',
 106     'lookup.php',
 107     'polygons.php',
 108     'reverse.php',
 109     'search.php',
 110     'status.php'
 111 )
 112
 113 # constants needed by PHP scripts: PHP name, config name, type
 114 PHP_CONST_DEFS = (
 115     ('Database_DSN', 'DATABASE_DSN', str),
 116     ('Default_Language', 'DEFAULT_LANGUAGE', str),
 117     ('Log_DB', 'LOG_DB', bool),
 118     ('Log_File', 'LOG_FILE', Path),
 119     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
 120     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
 121     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
 122     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
 123     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
 124     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
 125     ('MapIcon_URL', 'MAPICON_URL', str),
 126     ('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
 127 )
 128
 129
 130 def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
 131     """ Replaces the wikipedia importance tables with new data.
 132         The import is run in a single transaction so that the new data
 133         is replace seamlessly.
 134
 135         Returns 0 if all was well and 1 if the importance file could not
 136         be found. Throws an exception if there was an error reading the file.
 137     """
 138     if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
 139        or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
 140                                 ignore_errors) == 0:
 141         return 0
 142
 143     return 1
 144
 145
 146 def import_importance_csv(dsn: str, data_file: Path) -> int:
 147     """ Replace wikipedia importance table with data from a
 148         single CSV file.
 149
 150         The file must be a gzipped CSV and have the following columns:
 151         language, title, importance, wikidata_id
 152
 153         Other columns may be present but will be ignored.
 154     """
 155     if not data_file.exists():
 156         return 1
 157
 158     # Only import the first occurence of a wikidata ID.
 159     # This keeps indexes and table small.
 160     wd_done = set()
 161
 162     with connect(dsn) as conn:
 163         drop_tables(conn, 'wikipedia_article', 'wikipedia_redirect', 'wikimedia_importance')
 164         with conn.cursor() as cur:
 165             cur.execute("""CREATE TABLE wikimedia_importance (
 166                              language TEXT NOT NULL,
 167                              title TEXT NOT NULL,
 168                              importance double precision NOT NULL,
 169                              wikidata TEXT
 170                            ) """)
 171
 172         with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
 173             for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
 174                 wd_id = int(row['wikidata_id'][1:])
 175                 buf.add(row['language'], row['title'], row['importance'],
 176                         None if wd_id in wd_done else row['wikidata_id'])
 177                 wd_done.add(wd_id)
 178
 179                 if buf.size() > 10000000:
 180                     with conn.cursor() as cur:
 181                         buf.copy_out(cur, 'wikimedia_importance',
 182                                      columns=['language', 'title', 'importance',
 183                                               'wikidata'])
 184
 185             with conn.cursor() as cur:
 186                 buf.copy_out(cur, 'wikimedia_importance',
 187                              columns=['language', 'title', 'importance', 'wikidata'])
 188
 189         with conn.cursor() as cur:
 190             cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
 191                            ON wikimedia_importance (title)""")
 192             cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
 193                            ON wikimedia_importance (wikidata)
 194                            WHERE wikidata is not null""")
 195
 196         conn.commit()
 197
 198     return 0
 199
 200
 201 def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
 202     """ Replace wikipedia importance table with data from an SQL file.
 203     """
 204     if not data_file.exists():
 205         return 1
 206
 207     pre_code = """BEGIN;
 208                   DROP TABLE IF EXISTS "wikipedia_article";
 209                   DROP TABLE IF EXISTS "wikipedia_redirect";
 210                   DROP TABLE IF EXISTS "wikipedia_importance";
 211                """
 212     post_code = "COMMIT"
 213     execute_file(dsn, data_file, ignore_errors=ignore_errors,
 214                  pre_code=pre_code, post_code=post_code)
 215
 216     return 0
 217
 218
 219 def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
 220     """ Replaces the secondary importance raster data table with new data.
 221
 222         Returns 0 if all was well and 1 if the raster SQL file could not
 223         be found. Throws an exception if there was an error reading the file.
 224     """
 225     datafile = data_path / 'secondary_importance.sql.gz'
 226     if not datafile.exists():
 227         return 1
 228
 229     with connect(dsn) as conn:
 230         postgis_version = postgis_version_tuple(conn)
 231         if postgis_version[0] < 3:
 232             LOG.error('PostGIS version is too old for using OSM raster data.')
 233             return 2
 234
 235     execute_file(dsn, datafile, ignore_errors=ignore_errors)
 236
 237     return 0
 238
 239 def recompute_importance(conn: Connection) -> None:
 240     """ Recompute wikipedia links and importance for all entries in placex.
 241         This is a long-running operations that must not be executed in
 242         parallel with updates.
 243     """
 244     with conn.cursor() as cur:
 245         cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
 246         cur.execute("""
 247             UPDATE placex SET (wikipedia, importance) =
 248                (SELECT wikipedia, importance
 249                 FROM compute_importance(extratags, country_code, rank_search, centroid))
 250             """)
 251         cur.execute("""
 252             UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
 253              FROM placex d
 254              WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
 255                    and (s.wikipedia is null or s.importance < d.importance);
 256             """)
 257
 258         cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
 259     conn.commit()
 260
 261
 262 def _quote_php_variable(var_type: Type[Any], config: Configuration,
 263                         conf_name: str) -> str:
 264     if var_type == bool:
 265         return 'true' if config.get_bool(conf_name) else 'false'
 266
 267     if var_type == int:
 268         return cast(str, getattr(config, conf_name))
 269
 270     if not getattr(config, conf_name):
 271         return 'false'
 272
 273     if var_type == Path:
 274         value = str(config.get_path(conf_name) or '')
 275     else:
 276         value = getattr(config, conf_name)
 277
 278     quoted = value.replace("'", "\\'")
 279     return f"'{quoted}'"
 280
 281
 282 def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
 283     """ Create the website script stubs.
 284     """
 285     if config.lib_dir.php is None:
 286         LOG.info("Python frontend does not require website setup. Skipping.")
 287         return
 288
 289     if not basedir.exists():
 290         LOG.info('Creating website directory.')
 291         basedir.mkdir()
 292
 293     assert config.project_dir is not None
 294     basedata = dedent(f"""\
 295                       <?php
 296
 297                       @define('CONST_Debug', $_GET['debug'] ?? false);
 298                       @define('CONST_LibDir', '{config.lib_dir.php}');
 299                       @define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
 300                       @define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
 301
 302                       """)
 303
 304     for php_name, conf_name, var_type in PHP_CONST_DEFS:
 305         varout = _quote_php_variable(var_type, config, conf_name)
 306
 307         basedata += f"@define('CONST_{php_name}', {varout});\n"
 308
 309     template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
 310
 311     search_name_table_exists = bool(conn and table_exists(conn, 'search_name'))
 312
 313     for script in WEBSITE_SCRIPTS:
 314         if not search_name_table_exists and script == 'search.php':
 315             out = template.format('reverse-only-search.php')
 316         else:
 317             out = template.format(script)
 318
 319         (basedir / script).write_text(basedata + out, 'utf-8')
 320
 321
 322 def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
 323                           recursive: bool = True) -> None:
 324     """ Mark the given OSM object for reindexing. When 'recursive' is set
 325         to True (the default), then all dependent objects are marked for
 326         reindexing as well.
 327
 328         'osm_type' must be on of 'N' (node), 'W' (way) or 'R' (relation).
 329         If the given object does not exist, then nothing happens.
 330     """
 331     assert osm_type in ('N', 'R', 'W')
 332
 333     LOG.warning("Invalidating OSM %s %s%s.",
 334                 OSM_TYPE[osm_type], osm_id,
 335                 ' and its dependent places' if recursive else '')
 336
 337     with conn.cursor() as cur:
 338         if recursive:
 339             sql = """SELECT place_force_update(place_id)
 340                      FROM placex WHERE osm_type = %s and osm_id = %s"""
 341         else:
 342             sql = """UPDATE placex SET indexed_status = 2
 343                      WHERE osm_type = %s and osm_id = %s"""
 344
 345         cur.execute(sql, (osm_type, osm_id))