Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tools / refresh.py
diff --git a/nominatim/tools/refresh.py b/nominatim/tools/refresh.py

index 531de14db8d4911270cf603a4bc6fc0c27f29337..a200ee1348b9fdc717cd8db39c3e7bffd1438a64 100644 (file)
--- a/nominatim/tools/refresh.py
+++ b/nominatim/tools/refresh.py
@@ -8,8 +8,9 @@
  Functions for bringing auxiliary data in the database up-to-date.
  """
  from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
  Functions for bringing auxiliary data in the database up-to-date.
  """
  from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
+import csv
+import gzip
  import logging
  import logging
-import subprocess
  from textwrap import dedent
  from pathlib import Path
  
  from textwrap import dedent
  from pathlib import Path
  
@@ -17,9 +18,9 @@ from psycopg2 import sql as pysql
  
  from nominatim.config import Configuration
  from nominatim.db.connection import Connection, connect
  
  from nominatim.config import Configuration
  from nominatim.db.connection import Connection, connect
-from nominatim.db.utils import execute_file
+from nominatim.db.utils import execute_file, CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db.sql_preprocessor import SQLPreprocessor
-from nominatim.version import version_str
+from nominatim.version import NOMINATIM_VERSION
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
@@ -121,6 +122,7 @@ PHP_CONST_DEFS = (
      ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
      ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
      ('MapIcon_URL', 'MAPICON_URL', str),
      ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
      ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
      ('MapIcon_URL', 'MAPICON_URL', str),
+    ('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
  )
  
  
  )
  
  
@@ -132,63 +134,106 @@ def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = F
          Returns 0 if all was well and 1 if the importance file could not
          be found. Throws an exception if there was an error reading the file.
      """
          Returns 0 if all was well and 1 if the importance file could not
          be found. Throws an exception if there was an error reading the file.
      """
-    datafile = data_path / 'wikimedia-importance.sql.gz'
+    if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
+       or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
+                                ignore_errors) == 0:
+        return 0
  
  
-    if not datafile.exists():
+    return 1
+
+
+def import_importance_csv(dsn: str, data_file: Path) -> int:
+    """ Replace wikipedia importance table with data from a
+        single CSV file.
+
+        The file must be a gzipped CSV and have the following columns:
+        language, title, importance, wikidata_id
+
+        Other columns may be present but will be ignored.
+    """
+    if not data_file.exists():
+        return 1
+
+    # Only import the first occurrence of a wikidata ID.
+    # This keeps indexes and table small.
+    wd_done = set()
+
+    with connect(dsn) as conn:
+        with conn.cursor() as cur:
+            cur.drop_table('wikipedia_article')
+            cur.drop_table('wikipedia_redirect')
+            cur.drop_table('wikimedia_importance')
+            cur.execute("""CREATE TABLE wikimedia_importance (
+                             language TEXT NOT NULL,
+                             title TEXT NOT NULL,
+                             importance double precision NOT NULL,
+                             wikidata TEXT
+                           ) """)
+
+        with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
+            for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
+                wd_id = int(row['wikidata_id'][1:])
+                buf.add(row['language'], row['title'], row['importance'],
+                        None if wd_id in wd_done else row['wikidata_id'])
+                wd_done.add(wd_id)
+
+                if buf.size() > 10000000:
+                    with conn.cursor() as cur:
+                        buf.copy_out(cur, 'wikimedia_importance',
+                                     columns=['language', 'title', 'importance',
+                                              'wikidata'])
+
+            with conn.cursor() as cur:
+                buf.copy_out(cur, 'wikimedia_importance',
+                             columns=['language', 'title', 'importance', 'wikidata'])
+
+        with conn.cursor() as cur:
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
+                           ON wikimedia_importance (title)""")
+            cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
+                           ON wikimedia_importance (wikidata)
+                           WHERE wikidata is not null""")
+
+        conn.commit()
+
+    return 0
+
+
+def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
+    """ Replace wikipedia importance table with data from an SQL file.
+    """
+    if not data_file.exists():
          return 1
  
      pre_code = """BEGIN;
                    DROP TABLE IF EXISTS "wikipedia_article";
          return 1
  
      pre_code = """BEGIN;
                    DROP TABLE IF EXISTS "wikipedia_article";
-                  DROP TABLE IF EXISTS "wikipedia_redirect"
+                  DROP TABLE IF EXISTS "wikipedia_redirect";
+                  DROP TABLE IF EXISTS "wikipedia_importance";
                 """
      post_code = "COMMIT"
                 """
      post_code = "COMMIT"
-    execute_file(dsn, datafile, ignore_errors=ignore_errors,
+    execute_file(dsn, data_file, ignore_errors=ignore_errors,
                   pre_code=pre_code, post_code=post_code)
  
      return 0
  
                   pre_code=pre_code, post_code=post_code)
  
      return 0
  
-def import_osm_views_geotiff(dsn: str, data_path: Path) -> int:
-    """ Replaces the OSM views table with new data.
  
  
-        Returns 0 if all was well and 1 if the OSM views GeoTIFF file could not
+def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
+    """ Replaces the secondary importance raster data table with new data.
+
+        Returns 0 on success, 2 if PostGIS is older than version 3 and 1 if the raster SQL file could not
          be found. Throws an exception if there was an error reading the file.
      """
          be found. Throws an exception if there was an error reading the file.
      """
-    datafile = data_path / 'osmviews.tiff'
+    datafile = data_path / 'secondary_importance.sql.gz'
      if not datafile.exists():
          return 1
      if not datafile.exists():
          return 1
-    with connect(dsn) as conn:
  
  
+    with connect(dsn) as conn:
          postgis_version = conn.postgis_version_tuple()
          if postgis_version[0] < 3:
          postgis_version = conn.postgis_version_tuple()
          if postgis_version[0] < 3:
+            LOG.error('PostGIS version is too old for using OSM raster data.')
              return 2
  
              return 2
  
-        with conn.cursor() as cur:
-            cur.drop_table("osm_views")
-            cur.drop_table("osm_views_stat")
-
-            # -ovr: 6 -> zoom 12, 5 -> zoom 13, 4 -> zoom 14, 3 -> zoom 15
-            reproject_geotiff = f"gdalwarp -q -multi -ovr 3 -overwrite \
-                -co COMPRESS=LZW -tr 0.01 0.01 -t_srs EPSG:4326 {datafile} raster2import.tiff"
-            subprocess.run(["/bin/bash", "-c" , reproject_geotiff], check=True)
-
-            tile_size = 256
-            import_geotiff = f"raster2pgsql -I -C -Y -t {tile_size}x{tile_size} raster2import.tiff \
-                public.osm_views | psql {dsn} > /dev/null"
-            subprocess.run(["/bin/bash", "-c" , import_geotiff], check=True)
-
-            cleanup = "rm raster2import.tiff"
-            subprocess.run(["/bin/bash", "-c" , cleanup], check=True)
-
-            # To normalize osm views data, the max view value is needed
-            cur.execute(f"""
-            CREATE TABLE osm_views_stat AS (
-                SELECT MAX(ST_Value(osm_views.rast, 1, x, y)) AS max_views_count
-                FROM osm_views CROSS JOIN
-                generate_series(1, {tile_size}) As x
-                CROSS JOIN generate_series(1, {tile_size}) As y
-                WHERE x <= ST_Width(rast) AND y <= ST_Height(rast));
-            """)
-            conn.commit()
+    execute_file(dsn, datafile, ignore_errors=ignore_errors)
  
      return 0
  
  
      return 0
  
@@ -202,7 +247,7 @@ def recompute_importance(conn: Connection) -> None:
          cur.execute("""
              UPDATE placex SET (wikipedia, importance) =
                 (SELECT wikipedia, importance
          cur.execute("""
              UPDATE placex SET (wikipedia, importance) =
                 (SELECT wikipedia, importance
-                FROM compute_importance(extratags, country_code, osm_type, osm_id, centroid))
+                FROM compute_importance(extratags, country_code, rank_search, centroid))
              """)
          cur.execute("""
              UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
              """)
          cur.execute("""
              UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
@@ -238,34 +283,41 @@ def _quote_php_variable(var_type: Type[Any], config: Configuration,
  def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
      """ Create the website script stubs.
      """
  def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
      """ Create the website script stubs.
      """
+    if config.lib_dir.php is None:
+        LOG.info("Python frontend does not require website setup. Skipping.")
+        return
+
      if not basedir.exists():
          LOG.info('Creating website directory.')
          basedir.mkdir()
  
      if not basedir.exists():
          LOG.info('Creating website directory.')
          basedir.mkdir()
  
-    template = dedent(f"""\
+    assert config.project_dir is not None
+    basedata = dedent(f"""\
                        <?php
  
                        @define('CONST_Debug', $_GET['debug'] ?? false);
                        @define('CONST_LibDir', '{config.lib_dir.php}');
                        @define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
                        <?php
  
                        @define('CONST_Debug', $_GET['debug'] ?? false);
                        @define('CONST_LibDir', '{config.lib_dir.php}');
                        @define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
-                      @define('CONST_NominatimVersion', '{version_str()}');
+                      @define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
  
                        """)
  
      for php_name, conf_name, var_type in PHP_CONST_DEFS:
          varout = _quote_php_variable(var_type, config, conf_name)
  
  
                        """)
  
      for php_name, conf_name, var_type in PHP_CONST_DEFS:
          varout = _quote_php_variable(var_type, config, conf_name)
  
-        template += f"@define('CONST_{php_name}', {varout});\n"
+        basedata += f"@define('CONST_{php_name}', {varout});\n"
  
  
-    template += f"\nrequire_once('{config.lib_dir.php}/website/{{}}');\n"
+    template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
  
      search_name_table_exists = bool(conn and conn.table_exists('search_name'))
  
      for script in WEBSITE_SCRIPTS:
          if not search_name_table_exists and script == 'search.php':
  
      search_name_table_exists = bool(conn and conn.table_exists('search_name'))
  
      for script in WEBSITE_SCRIPTS:
          if not search_name_table_exists and script == 'search.php':
-            (basedir / script).write_text(template.format('reverse-only-search.php'), 'utf-8')
+            out = template.format('reverse-only-search.php')
          else:
          else:
-            (basedir / script).write_text(template.format(script), 'utf-8')
+            out = template.format(script)
+
+        (basedir / script).write_text(basedata + out, 'utf-8')
  
  
  def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
  
  
  def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,