Merge pull request #2401 from lonvia/port-add-data-to-python

[nominatim.git] / nominatim / tools / database_import.py
diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py

index a6df275517a4134af973fc10943f83bfbd8404c6..a4d7220fb8c73bcf7f38f1370c028122fb020b39 100644 (file)
--- a/nominatim/tools/database_import.py
+++ b/nominatim/tools/database_import.py
@@ -5,20 +5,37 @@ import logging
  import os
  import selectors
  import subprocess
  import os
  import selectors
  import subprocess
-import shutil
  from pathlib import Path
  
  import psutil
  from pathlib import Path
  
  import psutil
+import psycopg2.extras
+from psycopg2 import sql as pysql
  
  
-from ..db.connection import connect, get_pg_env
-from ..db import utils as db_utils
-from ..db.async_connection import DBConnection
-from .exec_utils import run_osm2pgsql
-from ..errors import UsageError
-from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
+from nominatim.db.connection import connect, get_pg_env
+from nominatim.db import utils as db_utils
+from nominatim.db.async_connection import DBConnection
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tools.exec_utils import run_osm2pgsql
+from nominatim.errors import UsageError
+from nominatim.version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
  
  LOG = logging.getLogger()
  
  
  LOG = logging.getLogger()
  
+def setup_database_skeleton(dsn, data_dir, no_partitions, rouser=None):
+    """ Set up a fresh Nominatim database in three steps: create the
+        database, install the extensions and import the base data.
+
+        `rouser` is handed on to create_db(). `data_dir` and
+        `no_partitions` are handed on to import_base_data() as its
+        `sql_dir` and `ignore_partitions` arguments.
+    """
+    LOG.warning('Creating database')
+    create_db(dsn, rouser=rouser)
+
+    LOG.warning('Setting up database')
+    with connect(dsn) as db_conn:
+        setup_extensions(db_conn)
+
+    LOG.warning('Loading basic data')
+    import_base_data(dsn, data_dir, ignore_partitions=no_partitions)
+
+
  def create_db(dsn, rouser=None):
      """ Create a new database for the given DSN. Fails when the database
          already exists or the PostgreSQL version is too old.
  def create_db(dsn, rouser=None):
      """ Create a new database for the given DSN. Fails when the database
          already exists or the PostgreSQL version is too old.
@@ -72,48 +89,6 @@ def setup_extensions(conn):
          raise UsageError('PostGIS version is too old.')
  
  
          raise UsageError('PostGIS version is too old.')
  
  
-def install_module(src_dir, project_dir, module_dir):
-    """ Copy the normalization module from src_dir into the project
-        directory under the '/module' directory. If 'module_dir' is set, then
-        use the module from there instead and check that it is accessible
-        for Postgresql.
-
-        The function detects when the installation is run from the
-        build directory. It doesn't touch the module in that case.
-    """
-    if not module_dir:
-        module_dir = project_dir / 'module'
-
-        if not module_dir.exists() or not src_dir.samefile(module_dir):
-
-            if not module_dir.exists():
-                module_dir.mkdir()
-
-            destfile = module_dir / 'nominatim.so'
-            shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
-            destfile.chmod(0o755)
-
-            LOG.info('Database module installed at %s', str(destfile))
-        else:
-            LOG.info('Running from build directory. Leaving database module as is.')
-    else:
-        LOG.info("Using custom path for database module at '%s'", module_dir)
-
-    return module_dir
-
-
-def check_module_dir_path(conn, path):
-    """ Check that the normalisation module can be found and executed
-        from the given path.
-    """
-    with conn.cursor() as cur:
-        cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                       RETURNS text AS '{}/nominatim.so', 'transliteration'
-                       LANGUAGE c IMMUTABLE STRICT;
-                       DROP FUNCTION nominatim_test_import_func(text)
-                    """.format(path))
-
-
  def import_base_data(dsn, sql_dir, ignore_partitions=False):
      """ Create and populate the tables with basic static data that provides
          the background for geocoding. Data is assumed to not yet exist.
  def import_base_data(dsn, sql_dir, ignore_partitions=False):
      """ Create and populate the tables with basic static data that provides
          the background for geocoding. Data is assumed to not yet exist.
@@ -128,7 +103,7 @@ def import_base_data(dsn, sql_dir, ignore_partitions=False):
              conn.commit()
  
  
              conn.commit()
  
  
-def import_osm_data(osm_file, options, drop=False):
+def import_osm_data(osm_file, options, drop=False, ignore_errors=False):
      """ Import the given OSM file. 'options' contains the list of
          default settings for osm2pgsql.
      """
      """ Import the given OSM file. 'options' contains the list of
          default settings for osm2pgsql.
      """
@@ -147,34 +122,59 @@ def import_osm_data(osm_file, options, drop=False):
      run_osm2pgsql(options)
  
      with connect(options['dsn']) as conn:
      run_osm2pgsql(options)
  
      with connect(options['dsn']) as conn:
-        with conn.cursor() as cur:
-            cur.execute('SELECT * FROM place LIMIT 1')
-            if cur.rowcount == 0:
-                raise UsageError('No data imported by osm2pgsql.')
+        if not ignore_errors:
+            with conn.cursor() as cur:
+                cur.execute('SELECT * FROM place LIMIT 1')
+                if cur.rowcount == 0:
+                    raise UsageError('No data imported by osm2pgsql.')
  
          if drop:
              conn.drop_table('planet_osm_nodes')
  
  
          if drop:
              conn.drop_table('planet_osm_nodes')
  
-    if drop:
-        if options['flatnode_file']:
-            Path(options['flatnode_file']).unlink()
+    if drop and options['flatnode_file']:
+        Path(options['flatnode_file']).unlink()
+
+
+def create_tables(conn, config, reverse_only=False):
+    """ Set up the basic tables by running 'tables.sql' through the
+        SQL preprocessor.
+        `reverse_only` is stored in the preprocessor's `db` globals. When
+        it is True, the main table for searching is skipped and only
+        reverse search is possible.
+    """
+    preprocessor = SQLPreprocessor(conn, config)
+    # Set the flag before the SQL file is run.
+    preprocessor.env.globals['db']['reverse_only'] = reverse_only
+    preprocessor.run_sql_file(conn, 'tables.sql')
+
+
+def create_table_triggers(conn, config):
+    """ Install the table triggers by running 'table-triggers.sql'.
+        The trigger functions need to be in the database already, see
+        refresh.create_functions().
+    """
+    SQLPreprocessor(conn, config).run_sql_file(conn, 'table-triggers.sql')
+
+
+def create_partition_tables(conn, config):
+    """ Set up the explicitly partitioned tables by running
+        'partition-tables.src.sql' through the SQL preprocessor.
+    """
+    SQLPreprocessor(conn, config).run_sql_file(conn, 'partition-tables.src.sql')
  
  
  
  
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
      """ Truncate all data tables to prepare for a fresh load.
      """
      with conn.cursor() as cur:
      """ Truncate all data tables to prepare for a fresh load.
      """
      with conn.cursor() as cur:
-        cur.execute('TRUNCATE word')
          cur.execute('TRUNCATE placex')
          cur.execute('TRUNCATE place_addressline')
          cur.execute('TRUNCATE location_area')
          cur.execute('TRUNCATE location_area_country')
          cur.execute('TRUNCATE placex')
          cur.execute('TRUNCATE place_addressline')
          cur.execute('TRUNCATE location_area')
          cur.execute('TRUNCATE location_area_country')
-        cur.execute('TRUNCATE location_property')
          cur.execute('TRUNCATE location_property_tiger')
          cur.execute('TRUNCATE location_property_osmline')
          cur.execute('TRUNCATE location_postcode')
          cur.execute('TRUNCATE location_property_tiger')
          cur.execute('TRUNCATE location_property_osmline')
          cur.execute('TRUNCATE location_postcode')
-        cur.execute('TRUNCATE search_name')
-        cur.execute('DROP SEQUENCE seq_place')
+        if conn.table_exists('search_name'):
+            cur.execute('TRUNCATE search_name')
+        cur.execute('DROP SEQUENCE IF EXISTS seq_place')
          cur.execute('CREATE SEQUENCE seq_place start 100000')
  
          cur.execute("""SELECT tablename FROM pg_tables
          cur.execute('CREATE SEQUENCE seq_place start 100000')
  
          cur.execute("""SELECT tablename FROM pg_tables
@@ -183,35 +183,33 @@ def truncate_data_tables(conn, max_word_frequency=None):
          for table in [r[0] for r in list(cur)]:
              cur.execute('TRUNCATE ' + table)
  
          for table in [r[0] for r in list(cur)]:
              cur.execute('TRUNCATE ' + table)
  
-        if max_word_frequency is not None:
-            # Used by getorcreate_word_id to ignore frequent partial words.
-            cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
-                           RETURNS integer AS $$
-                             SELECT {} as maxwordfreq;
-                           $$ LANGUAGE SQL IMMUTABLE
-                        """.format(max_word_frequency))
-        conn.commit()
+    conn.commit()
+
+
+# Columns that load_data() copies from the place table into placex,
+# composed as a comma-separated list of SQL identifiers.
+_COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
+                                        ('osm_type', 'osm_id', 'class', 'type',
+                                         'name', 'admin_level', 'address',
+                                         'extratags', 'geometry')))
  
  
-_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
  
  
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
      """ Copy data into the word and placex table.
      """
      """ Copy data into the word and placex table.
      """
-    # Pre-calculate the most important terms in the word list.
-    db_utils.execute_file(dsn, data_dir / 'words.sql')
-
      sel = selectors.DefaultSelector()
      # Then copy data from place to placex in <threads - 1> chunks.
      place_threads = max(1, threads - 1)
      for imod in range(place_threads):
          conn = DBConnection(dsn)
          conn.connect()
      sel = selectors.DefaultSelector()
      # Then copy data from place to placex in <threads - 1> chunks.
      place_threads = max(1, threads - 1)
      for imod in range(place_threads):
          conn = DBConnection(dsn)
          conn.connect()
-        conn.perform("""INSERT INTO placex ({0})
-                         SELECT {0} FROM place
-                         WHERE osm_id % {1} = {2}
-                           AND NOT (class='place' and type='houses')
-                           AND ST_IsValid(geometry)
-                     """.format(_COPY_COLUMNS, place_threads, imod))
+        conn.perform(
+            pysql.SQL("""INSERT INTO placex ({columns})
+                           SELECT {columns} FROM place
+                           WHERE osm_id % {total} = {mod}
+                             AND NOT (class='place' and (type='houses' or type='postcode'))
+                             AND ST_IsValid(geometry)
+                      """).format(columns=_COPY_COLUMNS,
+                                  total=pysql.Literal(place_threads),
+                                  mod=pysql.Literal(imod)))
          sel.register(conn, selectors.EVENT_READ, conn)
  
      # Address interpolations go into another table.
          sel.register(conn, selectors.EVENT_READ, conn)
  
      # Address interpolations go into another table.
@@ -239,3 +237,59 @@ def load_data(dsn, data_dir, threads):
      with connect(dsn) as conn:
          with conn.cursor() as cur:
              cur.execute('ANALYSE')
      with connect(dsn) as conn:
          with conn.cursor() as cur:
              cur.execute('ANALYSE')
+
+
+def create_search_indices(conn, config, drop=False):
+    """ Create the search indices by running 'indices.sql' through the
+        SQL preprocessor.
+
+        Indexes that are marked invalid in pg_index are dropped first so
+        that the script recreates them. `drop` is passed on to the SQL
+        preprocessor.
+    """
+
+    # If index creation failed and left an index invalid, they need to be
+    # cleaned out first, so that the script recreates them.
+    with conn.cursor() as cur:
+        cur.execute("""SELECT relname FROM pg_class, pg_index
+                       WHERE pg_index.indisvalid = false
+                             AND pg_index.indexrelid = pg_class.oid""")
+        # Fetch all names before the cursor is reused for the DROP statements.
+        bad_indices = [row[0] for row in cur]
+        for idx in bad_indices:
+            LOG.info("Drop invalid index %s.", idx)
+            # Let psycopg2 quote the name instead of adding quotes by hand,
+            # so that names containing a double quote are escaped properly.
+            cur.execute(pysql.SQL('DROP INDEX {}')
+                        .format(pysql.Identifier(idx)))
+    conn.commit()
+
+    sql = SQLPreprocessor(conn, config)
+
+    sql.run_sql_file(conn, 'indices.sql', drop=drop)
+
+
+def create_country_names(conn, tokenizer, languages=None):
+    """ Hand the default country names to the tokenizer's name analyzer.
+        `languages` is a comma-separated list of language codes as used
+        in OSM. When it is not empty, only the plain `name` and the
+        `name:<lang>` translations for the listed languages are added.
+    """
+    wanted = languages.split(',') if languages else None
+
+    # Fixed short names that are added for selected countries.
+    short_names = {'gb': 'UK', 'us': 'United States'}
+
+    def _is_wanted(key):
+        if key == 'name':
+            return True
+        if not key.startswith('name:'):
+            return False
+        # Without a language restriction every translation is accepted.
+        return not wanted or key[5:] in wanted
+
+    with conn.cursor() as cur:
+        psycopg2.extras.register_hstore(cur)
+        cur.execute("""SELECT country_code, name FROM country_name
+                       WHERE country_code is not null""")
+
+        with tokenizer.name_analyzer() as analyzer:
+            for code, name in cur:
+                names = {'countrycode': code}
+                if code in short_names:
+                    names['short_name'] = short_names[code]
+
+                # country names (only in languages as provided)
+                if name:
+                    for key, value in name.items():
+                        if _is_wanted(key):
+                            names[key] = value
+
+                analyzer.add_country_names(code, names)
+
+    conn.commit()