Tokenizer implementing normalisation as used before Nominatim 4.
9 from nominatim.db.connection import connect
10 from nominatim.db import properties
11 from nominatim.errors import UsageError
# Database property key under which the normalization rules are persisted
# (written by LegacyTokenizer._save_config, read back in init_from_project).
DBCFG_NORMALIZATION = "tokenizer_normalization"

# Module-level logger (root logger; configuration is left to the application).
LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        dsn: database connection string passed on to the tokenizer.
        data_dir: project directory holding the tokenizer data.
    """
    tokenizer = LegacyTokenizer(dsn, data_dir)
    return tokenizer
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        Returns the directory from which PostgreSQL should load the module.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # The module must be readable/executable by the PostgreSQL server
    # process, which usually runs under a different user.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir
def _check_module(module_dir, conn):
    """ Check that the normalisation module can be loaded by the
        database server.

        Raises a UsageError when the module is not accessible.
    """
    with conn.cursor() as cur:
        try:
            # Creating (and immediately dropping) a C function backed by the
            # module proves that the server can load the shared object.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        # self.dsn must be kept: init_new_db(), init_from_project() and
        # migrate_database() all open connections via connect(self.dsn).
        self.dsn = dsn
        self.data_dir = data_dir
        # Normalization rules are filled in later, either from the
        # configuration (init_new_db) or from the database (init_from_project).
        self.normalization = None
80 def init_new_db(self, config):
81 """ Set up a new tokenizer for the database.
83 This copies all necessary data in the project directory to make
84 sure the tokenizer remains stable even over updates.
86 module_dir = _install_module(config.DATABASE_MODULE_PATH,
87 config.lib_dir.module,
88 config.project_dir / 'module')
90 self.normalization = config.TERM_NORMALIZATION
92 with connect(self.dsn) as conn:
93 _check_module(module_dir, conn)
94 self._save_config(conn)
97 def init_from_project(self):
98 """ Initialise the tokenizer from the project directory.
100 with connect(self.dsn) as conn:
101 self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
104 def migrate_database(self, config):
105 """ Initialise the project directory of an existing database for
106 use with this tokenizer.
108 This is a special migration function for updating existing databases
109 to new software versions.
111 module_dir = _install_module(config.DATABASE_MODULE_PATH,
112 config.lib_dir.module,
113 config.project_dir / 'module')
115 with connect(self.dsn) as conn:
116 _check_module(module_dir, conn)
117 self._save_config(conn)
120 def _save_config(self, conn):
121 """ Save the configuration that needs to remain stable for the given
122 database as database properties.
124 properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)