Tokenizer implementing normalisation as used before Nominatim 4.
9 from nominatim.db.connection import connect
10 from nominatim.db import properties
11 from nominatim.errors import UsageError
# Database property key under which the normalization rules are persisted
# (written by LegacyTokenizer._save_config, read back in init_from_project).
DBCFG_NORMALIZATION = "tokenizer_normalization"

# Module-level logger (root logger; configuration is left to the application).
LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        dsn: database connection string passed on to the tokenizer.
        data_dir: project directory holding the tokenizer data.
    """
    tokenizer = LegacyTokenizer(dsn, data_dir)
    return tokenizer
def _install_module(config_module_path, src_dir, module_dir):
    """ Copies the PostgreSQL normalisation module into the project
        directory if necessary. For historical reasons the module is
        saved in the '/module' subdirectory and not with the other tokenizer
        data.

        The function detects when the installation is run from the
        build directory. It doesn't touch the module in that case.

        Returns the directory from which PostgreSQL should load the module.
    """
    # Custom module locations are simply used as is.
    if config_module_path:
        LOG.info("Using custom path for database module at '%s'", config_module_path)
        return config_module_path

    # Compatibility mode for builddir installations.
    if module_dir.exists() and src_dir.samefile(module_dir):
        LOG.info('Running from build directory. Leaving database module as is.')
        return module_dir

    # In any other case install the module in the project directory.
    if not module_dir.exists():
        module_dir.mkdir()

    destfile = module_dir / 'nominatim.so'
    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
    # The module must be readable/executable by the PostgreSQL server
    # process, which usually runs under a different user.
    destfile.chmod(0o755)

    LOG.info('Database module installed at %s', str(destfile))

    return module_dir
def _check_module(module_dir, conn):
    """ Check that the normalisation module can be loaded by the
        database server.

        Raises a UsageError when the module is not accessible.
    """
    with conn.cursor() as cur:
        try:
            # Creating (and immediately dropping) a C function backed by the
            # module proves that the server can load the shared object.
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer:
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
    """

    def __init__(self, dsn, data_dir):
        # self.dsn must be kept: init_new_db(), init_from_project() and
        # migrate_database() all open connections via connect(self.dsn).
        self.dsn = dsn
        self.data_dir = data_dir
        # Normalization rules are filled in later, either from the
        # configuration (init_new_db) or from the database (init_from_project).
        self.normalization = None
80 def init_new_db(self, config):
81 """ Set up a new tokenizer for the database.
83 This copies all necessary data in the project directory to make
84 sure the tokenizer remains stable even over updates.
86 module_dir = _install_module(config.DATABASE_MODULE_PATH,
87 config.lib_dir.module,
88 config.project_dir / 'module')
90 self.normalization = config.TERM_NORMALIZATION
92 with connect(self.dsn) as conn:
93 _check_module(module_dir, conn)
94 self._save_config(conn)
97 def init_from_project(self):
98 """ Initialise the tokenizer from the project directory.
100 with connect(self.dsn) as conn:
101 self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
104 def migrate_database(self, config):
105 """ Initialise the project directory of an existing database for
106 use with this tokenizer.
108 This is a special migration function for updating existing databases
109 to new software versions.
111 module_dir = _install_module(config.DATABASE_MODULE_PATH,
112 config.lib_dir.module,
113 config.project_dir / 'module')
115 with connect(self.dsn) as conn:
116 _check_module(module_dir, conn)
117 self._save_config(conn)
120 def _save_config(self, conn):
121 """ Save the configuration that needs to remain stable for the given
122 database as database properties.
124 properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)