import logging
import re
import shutil
+from textwrap import dedent
from icu import Transliterator
import psycopg2
self.normalization = None
- def init_new_db(self, config):
+ def init_new_db(self, config, init_db=True):
""" Set up a new tokenizer for the database.
This copies all necessary data in the project directory to make
self.normalization = config.TERM_NORMALIZATION
+ self._install_php(config)
+
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
- self.update_sql_functions(config)
- self._init_db_tables(config)
+ if init_db:
+ self.update_sql_functions(config)
+ self._init_db_tables(config)
def init_from_project(self):
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
def finalize_import(self, config):
    """ Run the postprocessing that makes the tokenizer data ready for use.

        Executes the SQL file 'tokenizer/legacy_tokenizer_indices.sql'
        through an SQLPreprocessor built from `config`, using a connection
        obtained from `connect(self.dsn)`.
    """
    with connect(self.dsn) as db_conn:
        SQLPreprocessor(db_conn, config).run_sql_file(
            db_conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
def update_sql_functions(self, config):
""" Reimport the SQL functions for this tokenizer.
"""
modulepath=modulepath)
def check_database(self):
    """ Check that the tokenizer is set up correctly.

        Runs `SELECT make_standard_name('a')` on a connection obtained
        from `connect(self.dsn)` and expects the result 'a'.

        Returns None when the check passes. Otherwise returns a help text
        containing the error description and hints on what to verify.
    """
    # Built line by line so that the message text does not depend on the
    # indentation of this source file.
    hint = (
        "The Postgresql extension nominatim.so was not correctly loaded.\n"
        "\n"
        "Error: {error}\n"
        "\n"
        "Hints:\n"
        "* Check the output of the CMake/make installation step\n"
        "* Does nominatim.so exist?\n"
        "* Does nominatim.so exist on the database server?\n"
        "* Can nominatim.so be accessed by the database user?\n"
    )

    with connect(self.dsn) as conn:
        with conn.cursor() as cur:
            try:
                out = cur.scalar("SELECT make_standard_name('a')")
            except psycopg2.Error as err:
                # Any database error while calling the function is
                # reported to the caller instead of being raised.
                return hint.format(error=str(err))

    if out != 'a':
        return hint.format(error='Unexpected result for make_standard_name()')

    return None
+
+
def migrate_database(self, config):
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
+ self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
return LegacyNameAnalyzer(self.dsn, normalizer)
def _install_php(self, config):
    """ Install the php script for the tokenizer.

        Writes 'tokenizer.php' into `self.data_dir`. The script defines
        CONST_Max_Word_Frequency and CONST_Term_Normalization_Rules from
        `config` and includes the legacy tokenizer from `config.lib_dir.php`.
    """
    # The rules end up inside a PHP double-quoted string, where backslash,
    # double quote and dollar sign are special. Escape them (backslash
    # first) so that PHP sees exactly the configured rule text.
    rules = str(config.TERM_NORMALIZATION)
    for special in ('\\', '"', '$'):
        rules = rules.replace(special, '\\' + special)

    # Dedent the template before filling it in, so that multi-line values
    # cannot interfere with the removal of the common indentation.
    template = dedent("""\
        <?php
        @define('CONST_Max_Word_Frequency', {max_word_frequency});
        @define('CONST_Term_Normalization_Rules', "{rules}");
        require_once('{php_dir}/tokenizer/legacy_tokenizer.php');
        """)

    php_file = self.data_dir / "tokenizer.php"
    # Explicit encoding: the rules may contain non-ASCII characters and the
    # locale's default encoding is not guaranteed to be able to store them.
    php_file.write_text(template.format(max_word_frequency=config.MAX_WORD_FREQUENCY,
                                        rules=rules,
                                        php_dir=config.lib_dir.php),
                        encoding='utf-8')
+
+
def _init_db_tables(self, config):
""" Set up the word table and fill it with pre-computed word
frequencies.