]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/legacy_tokenizer.py
add PHP part for new ICU-base tokenizer
[nominatim.git] / nominatim / tokenizer / legacy_tokenizer.py
index 00b9f50e6fedf6ef8ac3aba68111c9a66114c345..2f060b84aa98761b653f15a41f68566dc31cb2d8 100644 (file)
@@ -5,6 +5,7 @@ from collections import OrderedDict
 import logging
 import re
 import shutil
 import logging
 import re
 import shutil
+from textwrap import dedent
 
 from icu import Transliterator
 import psycopg2
 
 from icu import Transliterator
 import psycopg2
@@ -87,7 +88,7 @@ class LegacyTokenizer:
         self.normalization = None
 
 
         self.normalization = None
 
 
-    def init_new_db(self, config):
+    def init_new_db(self, config, init_db=True):
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
         """ Set up a new tokenizer for the database.
 
             This copies all necessary data in the project directory to make
@@ -99,13 +100,16 @@ class LegacyTokenizer:
 
         self.normalization = config.TERM_NORMALIZATION
 
 
         self.normalization = config.TERM_NORMALIZATION
 
+        self._install_php(config)
+
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
             self._save_config(conn, config)
             conn.commit()
 
-        self.update_sql_functions(config)
-        self._init_db_tables(config)
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
 
 
     def init_from_project(self):
 
 
     def init_from_project(self):
@@ -115,6 +119,15 @@ class LegacyTokenizer:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
 
+    def finalize_import(self, config):
+        """ Do any required postprocessing to make the tokenizer data ready
+            for use.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
@@ -128,6 +141,33 @@ class LegacyTokenizer:
                               modulepath=modulepath)
 
 
                               modulepath=modulepath)
 
 
+    def check_database(self):
+        """ Check that the tokenizer is set up correctly.
+        """
+        hint = """\
+             The Postgresql extension nominatim.so was not correctly loaded.
+
+             Error: {error}
+
+             Hints:
+             * Check the output of the CMmake/make installation step
+             * Does nominatim.so exist?
+             * Does nominatim.so exist on the database server?
+             * Can nominatim.so be accessed by the database user?
+             """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                try:
+                    out = cur.scalar("SELECT make_standard_name('a')")
+                except psycopg2.Error as err:
+                    return hint.format(error=str(err))
+
+        if out != 'a':
+            return hint.format(error='Unexpected result for make_standard_name()')
+
+        return None
+
+
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
     def migrate_database(self, config):
         """ Initialise the project directory of an existing database for
             use with this tokenizer.
@@ -135,6 +175,7 @@ class LegacyTokenizer:
             This is a special migration function for updating existing databases
             to new software versions.
         """
             This is a special migration function for updating existing databases
             to new software versions.
         """
+        self.normalization = config.TERM_NORMALIZATION
         module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                      config.lib_dir.module,
                                      config.project_dir / 'module')
         module_dir = _install_module(config.DATABASE_MODULE_PATH,
                                      config.lib_dir.module,
                                      config.project_dir / 'module')
@@ -164,6 +205,18 @@ class LegacyTokenizer:
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
+
+
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.
     def _init_db_tables(self, config):
         """ Set up the word table and fill it with pre-computed word
             frequencies.