git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2641 from lonvia/reinit-tokenizer-dir
author    Sarah Hoffmann <lonvia@denofr.de>
          Sun, 20 Mar 2022 20:46:07 +0000 (21:46 +0100)
committer GitHub <noreply@github.com>
          Sun, 20 Mar 2022 20:46:07 +0000 (21:46 +0100)
Transparently reinitialize tokenizer directory when necessary

docs/admin/Advanced-Installations.md
nominatim/clicmd/refresh.py
nominatim/config.py
nominatim/db/properties.py
nominatim/tokenizer/factory.py
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/steps/nominatim_environment.py
test/python/tokenizer/test_factory.py

diff --git a/docs/admin/Advanced-Installations.md b/docs/admin/Advanced-Installations.md
index ff267cee28bd45f9d15fdbef4eb4ddc83a1ddaed..aeb2fa5b0e79ffbf526f709b69bb8d0a4cc2d9ac 100644
--- a/docs/admin/Advanced-Installations.md
+++ b/docs/admin/Advanced-Installations.md
@@ -198,11 +198,10 @@ target machine.
     of a full database.
 
 Next install Nominatim on the target machine by following the standard installation
-instructions. Again make sure to use the same version as the source machine.
+instructions. Again, make sure to use the same version as the source machine.
 
-You can now copy the project directory from the source machine to the new machine.
-If necessary, edit the `.env` file to point it to the restored database.
-Finally run
+Create a project directory on your destination machine and set up the `.env`
+file to match the configuration on the source machine. Finally run
 
     nominatim refresh --website
 
@@ -210,6 +209,8 @@ to make sure that the local installation of Nominatim will be used.
 
 If you are using the legacy tokenizer you might also have to switch to the
 PostgreSQL module that was compiled on your target machine. If you get errors
-that PostgreSQL cannot find or access `nominatim.so` then copy the installed
-version into the `module` directory of your project directory. The installed
-copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.
+that PostgreSQL cannot find or access `nominatim.so`, then rerun
+
+    nominatim refresh --functions
+
+on the target machine to update the location of the module.
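For the `.env` file the revised instructions mention, a minimal version on the target machine might contain nothing more than the database DSN (the value below is a placeholder, not from this commit; match it to the restored database):

    # .env in the project directory (hypothetical values)
    NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim"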
diff --git a/nominatim/clicmd/refresh.py b/nominatim/clicmd/refresh.py
index b8a88b6d615b5b5c04445f393a71a81c1b6cc112..3c245cd46e84b385a39288f537bb50813f8f284f 100644
--- a/nominatim/clicmd/refresh.py
+++ b/nominatim/clicmd/refresh.py
@@ -117,6 +117,10 @@ class UpdateRefresh:
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
+            # This is a little bit hacky: call the tokenizer setup, so that
+            # the tokenizer directory gets repopulated as well, in case it
+            # wasn't there yet.
+            self._get_tokenizer(args.config)
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.setup_website(webdir, args.config, conn)
 
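The comment in the hunk explains the trick: the tokenizer is fetched purely for its side effect. A condensed sketch of the resulting flow, with the command-class plumbing stripped away (`connect`, `refresh.setup_website`, and `get_tokenizer_for_db` are taken from the diff; the rest is assumed):

    from nominatim.db.connection import connect
    from nominatim.tokenizer import factory
    from nominatim.tools import refresh

    def refresh_website(config, project_dir):
        # Side effect: repopulates <project_dir>/tokenizer when it is
        # missing (see the factory.py hunk below).
        factory.get_tokenizer_for_db(config)
        with connect(config.get_libpq_dsn()) as conn:
            refresh.setup_website(project_dir / 'website', config, conn)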
diff --git a/nominatim/config.py b/nominatim/config.py
index 785f4acda14d0f0c7f344f79718a8d150bc56001..13d9cd8a0d502e4b1f1b5ab58ca7e0761772cfaf 100644
--- a/nominatim/config.py
+++ b/nominatim/config.py
@@ -18,7 +18,7 @@ from dotenv import dotenv_values
 from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
-
+CONFIG_CACHE = {}
 
 def flatten_config_list(content, section=''):
     """ Flatten YAML configuration lists that contain include sections
@@ -181,14 +181,19 @@ class Configuration:
         """
         configfile = self.find_config_file(filename, config)
 
-        if configfile.suffix in ('.yaml', '.yml'):
-            return self._load_from_yaml(configfile)
+        if str(configfile) in CONFIG_CACHE:
+            return CONFIG_CACHE[str(configfile)]
 
-        if configfile.suffix == '.json':
+        if configfile.suffix in ('.yaml', '.yml'):
+            result = self._load_from_yaml(configfile)
+        elif configfile.suffix == '.json':
             with configfile.open('r') as cfg:
-                return json.load(cfg)
+                result = json.load(cfg)
+        else:
+            raise UsageError(f"Config file '{configfile}' has unknown format.")
 
-        raise UsageError(f"Config file '{configfile}' has unknown format.")
+        CONFIG_CACHE[str(configfile)] = result
+        return result
 
 
     def find_config_file(self, filename, config=None):
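`CONFIG_CACHE` is a module-level memo keyed by the stringified path: each distinct file is parsed once per process, and every caller shares the same object, so the returned dictionary must be treated as read-only. The same behaviour can be had from the standard library (a sketch with a hypothetical parser helper, not the code in this commit):

    from functools import lru_cache
    from pathlib import Path

    @lru_cache(maxsize=None)
    def _load_cached(path_str: str):
        # One parse per distinct path; later calls return the cached
        # object, so callers must not mutate it.
        return _parse_config_file(Path(path_str))  # hypothetical helper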
diff --git a/nominatim/db/properties.py b/nominatim/db/properties.py
index 19c090069ac9ab9ccaf33bd20caff1f53088fcd7..270204872dd56459691c4105156fe8073d6aa815 100644
--- a/nominatim/db/properties.py
+++ b/nominatim/db/properties.py
@@ -27,6 +27,9 @@ def get_property(conn, name):
     """ Return the current value of the given propery or None if the property
         is not set.
     """
+    if not conn.table_exists('nominatim_properties'):
+        return None
+
     with conn.cursor() as cur:
         cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                     (name, ))
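With the table-existence guard, probing an empty or half-imported database yields `None` rather than a database error. A hedged usage sketch (the DSN and the error handling are illustrative, not from this commit):

    from nominatim.db.connection import connect
    from nominatim.db import properties
    from nominatim.errors import UsageError

    dsn = 'dbname=nominatim'  # hypothetical connection string
    with connect(dsn) as conn:
        name = properties.get_property(conn, 'tokenizer')
    if name is None:
        # An absent table and an unset property now look the same here.
        raise UsageError('Tokenizer not configured for this database.')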
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py
index fbda246238f16bebb6b75806a735975e564ca815..108c7841e0c7c3e4f8bf6bd25b3aa8d9c35bba42 100644
--- a/nominatim/tokenizer/factory.py
+++ b/nominatim/tokenizer/factory.py
@@ -78,8 +78,8 @@ def get_tokenizer_for_db(config):
     """
     basedir = config.project_dir / 'tokenizer'
     if not basedir.is_dir():
-        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
-        raise UsageError('Cannot initialize tokenizer.')
+        # Directory will be repopulated by tokenizer below.
+        basedir.mkdir()
 
     with connect(config.get_libpq_dsn()) as conn:
         name = properties.get_property(conn, 'tokenizer')
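One caveat worth knowing: a bare `Path.mkdir()` raises `FileNotFoundError` when the parent (here the project directory) does not exist, so the hunk relies on the project directory already being in place. A more defensive variant, not what the commit does, would be:

    from pathlib import Path

    basedir = Path('/srv/nominatim-project') / 'tokenizer'  # hypothetical path
    # Create missing parents too, and tolerate a pre-existing directory.
    basedir.mkdir(parents=True, exist_ok=True)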
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 1799ae86d0330ee61c2fc5fe05118ff00e0ef162..b553dbc641d708175e8f7281f05cf14cf4673484 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -51,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """
         self.loader = ICURuleLoader(config)
 
-        self._install_php(config.lib_dir.php)
+        self._install_php(config.lib_dir.php, overwrite=True)
         self._save_config()
 
         if init_db:
@@ -67,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
         with connect(self.dsn) as conn:
             self.loader.load_config_from_db(conn)
 
+        self._install_php(config.lib_dir.php, overwrite=False)
+
 
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -174,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                      self.loader.make_token_analysis())
 
 
-    def _install_php(self, phpdir):
+    def _install_php(self, phpdir, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent(f"""\
-            <?php
-            @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
-            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
-            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent(f"""\
+                <?php
+                @define('CONST_Max_Word_Frequency', 10000000);
+                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
     def _save_config(self):
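The ICU tokenizer now distinguishes two call sites: `init_new_db` installs `tokenizer.php` with `overwrite=True` and always regenerates it, while `init_from_project` passes `overwrite=False` so an already-present file in the project directory is left untouched. The idiom, distilled into a standalone helper (a sketch, not code from this commit):

    from pathlib import Path

    def install_file(path: Path, content: str, overwrite: bool = True) -> None:
        # Regenerate on (re)initialisation; when merely reopening an
        # existing project, only restore the file if it has gone missing.
        if not path.exists() or overwrite:
            path.write_text(content)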
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index 28f4b32756c0756ea172ca3aa16a458ac6ce929d..3b8f75692f964e9c2e84dc3ada92b156dd0afb7b 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -107,7 +107,7 @@ class LegacyTokenizer(AbstractTokenizer):
 
         self.normalization = config.TERM_NORMALIZATION
 
-        self._install_php(config)
+        self._install_php(config, overwrite=True)
 
         with connect(self.dsn) as conn:
             _check_module(module_dir, conn)
@@ -119,12 +119,18 @@ class LegacyTokenizer(AbstractTokenizer):
             self._init_db_tables(config)
 
 
-    def init_from_project(self, _):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 
+        if not (config.project_dir / 'module' / 'nominatim.so').exists():
+            _install_module(config.DATABASE_MODULE_PATH,
+                            config.lib_dir.module,
+                            config.project_dir / 'module')
+
+        self._install_php(config, overwrite=False)
 
     def finalize_import(self, config):
         """ Do any required postprocessing to make the tokenizer data ready
@@ -238,16 +244,18 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
-    def _install_php(self, config):
+    def _install_php(self, config, overwrite=True):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
-            <?php
-            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
-            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
-            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
-            """.format(config)))
+
+        if not php_file.exists() or overwrite:
+            php_file.write_text(dedent("""\
+                <?php
+                @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+                @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+                require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+                """.format(config)))
 
 
     def _init_db_tables(self, config):
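For the legacy tokenizer the same pattern extends to the PostgreSQL module: `init_from_project` now copies `nominatim.so` back into the project's `module` directory when it is missing. The net effect, as an illustrative sketch (the function and its arguments are assumptions, not code from this commit):

    from pathlib import Path
    from nominatim.tokenizer import factory

    def check_module_restored(config, project: Path) -> None:
        """Illustrative only: loading the tokenizer restores the module."""
        assert not (project / 'module' / 'nominatim.so').exists()
        factory.get_tokenizer_for_db(config)  # triggers init_from_project()
        assert (project / 'module' / 'nominatim.so').exists()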
diff --git a/test/bdd/steps/nominatim_environment.py b/test/bdd/steps/nominatim_environment.py
index 6f4f14a71b8a0f44fb190ea41d2c7722af70f8a4..7de32e484f50f1f645c79137a45541b2820538ca 100644
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -217,7 +217,7 @@ class NominatimEnvironment:
                     self.db_drop_database(self.api_test_db)
                     raise
 
-        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+        tokenizer_factory.get_tokenizer_for_db(self.get_test_config())
 
 
     def setup_unknown_db(self):
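Switching the BDD test setup from `create_tokenizer()` to `get_tokenizer_for_db()` means the API tests now load the tokenizer through the same repopulation path that a real deployment uses.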
diff --git a/test/python/tokenizer/test_factory.py b/test/python/tokenizer/test_factory.py
index aa763e28fd835f3b0a7119b254b52ac5fd059b4f..166e6ba6388f424dbbd2347751398294eec45a96 100644
--- a/test/python/tokenizer/test_factory.py
+++ b/test/python/tokenizer/test_factory.py
@@ -63,13 +63,13 @@ class TestFactory:
         assert tokenizer.init_state == "loaded"
 
 
-    def test_load_no_tokenizer_dir(self):
+    def test_load_repopulate_tokenizer_dir(self):
         factory.create_tokenizer(self.config)
 
-        self.config.project_dir = self.config.project_dir / 'foo'
+        (self.config.project_dir / 'tokenizer').rmdir()
 
-        with pytest.raises(UsageError):
-            factory.get_tokenizer_for_db(self.config)
+        factory.get_tokenizer_for_db(self.config)
+        assert (self.config.project_dir / 'tokenizer').exists()
 
 
     def test_load_missing_property(self, temp_db_cursor):