]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/clicmd/refresh.py
icu: move token deduplication into TokenInfo
[nominatim.git] / nominatim / clicmd / refresh.py
index 8e69cacaf315fa8ab1c6c6a82bbfe0f2bea75efc..b8a88b6d615b5b5c04445f393a71a81c1b6cc112 100644 (file)
@@ -1,11 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Implementation of 'refresh' subcommand.
 """
 import logging
 from pathlib import Path
 
 """
 Implementation of 'refresh' subcommand.
 """
 import logging
 from pathlib import Path
 
-from ..db.connection import connect
-from ..tools.exec_utils import run_legacy_script
+from nominatim.db.connection import connect
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -18,14 +23,24 @@ class UpdateRefresh:
     """\
     Recompute auxiliary data used by the indexing process.
 
     """\
     Recompute auxiliary data used by the indexing process.
 
-    These functions must not be run in parallel with other update commands.
+    This sub-commands updates various static data and functions in the database.
+    It usually needs to be run after changing various aspects of the
+    configuration. The configuration documentation will mention the exact
+    command to use in such case.
+
+    Warning: the 'update' command must not be run in parallel with other update
+             commands like 'replication' or 'add-data'.
     """
     """
+    def __init__(self):
+        self.tokenizer = None
 
     @staticmethod
     def add_args(parser):
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
 
     @staticmethod
     def add_args(parser):
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
+        group.add_argument('--word-tokens', action='store_true',
+                           help='Clean up search terms')
         group.add_argument('--word-counts', action='store_true',
                            help='Compute frequency of full-word search terms')
         group.add_argument('--address-levels', action='store_true',
         group.add_argument('--word-counts', action='store_true',
                            help='Compute frequency of full-word search terms')
         group.add_argument('--address-levels', action='store_true',
@@ -33,7 +48,7 @@ class UpdateRefresh:
         group.add_argument('--functions', action='store_true',
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
         group.add_argument('--functions', action='store_true',
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
-                           help='Update Wikipedia/data importance numbers.')
+                           help='Update Wikipedia/data importance numbers')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -44,45 +59,74 @@ class UpdateRefresh:
         group.add_argument('--enable-debug-statements', action='store_true',
                            help='Enable debug warning statements in functions')
 
         group.add_argument('--enable-debug-statements', action='store_true',
                            help='Enable debug warning statements in functions')
 
-    @staticmethod
-    def run(args):
-        from ..tools import refresh
+
+    def run(self, args):
+        from ..tools import refresh, postcodes
+        from ..indexer.indexer import Indexer
+
 
         if args.postcodes:
 
         if args.postcodes:
-            LOG.warning("Update postcodes centroid")
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.update_postcodes(conn, args.sqllib_dir)
-            conn.close()
+            if postcodes.can_compute(args.config.get_libpq_dsn()):
+                LOG.warning("Update postcodes centroid")
+                tokenizer = self._get_tokenizer(args.config)
+                postcodes.update_postcodes(args.config.get_libpq_dsn(),
+                                           args.project_dir, tokenizer)
+                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
+                                  args.threads or 1)
+                indexer.index_postcodes()
+            else:
+                LOG.error("The place table doesn't exist. "
+                          "Postcode updates on a frozen database is not possible.")
+
+        if args.word_tokens:
+            LOG.warning('Updating word tokens')
+            tokenizer = self._get_tokenizer(args.config)
+            tokenizer.update_word_tokens()
 
         if args.word_counts:
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.recompute_word_counts(conn, args.sqllib_dir)
-            conn.close()
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
 
         if args.address_levels:
-            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
-            LOG.warning('Updating address levels from %s', cfg)
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.load_address_levels_from_file(conn, cfg)
-            conn.close()
+            LOG.warning('Updating address levels')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.load_address_levels_from_config(conn, args.config)
 
         if args.functions:
             LOG.warning('Create functions')
 
         if args.functions:
             LOG.warning('Create functions')
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.create_functions(conn, args.config, args.sqllib_dir,
-                                     args.diffs, args.enable_debug_statements)
-            conn.close()
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.create_functions(conn, args.config,
+                                         args.diffs, args.enable_debug_statements)
+                self._get_tokenizer(args.config).update_sql_functions(args.config)
 
         if args.wiki_data:
 
         if args.wiki_data:
-            run_legacy_script('setup.php', '--import-wikipedia-articles',
-                              nominatim_env=args, throw_on_fail=True)
+            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
+                             or args.project_dir)
+            LOG.warning('Import wikipdia article importance from %s', data_path)
+            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
+                                                 data_path) > 0:
+                LOG.fatal('FATAL: Wikipedia importance dump file not found')
+                return 1
+
         # Attention: importance MUST come after wiki data import.
         if args.importance:
         # Attention: importance MUST come after wiki data import.
         if args.importance:
-            run_legacy_script('update.php', '--recompute-importance',
-                              nominatim_env=args, throw_on_fail=True)
+            LOG.warning('Update importance values for database')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.recompute_importance(conn)
+
         if args.website:
         if args.website:
-            run_legacy_script('setup.php', '--setup-website',
-                              nominatim_env=args, throw_on_fail=True)
+            webdir = args.project_dir / 'website'
+            LOG.warning('Setting up website directory at %s', webdir)
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.setup_website(webdir, args.config, conn)
 
         return 0
 
         return 0
+
+
+    def _get_tokenizer(self, config):
+        if self.tokenizer is None:
+            from ..tokenizer import factory as tokenizer_factory
+
+            self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+
+        return self.tokenizer