]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/clicmd/refresh.py
add new command for cleaning word tokens
[nominatim.git] / nominatim / clicmd / refresh.py
index ffbe628b8ff0ec6cbf16d2c16d871ff14392cb0e..c741dcf63632fc0c01d8592a66f46d3be0c8bdbd 100644 (file)
@@ -1,11 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
 Implementation of 'refresh' subcommand.
 """
 import logging
 from pathlib import Path
 
 """
 Implementation of 'refresh' subcommand.
 """
 import logging
 from pathlib import Path
 
-from ..db.connection import connect
-from ..tools.exec_utils import run_legacy_script
+from nominatim.db.connection import connect
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -18,14 +23,24 @@ class UpdateRefresh:
     """\
     Recompute auxiliary data used by the indexing process.
 
     """\
     Recompute auxiliary data used by the indexing process.
 
-    These functions must not be run in parallel with other update commands.
+    This sub-commands updates various static data and functions in the database.
+    It usually needs to be run after changing various aspects of the
+    configuration. The configuration documentation will mention the exact
+    command to use in such case.
+
+    Warning: the 'update' command must not be run in parallel with other update
+             commands like 'replication' or 'add-data'.
     """
     """
+    def __init__(self):
+        self.tokenizer = None
 
     @staticmethod
     def add_args(parser):
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
 
     @staticmethod
     def add_args(parser):
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
+        group.add_argument('--word-tokens', action='store_true',
+                           help='Clean up search terms')
         group.add_argument('--word-counts', action='store_true',
                            help='Compute frequency of full-word search terms')
         group.add_argument('--address-levels', action='store_true',
         group.add_argument('--word-counts', action='store_true',
                            help='Compute frequency of full-word search terms')
         group.add_argument('--address-levels', action='store_true',
@@ -33,7 +48,7 @@ class UpdateRefresh:
         group.add_argument('--functions', action='store_true',
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
         group.add_argument('--functions', action='store_true',
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
-                           help='Update Wikipedia/data importance numbers.')
+                           help='Update Wikipedia/data importance numbers')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -44,46 +59,73 @@ class UpdateRefresh:
         group.add_argument('--enable-debug-statements', action='store_true',
                            help='Enable debug warning statements in functions')
 
         group.add_argument('--enable-debug-statements', action='store_true',
                            help='Enable debug warning statements in functions')
 
-    @staticmethod
-    def run(args):
-        from ..tools import refresh
+
+    def run(self, args):
+        from ..tools import refresh, postcodes
+        from ..indexer.indexer import Indexer
+
 
         if args.postcodes:
 
         if args.postcodes:
-            LOG.warning("Update postcodes centroid")
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.update_postcodes(conn, args.sqllib_dir)
-            conn.close()
+            if postcodes.can_compute(args.config.get_libpq_dsn()):
+                LOG.warning("Update postcodes centroid")
+                tokenizer = self._get_tokenizer(args.config)
+                postcodes.update_postcodes(args.config.get_libpq_dsn(),
+                                           args.project_dir, tokenizer)
+                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
+                                  args.threads or 1)
+                indexer.index_postcodes()
+            else:
+                LOG.error("The place table doesn't exist. "
+                          "Postcode updates on a frozen database is not possible.")
+
+        if args.word_tokens:
+            tokenizer = self._get_tokenizer(args.config)
+            tokenizer.update_word_tokens()
 
         if args.word_counts:
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.recompute_word_counts(conn, args.sqllib_dir)
-            conn.close()
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
 
         if args.address_levels:
-            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
-            LOG.warning('Updating address levels from %s', cfg)
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.load_address_levels_from_file(conn, cfg)
-            conn.close()
+            LOG.warning('Updating address levels')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.load_address_levels_from_config(conn, args.config)
 
         if args.functions:
             LOG.warning('Create functions')
 
         if args.functions:
             LOG.warning('Create functions')
-            conn = connect(args.config.get_libpq_dsn())
-            refresh.create_functions(conn, args.config, args.sqllib_dir,
-                                     args.diffs, args.enable_debug_statements)
-            conn.close()
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.create_functions(conn, args.config,
+                                         args.diffs, args.enable_debug_statements)
+                self._get_tokenizer(args.config).update_sql_functions(args.config)
 
         if args.wiki_data:
 
         if args.wiki_data:
-            run_legacy_script('setup.php', '--import-wikipedia-articles',
-                              nominatim_env=args, throw_on_fail=True)
+            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
+                             or args.project_dir)
+            LOG.warning('Import wikipdia article importance from %s', data_path)
+            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
+                                                 data_path) > 0:
+                LOG.fatal('FATAL: Wikipedia importance dump file not found')
+                return 1
+
         # Attention: importance MUST come after wiki data import.
         if args.importance:
         # Attention: importance MUST come after wiki data import.
         if args.importance:
-            run_legacy_script('update.php', '--recompute-importance',
-                              nominatim_env=args, throw_on_fail=True)
+            LOG.warning('Update importance values for database')
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.recompute_importance(conn)
+
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
         if args.website:
             webdir = args.project_dir / 'website'
             LOG.warning('Setting up website directory at %s', webdir)
-            refresh.setup_website(webdir, args.phplib_dir, args.config)
+            with connect(args.config.get_libpq_dsn()) as conn:
+                refresh.setup_website(webdir, args.config, conn)
 
         return 0
 
         return 0
+
+
+    def _get_tokenizer(self, config):
+        if self.tokenizer is None:
+            from ..tokenizer import factory as tokenizer_factory
+
+            self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+
+        return self.tokenizer