]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tools/special_phrases/sp_importer.py
Merge pull request #2761 from lonvia/repair-index-analysis
[nominatim.git] / nominatim / tools / special_phrases / sp_importer.py
index 1b42cb003dd08bea611d34620b85aa04ea85de75..31bbc3551cfed82a086feb25953ea073d0a4c011 100644 (file)
@@ -1,3 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
 """
     Module containing the class handling the import
     of the special phrases.
 """
     Module containing the class handling the import
     of the special phrases.
     valid anymore are removed.
 """
 import logging
     valid anymore are removed.
 """
 import logging
-import os
-from os.path import isfile
-from pathlib import Path
 import re
 import re
-import subprocess
-import json
 
 
-from psycopg2.sql import Identifier, Literal, SQL
-from nominatim.errors import UsageError
+from psycopg2.sql import Identifier, SQL
 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
 
 LOG = logging.getLogger()
 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
 
 LOG = logging.getLogger()
+
+def _classtype_table(phrase_class, phrase_type):
+    """ Return the name of the table for the given class and type.
+
+        The name is the fixed prefix 'place_classtype_' followed by the
+        class and the type, separated by an underscore.
+    """
+    return f'place_classtype_{phrase_class}_{phrase_type}'
+
 class SPImporter():
     # pylint: disable-msg=too-many-instance-attributes
     """
 class SPImporter():
     # pylint: disable-msg=too-many-instance-attributes
     """
-        Class handling the process of special phrases importations into the database.
+        Class handling the process of special phrases importation into the database.
 
 
-        Take a SPLoader which load the phrases from an external source.
+        Take a sp loader which loads the phrases from an external source.
     """
     """
-    def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
+    def __init__(self, config, db_connection, sp_loader) -> None:
         self.config = config
         self.config = config
-        self.phplib_dir = phplib_dir
         self.db_connection = db_connection
         self.sp_loader = sp_loader
         self.statistics_handler = SpecialPhrasesImporterStatistics()
         self.db_connection = db_connection
         self.sp_loader = sp_loader
         self.statistics_handler = SpecialPhrasesImporterStatistics()
@@ -38,33 +43,37 @@ class SPImporter():
         # This set will contain all existing phrases to be added.
         # It contains tuples with the following format: (label, class, type, operator)
         self.word_phrases = set()
         # This set will contain all existing phrases to be added.
         # It contains tuples with the following format: (label, class, type, operator)
         self.word_phrases = set()
-        #This set will contain all existing place_classtype tables which doesn't match any
-        #special phrases class/type on the wiki.
+        # This set will contain all existing place_classtype tables which don't match any
+        # special phrases class/type on the wiki.
         self.table_phrases_to_delete = set()
 
         self.table_phrases_to_delete = set()
 
-    def import_phrases(self, tokenizer):
+    def import_phrases(self, tokenizer, should_replace):
         """
         """
-            Iterate through all specified languages and
-            extract corresponding special phrases from the wiki.
+            Iterate through all SpecialPhrases extracted from the
+            loader and import them into the database.
+
+            If should_replace is set to True only the loaded phrases
+            will be kept in the database. All other phrases already
+            in the database will be removed.
         """
         LOG.warning('Special phrases importation starting')
         self._fetch_existing_place_classtype_tables()
 
         """
         LOG.warning('Special phrases importation starting')
         self._fetch_existing_place_classtype_tables()
 
-        #Store pairs of class/type for further processing
+        # Store pairs of class/type for further processing
         class_type_pairs = set()
 
         class_type_pairs = set()
 
-        for loaded_phrases in self.sp_loader:
-            for phrase in loaded_phrases:
-                result = self._process_phrase(phrase)
-                if result:
-                    class_type_pairs.update(result)
+        for phrase in self.sp_loader.generate_phrases():
+            result = self._process_phrase(phrase)
+            if result:
+                class_type_pairs.add(result)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
-        self._remove_non_existent_tables_from_db()
+        if should_replace:
+            self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
 
         with tokenizer.name_analyzer() as analyzer:
         self.db_connection.commit()
 
         with tokenizer.name_analyzer() as analyzer:
-            analyzer.update_special_phrases(self.word_phrases)
+            analyzer.update_special_phrases(self.word_phrases, should_replace)
 
         LOG.warning('Import done.')
         self.statistics_handler.notify_import_done()
 
         LOG.warning('Import done.')
         self.statistics_handler.notify_import_done()
@@ -90,13 +99,8 @@ class SPImporter():
         """
             Load white and black lists from phrase-settings.json.
         """
         """
             Load white and black lists from phrase-settings.json.
         """
-        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
-
-        if self.config.PHRASE_CONFIG:
-            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
+        settings = self.config.load_sub_configuration('phrase-settings.json')
 
 
-        with settings_path.open("r") as json_settings:
-            settings = json.load(json_settings)
         return settings['blackList'], settings['whiteList']
 
     def _check_sanity(self, phrase):
         return settings['blackList'], settings['whiteList']
 
     def _check_sanity(self, phrase):
@@ -120,19 +124,17 @@ class SPImporter():
             Return the class/type pair corresponding to the phrase.
         """
 
             Return the class/type pair corresponding to the phrase.
         """
 
-        #blacklisting: disallow certain class/type combinations
-        if (
-            phrase.p_class in self.black_list.keys() and
-            phrase.p_type in self.black_list[phrase.p_class]
-        ): return None
+        # blacklisting: disallow certain class/type combinations
+        if phrase.p_class in self.black_list.keys() \
+           and phrase.p_type in self.black_list[phrase.p_class]:
+            return None
 
 
-        #whitelisting: if class is in whitelist, allow only tags in the list
-        if (
-            phrase.p_class in self.white_list.keys() and
-            phrase.p_type not in self.white_list[phrase.p_class]
-        ): return None
+        # whitelisting: if class is in whitelist, allow only tags in the list
+        if phrase.p_class in self.white_list.keys() \
+           and phrase.p_type not in self.white_list[phrase.p_class]:
+            return None
 
 
-        #sanity check, in case somebody added garbage in the wiki
+        # sanity check, in case somebody added garbage in the wiki
         if not self._check_sanity(phrase):
             self.statistics_handler.notify_one_phrase_invalid()
             return None
         if not self._check_sanity(phrase):
             self.statistics_handler.notify_one_phrase_invalid()
             return None
@@ -140,7 +142,7 @@ class SPImporter():
         self.word_phrases.add((phrase.p_label, phrase.p_class,
                                phrase.p_type, phrase.p_operator))
 
         self.word_phrases.add((phrase.p_label, phrase.p_class,
                                phrase.p_type, phrase.p_operator))
 
-        return set({(phrase.p_class, phrase.p_type)})
+        return (phrase.p_class, phrase.p_type)
 
 
     def _create_place_classtype_table_and_indexes(self, class_type_pairs):
 
 
     def _create_place_classtype_table_and_indexes(self, class_type_pairs):
@@ -152,7 +154,7 @@ class SPImporter():
 
         sql_tablespace = self.config.TABLESPACE_AUX_DATA
         if sql_tablespace:
 
         sql_tablespace = self.config.TABLESPACE_AUX_DATA
         if sql_tablespace:
-            sql_tablespace = ' TABLESPACE '+sql_tablespace
+            sql_tablespace = ' TABLESPACE ' + sql_tablespace
 
         with self.db_connection.cursor() as db_cursor:
             db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
 
         with self.db_connection.cursor() as db_cursor:
             db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
@@ -161,23 +163,23 @@ class SPImporter():
             phrase_class = pair[0]
             phrase_type = pair[1]
 
             phrase_class = pair[0]
             phrase_type = pair[1]
 
-            table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+            table_name = _classtype_table(phrase_class, phrase_type)
 
             if table_name in self.table_phrases_to_delete:
                 self.statistics_handler.notify_one_table_ignored()
 
             if table_name in self.table_phrases_to_delete:
                 self.statistics_handler.notify_one_table_ignored()
-                #Remove this table from the ones to delete as it match a class/type
-                #still existing on the special phrases of the wiki.
+                # Remove this table from the ones to delete as it matches a
+                # class/type still existing on the special phrases of the wiki.
                 self.table_phrases_to_delete.remove(table_name)
                 self.table_phrases_to_delete.remove(table_name)
-                #So dont need to create the table and indexes.
+                # So don't need to create the table and indexes.
                 continue
 
                 continue
 
-            #Table creation
+            # Table creation
             self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
 
             self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
 
-            #Indexes creation
+            # Indexes creation
             self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
 
             self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
 
-            #Grant access on read to the web user.
+            # Grant access on read to the web user.
             self._grant_access_to_webuser(phrase_class, phrase_type)
 
             self.statistics_handler.notify_one_table_created()
             self._grant_access_to_webuser(phrase_class, phrase_type)
 
             self.statistics_handler.notify_one_table_created()
@@ -188,46 +190,48 @@ class SPImporter():
 
     def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
         """
 
     def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
         """
-            Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
+            Create table place_classtype of the given phrase_class/phrase_type
+            if it doesn't exist.
         """
         """
-        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
-        with self.db_connection.cursor() as db_cursor:
-            db_cursor.execute(SQL("""
-                    CREATE TABLE IF NOT EXISTS {{}} {} 
-                    AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex 
-                    WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
-                              .format(Identifier(table_name), Literal(phrase_class),
-                                      Literal(phrase_type)))
+        table_name = _classtype_table(phrase_class, phrase_type)
+        with self.db_connection.cursor() as cur:
+            cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
+                                 SELECT place_id AS place_id,
+                                        st_centroid(geometry) AS centroid
+                                 FROM placex
+                                 WHERE class = %s AND type = %s
+                             """).format(Identifier(table_name), SQL(sql_tablespace)),
+                        (phrase_class, phrase_type))
 
 
     def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
         """
             Create indexes on centroid and place_id for the place_classtype table.
         """
 
 
     def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
         """
             Create indexes on centroid and place_id for the place_classtype table.
         """
-        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
-        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
-        #Index on centroid
+        index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
+        base_table = _classtype_table(phrase_class, phrase_type)
+        # Index on centroid
         if not self.db_connection.index_exists(index_prefix + 'centroid'):
             with self.db_connection.cursor() as db_cursor:
         if not self.db_connection.index_exists(index_prefix + 'centroid'):
             with self.db_connection.cursor() as db_cursor:
-                db_cursor.execute(SQL("""
-                    CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
+                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
                                   .format(Identifier(index_prefix + 'centroid'),
                                   .format(Identifier(index_prefix + 'centroid'),
-                                          Identifier(base_table)), sql_tablespace)
+                                          Identifier(base_table),
+                                          SQL(sql_tablespace)))
 
 
-        #Index on place_id
+        # Index on place_id
         if not self.db_connection.index_exists(index_prefix + 'place_id'):
             with self.db_connection.cursor() as db_cursor:
         if not self.db_connection.index_exists(index_prefix + 'place_id'):
             with self.db_connection.cursor() as db_cursor:
-                db_cursor.execute(SQL(
-                    """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
+                db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
                                   .format(Identifier(index_prefix + 'place_id'),
                                   .format(Identifier(index_prefix + 'place_id'),
-                                          Identifier(base_table)))
+                                          Identifier(base_table),
+                                          SQL(sql_tablespace)))
 
 
     def _grant_access_to_webuser(self, phrase_class, phrase_type):
         """
             Grant access on read to the table place_classtype for the webuser.
         """
 
 
     def _grant_access_to_webuser(self, phrase_class, phrase_type):
         """
             Grant access on read to the table place_classtype for the webuser.
         """
-        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+        table_name = _classtype_table(phrase_class, phrase_type)
         with self.db_connection.cursor() as db_cursor:
             db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                               .format(Identifier(table_name),
         with self.db_connection.cursor() as db_cursor:
             db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                               .format(Identifier(table_name),
@@ -239,41 +243,10 @@ class SPImporter():
             Delete the place_classtype tables.
         """
         LOG.warning('Cleaning database...')
             Delete the place_classtype tables.
         """
         LOG.warning('Cleaning database...')
-        #Array containing all queries to execute. Contain tuples of format (query, parameters)
-        queries_parameters = []
-
-        #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
-        for table in self.table_phrases_to_delete:
-            self.statistics_handler.notify_one_table_deleted()
-            query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
-            queries_parameters.append((query, ()))
 
 
+        # Delete place_classtype tables corresponding to class/type which
+        # are not on the wiki anymore.
         with self.db_connection.cursor() as db_cursor:
         with self.db_connection.cursor() as db_cursor:
-            for query, parameters in queries_parameters:
-                db_cursor.execute(query, parameters)
-
-    def _convert_php_settings_if_needed(self, file_path):
-        """
-            Convert php settings file of special phrases to json file if it is still in php format.
-        """
-        if not isfile(file_path):
-            raise UsageError(str(file_path) + ' is not a valid file.')
-
-        file, extension = os.path.splitext(file_path)
-        json_file_path = Path(file + '.json').resolve()
-
-        if extension not in('.php', '.json'):
-            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
-
-        if extension == '.php' and not isfile(json_file_path):
-            try:
-                subprocess.run(['/usr/bin/env', 'php', '-Cq',
-                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
-                                file_path], check=True)
-                LOG.warning('special_phrase configuration file has been converted to json.')
-                return json_file_path
-            except subprocess.CalledProcessError:
-                LOG.error('Error while converting %s to json.', file_path)
-                raise
-        else:
-            return json_file_path
+            for table in self.table_phrases_to_delete:
+                self.statistics_handler.notify_one_table_deleted()
+                db_cursor.drop_table(table)