1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
"""
    Module containing the class handling the import
    of the special phrases.

    Phrases are analyzed and imported into the database.

    The phrases already present in the database which are not
    valid anymore are removed.
"""
import logging
import re

from psycopg2.sql import Identifier, SQL

from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
# Logger shared by the whole module. getLogger() without a name returns
# the root logger.
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
    """ Return the name of the table for the given class and type.

        The name has the form 'place_classtype_<class>_<type>'.
    """
    return f'place_classtype_{phrase_class}_{phrase_type}'
# pylint: disable=too-many-instance-attributes
"""
    Class handling the process of special phrases importation into the database.

    Takes a special phrases (sp) loader which loads the phrases from an external source.
"""
def __init__(self, config, db_connection, sp_loader) -> None:
    """
        Set up the state needed for an import run.

        config: configuration object; it is used to load
                'phrase-settings.json' and to read TABLESPACE_AUX_DATA
                and DATABASE_WEBUSER.
        db_connection: database connection providing cursor(), commit()
                       and index_exists().
        sp_loader: loader object whose generate_phrases() yields the
                   special phrases to import.
    """
    # The configuration must be stored first: loading the black and white
    # lists below reads it through self.config.
    self.config = config
    self.db_connection = db_connection
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()
    self.black_list, self.white_list = self._load_white_and_black_lists()
    self.sanity_check_pattern = re.compile(r'^\w+$')
    # This set will contain all existing phrases to be added.
    # It contains tuples with the following format: (label, class, type, operator)
    self.word_phrases = set()
    # This set will contain all existing place_classtype tables which don't match any
    # special phrases class/type on the wiki.
    self.table_phrases_to_delete = set()
def import_phrases(self, tokenizer, should_replace):
    """
        Iterate through all SpecialPhrases extracted from the
        loader and import them into the database.

        If should_replace is set to True only the loaded phrases
        will be kept in the database. All other phrases already
        in the database will be removed.

        tokenizer: object whose name_analyzer() returns a context manager
                   yielding an analyzer with update_special_phrases().
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    # Store pairs of class/type for further processing
    class_type_pairs = set()

    for phrase in self.sp_loader.generate_phrases():
        result = self._process_phrase(phrase)
        # Only keep phrases for which a class/type pair was returned;
        # anything falsy must not end up in the set of tables to create.
        if result:
            class_type_pairs.add(result)

    self._create_place_classtype_table_and_indexes(class_type_pairs)
    # Tables of phrases that are no longer loaded are only dropped when the
    # caller asked for a replacement of the existing phrases.
    if should_replace:
        self._remove_non_existent_tables_from_db()
    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases, should_replace)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self):
    """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class.
    """
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        # The first column of every row is the table name.
        for row in db_cursor:
            self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self):
    """
        Load white and black lists from phrase-settings.json.

        Return a tuple (black list, white list) taken from the
        'blackList' and 'whiteList' entries of the settings.
    """
    settings = self.config.load_sub_configuration('phrase-settings.json')

    return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase):
    """
        Check sanity of given inputs in case somebody added garbage in the wiki.

        Return True when both the class and the type of the phrase match
        the sanity check pattern. Otherwise log a warning and return False,
        so that the caller can skip the phrase.
    """
    pattern = self.sanity_check_pattern
    # fullmatch() instead of findall(): in a pattern such as ^\w+$ the '$'
    # also matches in front of a trailing newline, which findall() would
    # accept as a valid name.
    if pattern.fullmatch(phrase.p_class) and pattern.fullmatch(phrase.p_type):
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    return False
def _process_phrase(self, phrase):
    """
        Processes the given phrase by checking black and white list
        and sanity.

        Return the class/type pair corresponding to the phrase, or None
        when the phrase is rejected. Accepted phrases are also added to
        the word_phrases set as (label, class, type, operator).
    """
    p_class = phrase.p_class
    p_type = phrase.p_type

    # blacklisting: disallow certain class/type combinations
    if p_class in self.black_list and p_type in self.black_list[p_class]:
        return None

    # whitelisting: if class is in whitelist, allow only tags in the list
    if p_class in self.white_list and p_type not in self.white_list[p_class]:
        return None

    # sanity check, in case somebody added garbage in the wiki
    if not self._check_sanity(phrase):
        self.statistics_handler.notify_one_phrase_invalid()
        return None

    self.word_phrases.add((phrase.p_label, p_class,
                           p_type, phrase.p_operator))

    return (p_class, p_type)
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
    """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.

        class_type_pairs: iterable of (class, type) pairs.
    """
    LOG.warning('Create tables and indexes...')

    # Only build a TABLESPACE clause when a tablespace is configured.
    # An unset value results in an empty clause.
    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    if sql_tablespace:
        sql_tablespace = ' TABLESPACE ' + sql_tablespace
    else:
        sql_tablespace = ''

    # Temporary index on placex, dropped again at the end of this function
    # (presumably to speed up the per class/type selects).
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    for phrase_class, phrase_type in class_type_pairs:
        table_name = _classtype_table(phrase_class, phrase_type)

        if table_name in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_ignored()
            # Remove this table from the ones to delete as it matches a
            # class/type still existing on the special phrases of the wiki.
            self.table_phrases_to_delete.remove(table_name)
            # So don't need to create the table and indexes.
            continue

        # Table creation
        self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

        # Indexes creation
        self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

        # Grant access on read to the web user.
        self._grant_access_to_webuser(phrase_class, phrase_type)

        self.statistics_handler.notify_one_table_created()

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist.

        The table holds place_id and centroid of all placex rows with the
        given class and type. sql_tablespace is inserted verbatim after
        the table name and may be an empty string.
    """
    table_name = _classtype_table(phrase_class, phrase_type)
    with self.db_connection.cursor() as cur:
        # The table name is quoted as an identifier; class and type are
        # passed as query parameters.
        cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
                             SELECT place_id AS place_id,
                                    st_centroid(geometry) AS centroid
                             FROM placex
                             WHERE class = %s AND type = %s
                        """).format(Identifier(table_name), SQL(sql_tablespace)),
                    (phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create indexes on centroid and place_id for the place_classtype table.

        An index is only created when the connection reports that no
        index of that name exists yet. sql_tablespace is appended
        verbatim to each CREATE INDEX statement.
    """
    index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
    base_table = _classtype_table(phrase_class, phrase_type)

    # Index on centroid
    if not self.db_connection.index_exists(index_prefix + 'centroid'):
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
                              .format(Identifier(index_prefix + 'centroid'),
                                      Identifier(base_table),
                                      SQL(sql_tablespace)))

    # Index on place_id
    if not self.db_connection.index_exists(index_prefix + 'place_id'):
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
                              .format(Identifier(index_prefix + 'place_id'),
                                      Identifier(base_table),
                                      SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class, phrase_type):
    """
        Grant access on read to the table place_classtype for the webuser.

        The web user name is taken from the DATABASE_WEBUSER setting of
        the configuration.
    """
    table_name = _classtype_table(phrase_class, phrase_type)
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                          .format(Identifier(table_name),
                                  Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self):
    """
        Remove special phrases which don't exist on the wiki anymore.
        Delete the place_classtype tables.

        Every table named in table_phrases_to_delete is dropped and
        reported to the statistics handler.
    """
    LOG.warning('Cleaning database...')

    # Delete place_classtype tables corresponding to class/type which
    # are not on the wiki anymore.
    with self.db_connection.cursor() as db_cursor:
        for table in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_deleted()
            db_cursor.drop_table(table)