2 Module containing the class handling the import
3 of the special phrases.
5 Phrases are analyzed and imported into the database.
7 The phrases already present in the database which are no
8 longer valid are removed.
12 from os.path import isfile
13 from pathlib import Path
18 from psycopg2.sql import Identifier, Literal, SQL
19 from nominatim.errors import UsageError
20 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
22 LOG = logging.getLogger()
def _classtype_table(phrase_class, phrase_type):
    """Build the name of the place_classtype table for a class/type pair.

    The name is the fixed prefix ``place_classtype_`` followed by the
    class and the type, separated by an underscore.
    """
    return 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
30 # pylint: disable-msg=too-many-instance-attributes
32 Class handling the process of special phrases importation into the database.
34 Takes an sp loader which loads the phrases from an external source.
def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
    """Set up the importer and load the black and white lists.

    config: configuration object, read later through ``self.config``.
    phplib_dir: path object under which the PHP settings migration
        script is looked up.
    db_connection: database connection used for all SQL statements.
    sp_loader: iterable yielding collections of phrases to import.
    """
    # The configuration must be stored before the lists are loaded:
    # _load_white_and_black_lists() reads self.config.
    self.config = config
    self.phplib_dir = phplib_dir
    self.db_connection = db_connection
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()
    self.black_list, self.white_list = self._load_white_and_black_lists()
    self.sanity_check_pattern = re.compile(r'^\w+$')
    # This set will contain all existing phrases to be added.
    # It contains tuples with the following format: (label, class, type, operator)
    self.word_phrases = set()
    # This set will contain all existing place_classtype tables which don't match any
    # special phrases class/type on the wiki.
    self.table_phrases_to_delete = set()
def import_phrases(self, tokenizer, should_replace):
    """Import all phrases provided by the loader into the database.

    Every phrase yielded by the loader is filtered through
    _process_phrase(). For the accepted class/type pairs the
    place_classtype tables and their indexes are created, then the
    accepted phrases are handed to the tokenizer's name analyzer.

    If should_replace is set to True only the loaded phrases
    will be kept in the database. All other phrases already
    in the database will be removed.
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    # Store pairs of class/type for further processing
    class_type_pairs = set()

    for loaded_phrases in self.sp_loader:
        for phrase in loaded_phrases:
            result = self._process_phrase(phrase)
            # Rejected phrases yield no pair and must not reach table creation.
            if result:
                class_type_pairs.add(result)

    self._create_place_classtype_table_and_indexes(class_type_pairs)
    # Tables of phrases that were not loaded are only dropped when the
    # caller asked to replace the existing phrases.
    if should_replace:
        self._remove_non_existent_tables_from_db()
    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases, should_replace)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self):
    """Record the place_classtype tables currently present in the database.

    The names of all tables of the public schema matching
    'place_classtype_%' are added to self.table_phrases_to_delete.
    """
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        # The statement has no parameters, so it is sent as a plain string.
        db_cursor.execute(query)
        self.table_phrases_to_delete.update(row[0] for row in db_cursor)
def _load_white_and_black_lists(self):
    """Load the black and white lists from the phrase settings file.

    The settings are read from the file configured in PHRASE_CONFIG
    (converted to json first when needed), or from 'phrase-settings.json'
    in the configuration directory when PHRASE_CONFIG is not set.

    Returns a tuple (black_list, white_list) taken from the 'blackList'
    and 'whiteList' entries of the settings.
    """
    if self.config.PHRASE_CONFIG:
        settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
    else:
        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

    # Decode explicitly as UTF-8 instead of relying on the locale's
    # default encoding.
    with settings_path.open('r', encoding='utf-8') as json_settings:
        settings = json.load(json_settings)

    return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase):
    """Check the class and type of a phrase, in case somebody added garbage in the wiki.

    Both values must consist entirely of word characters. Returns True
    when they do; otherwise a warning is logged and False is returned so
    that the caller can skip the phrase.
    """
    pattern = self.sanity_check_pattern
    # fullmatch() must cover the whole string, so a value with a trailing
    # newline is rejected as well.
    if pattern.fullmatch(phrase.p_class) and pattern.fullmatch(phrase.p_type):
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    return False
def _process_phrase(self, phrase):
    """Filter a phrase through the black list, the white list and the sanity check.

    An accepted phrase is added to self.word_phrases as a tuple
    (label, class, type, operator).

    Return the class/type pair corresponding to the phrase, or None
    when the phrase is rejected.
    """
    # blacklisting: disallow certain class/type combinations
    if phrase.p_class in self.black_list \
       and phrase.p_type in self.black_list[phrase.p_class]:
        return None

    # whitelisting: if class is in whitelist, allow only tags in the list
    if phrase.p_class in self.white_list \
       and phrase.p_type not in self.white_list[phrase.p_class]:
        return None

    # sanity check, in case somebody added garbage in the wiki
    if not self._check_sanity(phrase):
        self.statistics_handler.notify_one_phrase_invalid()
        return None

    self.word_phrases.add((phrase.p_label, phrase.p_class,
                           phrase.p_type, phrase.p_operator))

    return (phrase.p_class, phrase.p_type)
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
    """Create the place_classtype table for each given class/type pair.

    Indexes on place_id and centroid are created as well and read access
    is granted to the web user. Pairs whose table already exists are
    skipped and their table is removed from self.table_phrases_to_delete.
    """
    LOG.warning('Create tables and indexes...')

    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    # Only emit a TABLESPACE clause when a tablespace is configured.
    sql_tablespace = ' TABLESPACE ' + sql_tablespace if sql_tablespace else ''

    # Index on placex that exists only while the tables are being created;
    # it is dropped again at the end of this method.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    for phrase_class, phrase_type in class_type_pairs:
        table_name = _classtype_table(phrase_class, phrase_type)

        if table_name in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_ignored()
            # Remove this table from the ones to delete as it matches a
            # class/type still existing in the special phrases of the wiki.
            self.table_phrases_to_delete.remove(table_name)
            # So there is no need to create the table and indexes.
            continue

        # Table creation
        self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

        # Indexes creation
        self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

        # Grant access on read to the web user.
        self._grant_access_to_webuser(phrase_class, phrase_type)

        self.statistics_handler.notify_one_table_created()

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
    """Create the place_classtype table for the given class/type if it doesn't exist.

    The table is filled with the place_id and the centroid of all placex
    rows having the given class and type.
    """
    table_name = _classtype_table(phrase_class, phrase_type)
    # First formatting pass (plain str.format): inserts the tablespace
    # clause and turns each '{{}}' into a '{}' placeholder for the
    # second, psycopg2-level pass.
    statement_template = """
        CREATE TABLE IF NOT EXISTS {{}} {}
        AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
        WHERE class = {{}} AND type = {{}}""".format(sql_tablespace)

    with self.db_connection.cursor() as db_cursor:
        statement = SQL(statement_template).format(Identifier(table_name),
                                                   Literal(phrase_class),
                                                   Literal(phrase_type))
        db_cursor.execute(statement)
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
    """Create indexes on centroid and place_id for the place_classtype table.

    An index is only created when no index with its name exists yet.
    sql_tablespace is appended verbatim to each CREATE INDEX statement.
    """
    index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
    base_table = _classtype_table(phrase_class, phrase_type)

    # (index name suffix, statement template). '{{}}' becomes a '{}'
    # placeholder for the identifiers after the tablespace clause has
    # been inserted with str.format.
    index_definitions = (
        ('centroid', "CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}"),
        ('place_id', "CREATE INDEX {{}} ON {{}} USING btree(place_id) {}"),
    )

    for suffix, template in index_definitions:
        index_name = index_prefix + suffix
        if self.db_connection.index_exists(index_name):
            continue

        with self.db_connection.cursor() as db_cursor:
            # The statement is fully composed here, so no query
            # parameters are passed to execute().
            db_cursor.execute(SQL(template.format(sql_tablespace))
                              .format(Identifier(index_name),
                                      Identifier(base_table)))
def _grant_access_to_webuser(self, phrase_class, phrase_type):
    """Give the web user read access to a place_classtype table.

    The table is the one belonging to the given class/type pair; the
    user name is taken from the DATABASE_WEBUSER configuration value.
    """
    target_table = _classtype_table(phrase_class, phrase_type)
    with self.db_connection.cursor() as db_cursor:
        grant_statement = SQL("""GRANT SELECT ON {} TO {}""").format(
            Identifier(target_table),
            Identifier(self.config.DATABASE_WEBUSER))
        db_cursor.execute(grant_statement)
def _remove_non_existent_tables_from_db(self):
    """Drop the place_classtype tables of phrases no longer on the wiki.

    Every table left in self.table_phrases_to_delete is dropped and
    reported to the statistics handler as deleted.
    """
    LOG.warning('Cleaning database...')

    # Collect the statements first: (query, parameters) tuples, one per
    # table whose class/type is not on the wiki anymore.
    pending_drops = []
    for table_name in self.table_phrases_to_delete:
        self.statistics_handler.notify_one_table_deleted()
        drop_query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table_name))
        pending_drops.append((drop_query, ()))

    with self.db_connection.cursor() as db_cursor:
        for drop_query, drop_parameters in pending_drops:
            db_cursor.execute(drop_query, drop_parameters)
def _convert_php_settings_if_needed(self, file_path):
    """Convert a php special phrases settings file to json if needed.

    file_path must point to an existing '.php' or '.json' file. For a
    php file without a json file of the same base name next to it, the
    PHP migration script is run to do the conversion.

    Returns the resolved path of the json file.
    Raises UsageError when the file does not exist or has another
    extension, and subprocess.CalledProcessError when the conversion
    fails.
    """
    if not isfile(file_path):
        raise UsageError(str(file_path) + ' is not a valid file.')

    base_name, extension = os.path.splitext(file_path)
    json_file_path = Path(base_name + '.json').resolve()

    if extension not in ('.php', '.json'):
        raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

    if extension == '.php' and not isfile(json_file_path):
        try:
            subprocess.run(['/usr/bin/env', 'php', '-Cq',
                            (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
                            file_path], check=True)
            LOG.warning('special_phrase configuration file has been converted to json.')
        except subprocess.CalledProcessError:
            LOG.error('Error while converting %s to json.', file_path)
            # Do not hand back the path of a json file that was never written.
            raise

    return json_file_path