"""
    Module containing the class handling the import
    of the special phrases.

    Phrases are analyzed and imported into the database.

    The phrases already present in the database which are not
    valid anymore are removed.
"""
import json
import logging
import os
from os.path import isfile
from pathlib import Path
import re
import subprocess

from psycopg2.sql import Identifier, Literal, SQL
from nominatim.errors import UsageError
from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
# Logger used by every method below; this is the root logger, so its
# output follows whatever logging configuration the application installs.
LOG = logging.getLogger()
# pylint: disable-msg=too-many-instance-attributes
"""
    Class handling the process of special phrases importation into the database.

    Takes a sp loader which loads the phrases from an external source.
"""
def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
    """
        Set up the importer state and load the black and white lists.

        config: configuration object; the importer reads its `config_dir`,
            `PHRASE_CONFIG`, `TABLESPACE_AUX_DATA` and `DATABASE_WEBUSER`
            attributes.
        phplib_dir: path-like directory containing the PHP helper scripts.
        db_connection: database connection used for all SQL statements.
        sp_loader: iterable yielding collections of special phrases.
    """
    # NOTE(review): this assignment is missing from the extracted source,
    # but _load_white_and_black_lists() reads self.config a few lines
    # below, so it has to be set here - confirm against the real file.
    self.config = config
    self.phplib_dir = phplib_dir
    self.db_connection = db_connection
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # A class or type is accepted only if it consists of word characters.
    self.sanity_check_pattern = re.compile(r'^\w+$')
    # This set will contain all existing phrases to be added.
    # It contains tuples with the following format: (label, class, type, operator)
    self.word_phrases = set()
    # This set will contain all existing place_classtype tables which don't
    # match any special phrases class/type on the wiki.
    self.table_phrases_to_delete = set()
def import_phrases(self, tokenizer, should_replace):
    """
        Iterate through all SpecialPhrases extracted from the
        loader and import them into the database.

        If should_replace is set to True only the loaded phrases
        will be kept into the database. All other phrases already
        in the database will be removed.

        tokenizer: object providing a `name_analyzer()` context manager
            whose result implements `update_special_phrases()`.
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    # Class/type pairs of all accepted phrases, used for table creation.
    class_type_pairs = set()

    for loaded_phrases in self.sp_loader:
        for phrase in loaded_phrases:
            result = self._process_phrase(phrase)
            # NOTE(review): guard restored from a gap in the extracted
            # source; a rejected phrase yields nothing to add.
            if result:
                class_type_pairs.update(result)

    self._create_place_classtype_table_and_indexes(class_type_pairs)
    # NOTE(review): this condition is missing from the extracted source
    # and restored from the docstring above (tables are only removed when
    # should_replace is set) - confirm against the real file.
    if should_replace:
        self._remove_non_existent_tables_from_db()
    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases, should_replace)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self):
    """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class.
    """
    # NOTE(review): the head of this statement ("SELECT table_name") is
    # missing from the extracted source; it is inferred from the
    # information_schema.tables source and the single column read below.
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        # Every table found is a deletion candidate until a loaded
        # phrase claims it again.
        for row in db_cursor:
            self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self):
    """
        Load white and black lists from phrase-settings.json.

        The file named by the PHRASE_CONFIG setting is used when that
        setting is non-empty; otherwise 'phrase-settings.json' from the
        configuration directory is read.

        Returns a tuple (black_list, white_list) taken from the
        'blackList' and 'whiteList' keys of the settings file.
    """
    if self.config.PHRASE_CONFIG:
        settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
    else:
        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with settings_path.open("r", encoding='utf-8') as json_settings:
        settings = json.load(json_settings)
    return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase):
    """
        Check sanity of given inputs in case somebody added garbage in the wiki.

        Return True when both the class and the type of the phrase match
        the sanity pattern. Otherwise a warning is logged and False is
        returned so that the caller can skip the phrase.
    """
    class_ok = self.sanity_check_pattern.search(phrase.p_class)
    type_ok = self.sanity_check_pattern.search(phrase.p_type)

    if class_ok and type_ok:
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    # NOTE(review): both return statements are missing from the extracted
    # source; they are restored from the caller, which tests the result
    # with `if not self._check_sanity(phrase)`.
    return False
121 def _process_phrase(self, phrase):
123 Processes the given phrase by checking black and white list
125 Return the class/type pair corresponding to the phrase.
128 #blacklisting: disallow certain class/type combinations
129 if phrase.p_class in self.black_list.keys() \
130 and phrase.p_type in self.black_list[phrase.p_class]:
133 #whitelisting: if class is in whitelist, allow only tags in the list
134 if phrase.p_class in self.white_list.keys() \
135 and phrase.p_type not in self.white_list[phrase.p_class]:
138 #sanity check, in case somebody added garbage in the wiki
139 if not self._check_sanity(phrase):
140 self.statistics_handler.notify_one_phrase_invalid()
143 self.word_phrases.add((phrase.p_label, phrase.p_class,
144 phrase.p_type, phrase.p_operator))
146 return set({(phrase.p_class, phrase.p_type)})
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
    """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.

        A pair whose table already exists is skipped; its table is also
        removed from table_phrases_to_delete so that it is kept.
    """
    LOG.warning('Create tables and indexes...')

    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    # NOTE(review): guard restored from a gap in the extracted source;
    # without it an unset tablespace would yield a dangling clause.
    if sql_tablespace:
        sql_tablespace = ' TABLESPACE ' + sql_tablespace

    # Temporary index on placex, dropped again at the end of this method;
    # presumably there to speed up the per-pair class/type selections.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    for phrase_class, phrase_type in class_type_pairs:
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

        if table_name in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_ignored()
            # Remove this table from the ones to delete as it matches a
            # class/type still existing on the special phrases of the wiki.
            self.table_phrases_to_delete.remove(table_name)
            # So there is no need to create the table and indexes.
            # NOTE(review): `continue` restored from a gap in the source.
            continue

        # Table creation
        self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

        # Indexes creation
        self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

        # Grant access on read to the web user.
        self._grant_access_to_webuser(phrase_class, phrase_type)

        self.statistics_handler.notify_one_table_created()

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist.

        sql_tablespace: ready-made ' TABLESPACE <name>' clause, or an
            empty value when no tablespace applies.
    """
    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
    # The tablespace clause is composed as raw SQL, so braces in it cannot
    # disturb the placeholders; class and type are bound as quoted literals.
    statement = SQL("""
            CREATE TABLE IF NOT EXISTS {} {}
            AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
            WHERE class = {} AND type = {}""").format(
                Identifier(table_name), SQL(sql_tablespace or ''),
                Literal(phrase_class), Literal(phrase_type))
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(statement)
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create indexes on centroid and place_id for the place_classtype table.

        An index that already exists is left untouched.

        sql_tablespace: ready-made ' TABLESPACE <name>' clause, or an
            empty value when no tablespace applies.
    """
    index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
    base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
    tablespace_clause = SQL(sql_tablespace or '')

    index_statements = (
        ('centroid', 'CREATE INDEX {} ON {} USING GIST (centroid) {}'),
        ('place_id', 'CREATE INDEX {} ON {} USING btree(place_id) {}'),
    )
    for suffix, statement in index_statements:
        index_name = index_prefix + suffix
        if self.db_connection.index_exists(index_name):
            continue
        with self.db_connection.cursor() as db_cursor:
            # The tablespace clause belongs to the statement text. It must
            # not be handed to execute() as query parameters: the statement
            # has no placeholders, so a non-empty value makes it fail.
            db_cursor.execute(SQL(statement).format(Identifier(index_name),
                                                    Identifier(base_table),
                                                    tablespace_clause))
def _grant_access_to_webuser(self, phrase_class, phrase_type):
    """
        Grant access on read to the table place_classtype for the webuser.

        The user name is taken from the DATABASE_WEBUSER setting.
    """
    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
    # Table and role names are both quoted as SQL identifiers.
    grant = SQL("""GRANT SELECT ON {} TO {}""").format(
        Identifier(table_name), Identifier(self.config.DATABASE_WEBUSER))
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(grant)
def _remove_non_existent_tables_from_db(self):
    """
        Remove special phrases which don't exist on the wiki anymore.
        Delete the place_classtype tables.

        Every table still listed in table_phrases_to_delete is dropped and
        reported to the statistics handler.
    """
    LOG.warning('Cleaning database...')

    # Delete place_classtype tables corresponding to class/type which are
    # not on the wiki anymore.
    with self.db_connection.cursor() as db_cursor:
        for table in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_deleted()
            db_cursor.execute(SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)))
def _convert_php_settings_if_needed(self, file_path):
    """
        Convert php settings file of special phrases to json file if it is still in php format.

        Return the resolved path of the json file: the sibling of
        file_path carrying the '.json' extension.

        Raises UsageError when file_path is not an existing file or has an
        extension other than '.php' or '.json'. A failing conversion is
        logged and the subprocess error is propagated.
    """
    if not isfile(file_path):
        raise UsageError(str(file_path) + ' is not a valid file.')

    base_name, extension = os.path.splitext(file_path)
    json_file_path = Path(base_name + '.json').resolve()

    if extension not in ('.php', '.json'):
        raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

    # Convert only when no json file sits next to the php file yet.
    if extension == '.php' and not isfile(json_file_path):
        converter = (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve()
        try:
            subprocess.run(['/usr/bin/env', 'php', '-Cq', converter, file_path],
                           check=True)
        except subprocess.CalledProcessError:
            LOG.error('Error while converting %s to json.', file_path)
            # NOTE(review): the re-raise is missing from the extracted
            # source and restored here; confirm against the real file.
            raise
        LOG.warning('special_phrase configuration file has been converted to json.')

    return json_file_path