"""
    Module containing the class handling the import
    of the special phrases.

    Phrases are analyzed and imported into the database.

    The phrases already present in the database which are not
    valid anymore are removed.
"""
import json
import logging
import os
import re
import subprocess
from os.path import isfile
from pathlib import Path

from psycopg2.sql import Identifier, Literal, SQL
from nominatim.errors import UsageError
from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
# Logger shared by the whole module; getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
# pylint: disable-msg=too-many-instance-attributes
# Class handling the process of importing special phrases into the database.
#
# Takes an SP loader which loads the phrases from an external source.
def __init__(self, config, phplib_dir, db_connection, sp_loader) -> None:
    """
        Set up the importer.

        config: configuration object, kept as self.config. The methods of
                this class read config_dir, PHRASE_CONFIG,
                TABLESPACE_AUX_DATA and DATABASE_WEBUSER from it.
        phplib_dir: path-like directory containing
                    'migration/PhraseSettingsToJson.php'; only used when a
                    php settings file has to be converted to json.
        db_connection: database connection used for all SQL statements.
        sp_loader: iterable yielding collections of phrases.
    """
    # Has to be set first: _load_white_and_black_lists() below reads self.config.
    self.config = config
    self.phplib_dir = phplib_dir
    self.db_connection = db_connection
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()
    self.black_list, self.white_list = self._load_white_and_black_lists()
    # Class and type values must consist of word characters only.
    self.sanity_check_pattern = re.compile(r'^\w+$')
    # This set will contain all existing phrases to be added.
    # It contains tuples with the following format: (label, class, type, operator)
    self.word_phrases = set()
    # This set will contain all existing place_classtype tables which don't match any
    # special phrases class/type on the wiki.
    self.table_phrases_to_delete = set()
def import_phrases(self, tokenizer):
    """
        Import all phrases provided by the loader into the database.

        Every phrase yielded by self.sp_loader is filtered through
        _process_phrase(). For the accepted class/type pairs the
        place_classtype tables are created, tables without a matching
        phrase are dropped and the transaction is committed. Finally
        the collected phrases are handed to the name analyzer of the
        given tokenizer.

        tokenizer: object whose name_analyzer() returns a context manager
                   providing update_special_phrases().
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    #Store pairs of class/type for further processing
    class_type_pairs = set()

    for loaded_phrases in self.sp_loader:
        for phrase in loaded_phrases:
            result = self._process_phrase(phrase)
            # Skip phrases for which _process_phrase() gave no pair (falsy result).
            if result:
                class_type_pairs.update(result)

    self._create_place_classtype_table_and_indexes(class_type_pairs)
    self._remove_non_existent_tables_from_db()
    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self):
    """
        Fetch existing place_classtype tables.
        Fill the table_phrases_to_delete set of the class.

        Every table of the 'public' schema whose name matches
        'place_classtype_%' is first considered as a table to delete.
    """
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        # The first (and only) column of each row is the table name.
        for row in db_cursor:
            self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self):
    """
        Load white and black lists from phrase-settings.json.

        The file is looked up in self.config.config_dir unless
        self.config.PHRASE_CONFIG is set; in that case the custom file is
        used (after conversion from php to json if needed).

        Return a tuple (black list, white list) taken from the
        'blackList' and 'whiteList' entries of the settings.
    """
    settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

    if self.config.PHRASE_CONFIG:
        settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)

    # JSON is UTF-8 by specification, so don't depend on the locale default.
    with settings_path.open("r", encoding='utf-8') as json_settings:
        settings = json.load(json_settings)
    return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase):
    """
        Check sanity of given inputs in case somebody added garbage in the wiki.

        Both phrase.p_class and phrase.p_type must match
        self.sanity_check_pattern over their whole length.
        A bad class/type is logged with a warning; nothing is raised.

        Return True if the phrase is sane, False otherwise.
    """
    pattern = self.sanity_check_pattern
    # fullmatch() instead of findall(): with a pattern anchored by '$' a value
    # ending in a newline would otherwise still be accepted.
    if pattern.fullmatch(phrase.p_class) and pattern.fullmatch(phrase.p_type):
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    return False
def _process_phrase(self, phrase):
    """
        Processes the given phrase by checking black and white list
        and sanity.

        An accepted phrase is added to self.word_phrases as a
        (label, class, type, operator) tuple.

        Return a set holding the class/type pair corresponding to the
        phrase, or None if the phrase is rejected.
    """
    #blacklisting: disallow certain class/type combinations
    if phrase.p_class in self.black_list \
       and phrase.p_type in self.black_list[phrase.p_class]:
        return None

    #whitelisting: if class is in whitelist, allow only tags in the list
    if phrase.p_class in self.white_list \
       and phrase.p_type not in self.white_list[phrase.p_class]:
        return None

    #sanity check, in case somebody added garbage in the wiki
    if not self._check_sanity(phrase):
        self.statistics_handler.notify_one_phrase_invalid()
        return None

    self.word_phrases.add((phrase.p_label, phrase.p_class,
                           phrase.p_type, phrase.p_operator))

    return {(phrase.p_class, phrase.p_type)}
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
    """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.

        Pairs whose table is listed in self.table_phrases_to_delete are
        skipped and the table is taken off that set, so that it is kept.

        class_type_pairs: iterable of (class, type) tuples.
    """
    LOG.warning('Create tables and indexes...')

    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    if sql_tablespace:
        sql_tablespace = ' TABLESPACE '+sql_tablespace

    # Helper index on placex, only kept while the tables are created;
    # it is dropped again at the end of this method.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    for phrase_class, phrase_type in class_type_pairs:
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

        if table_name in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_ignored()
            #Remove this table from the ones to delete as it matches a class/type
            #still existing on the special phrases of the wiki.
            self.table_phrases_to_delete.remove(table_name)
            #So there is no need to create the table and indexes.
            continue

        #Table creation
        self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

        #Indexes creation
        self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

        #Grant access on read to the web user.
        self._grant_access_to_webuser(phrase_class, phrase_type)

        self.statistics_handler.notify_one_table_created()

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist.

        The table holds place_id and centroid of all placex rows with the
        given class and type.

        sql_tablespace: tablespace clause inserted verbatim after the
                        table name (may be empty).
    """
    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
    with self.db_connection.cursor() as db_cursor:
        # Formatting happens in two stages: str.format() inserts the tablespace
        # clause and reduces the doubled braces to '{}', which SQL.format()
        # then fills with the quoted identifier and literals.
        db_cursor.execute(SQL("""
                CREATE TABLE IF NOT EXISTS {{}} {}
                AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
                          .format(Identifier(table_name), Literal(phrase_class),
                                  Literal(phrase_type)))
def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
    """
        Create indexes on centroid and place_id for the place_classtype table.

        An index is only created when self.db_connection.index_exists()
        reports that it is missing.

        sql_tablespace: tablespace clause appended verbatim to the
                        CREATE INDEX statements (may be empty).
    """
    index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
    base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

    #Index on centroid
    if not self.db_connection.index_exists(index_prefix + 'centroid'):
        with self.db_connection.cursor() as db_cursor:
            # The tablespace clause is already part of the statement text, so
            # no query parameters are passed to execute().
            db_cursor.execute(SQL("""
                CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
                              .format(Identifier(index_prefix + 'centroid'),
                                      Identifier(base_table)))

    #Index on place_id
    if not self.db_connection.index_exists(index_prefix + 'place_id'):
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(
                """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
                              .format(Identifier(index_prefix + 'place_id'),
                                      Identifier(base_table)))
def _grant_access_to_webuser(self, phrase_class, phrase_type):
    """
        Grant access on read to the table place_classtype for the webuser.

        The web user name is taken from self.config.DATABASE_WEBUSER.
    """
    table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                          .format(Identifier(table_name),
                                  Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self):
    """
        Remove special phrases which don't exist on the wiki anymore.
        Delete the place_classtype tables.

        Every table left in self.table_phrases_to_delete is dropped and
        reported to the statistics handler.
    """
    LOG.warning('Cleaning database...')

    #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
    with self.db_connection.cursor() as db_cursor:
        for table in self.table_phrases_to_delete:
            self.statistics_handler.notify_one_table_deleted()
            db_cursor.execute(SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)))
def _convert_php_settings_if_needed(self, file_path):
    """
        Convert php settings file of special phrases to json file if it is still in php format.

        The conversion script is only run for a '.php' file that has no
        '.json' file of the same base name next to it.

        Return the resolved path of the json settings file.
        Raise UsageError if file_path is not an existing file or has an
        extension other than '.php' or '.json'. A failing conversion is
        logged and the subprocess.CalledProcessError is re-raised.
    """
    if not isfile(file_path):
        raise UsageError(str(file_path) + ' is not a valid file.')

    base_name, extension = os.path.splitext(file_path)
    json_file_path = Path(base_name + '.json').resolve()

    if extension not in ('.php', '.json'):
        raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

    if extension == '.php' and not isfile(json_file_path):
        converter = (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve()
        try:
            subprocess.run(['/usr/bin/env', 'php', '-Cq', converter, file_path],
                           check=True)
        except subprocess.CalledProcessError:
            LOG.error('Error while converting %s to json.', file_path)
            # Let the caller see the failure after it has been logged.
            raise
        LOG.warning('special_phrase configuration file has been converted to json.')

    return json_file_path