2 Functions to import special phrases into the database.
6 from os.path import isfile
7 from pathlib import Path
12 from icu import Transliterator
13 from psycopg2.sql import Identifier, Literal, SQL
15 from nominatim.tools.exec_utils import get_url
16 from nominatim.errors import UsageError
17 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
19 LOG = logging.getLogger()
20 class SpecialPhrasesImporter():
21 # pylint: disable-msg=too-many-instance-attributes
23 Class handling the process of special phrases importations.
25 def __init__(self, config, phplib_dir, db_connection) -> None:
26 self.statistics_handler = SpecialPhrasesImporterStatistics()
27 self.db_connection = db_connection
29 self.phplib_dir = phplib_dir
30 self.black_list, self.white_list = self._load_white_and_black_lists()
31 #Compile the regex here to increase performances.
32 self.occurence_pattern = re.compile(
33 r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
35 self.sanity_check_pattern = re.compile(r'^\w+$')
36 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
37 self.config.TERM_NORMALIZATION)
38 #This set will contain all existing phrases from the word table which
39 #no longer exist on the wiki.
40 #It contain tuples with the following format: (normalized_word, class, type, operator)
41 self.words_phrases_to_delete = set()
42 #This set will contain the phrases which still exist from the wiki.
43 #It is used to prevent duplicates on the wiki by removing them from
44 #the word_phrases_to_delete only at the end.
45 self.words_phrases_still_exist = set()
46 #This set will contain all existing place_classtype tables which doesn't match any
47 #special phrases class/type on the wiki.
48 self.table_phrases_to_delete = set()
50 def import_from_wiki(self, languages=None):
52 Iterate through all specified languages and
53 extract corresponding special phrases from the wiki.
55 if languages is not None and not isinstance(languages, list):
56 raise TypeError('The \'languages\' argument should be of type list.')
58 self._fetch_existing_words_phrases()
59 self._fetch_existing_place_classtype_tables()
61 #Get all languages to process.
62 languages = self._load_languages() if not languages else languages
64 #Store pairs of class/type for further processing
65 class_type_pairs = set()
67 for lang in languages:
68 LOG.warning('Importing phrases for lang: %s...', lang)
69 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
70 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
71 self.statistics_handler.notify_current_lang_done(lang)
73 self._create_place_classtype_table_and_indexes(class_type_pairs)
74 self._remove_non_existent_phrases_from_db()
75 self.db_connection.commit()
76 LOG.warning('Import done.')
77 self.statistics_handler.notify_import_done()
79 def _fetch_existing_words_phrases(self):
81 Fetch existing special phrases from the word table.
82 Fill the word_phrases_to_delete set of the class.
84 #Only extract special phrases terms:
85 #If class=place and type=house then it is a housenumber term.
86 #If class=place and type=postcode then it is a postcode term.
88 SELECT word, class, type, operator FROM word
89 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
91 with self.db_connection.cursor() as db_cursor:
92 db_cursor.execute(SQL(word_query))
94 row[3] = '-' if row[3] is None else row[3]
95 self.words_phrases_to_delete.add(
96 (row[0], row[1], row[2], row[3])
99 def _fetch_existing_place_classtype_tables(self):
101 Fetch existing place_classtype tables.
102 Fill the table_phrases_to_delete set of the class.
106 FROM information_schema.tables
107 WHERE table_schema='public'
108 AND table_name like 'place_classtype_%';
110 with self.db_connection.cursor() as db_cursor:
111 db_cursor.execute(SQL(query))
112 for row in db_cursor:
113 self.table_phrases_to_delete.add(row[0])
115 def _load_white_and_black_lists(self):
117 Load white and black lists from phrases-settings.json.
119 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
121 if self.config.PHRASE_CONFIG:
122 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
124 with settings_path.open("r") as json_settings:
125 settings = json.load(json_settings)
126 return settings['blackList'], settings['whiteList']
128 def _load_languages(self):
130 Get list of all languages from env config file
131 or default if there is no languages configured.
132 The system will extract special phrases only from all specified languages.
134 default_languages = [
135 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
136 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
137 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
138 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
139 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
142 def _get_wiki_content(lang):
144 Request and return the wiki page's content
145 corresponding to special phrases for a given lang.
146 Requested URL Example :
147 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
149 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
152 def _check_sanity(self, lang, phrase_class, phrase_type):
154 Check sanity of given inputs in case somebody added garbage in the wiki.
155 If a bad class/type is detected the system will exit with an error.
157 type_matchs = self.sanity_check_pattern.findall(phrase_type)
158 class_matchs = self.sanity_check_pattern.findall(phrase_class)
160 if not class_matchs or not type_matchs:
161 LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
162 lang, phrase_class, phrase_type)
166 def _process_xml_content(self, xml_content, lang):
168 Process given xml content by extracting matching patterns.
169 Matching patterns are processed there and returned in a
170 set of class/type pairs.
172 #One match will be of format [label, class, type, operator, plural]
173 matches = self.occurence_pattern.findall(xml_content)
174 #Store pairs of class/type for further processing
175 class_type_pairs = set()
177 for match in matches:
178 phrase_label = match[0].strip()
179 normalized_label = self.transliterator.transliterate(phrase_label)
180 phrase_class = match[1].strip()
181 phrase_type = match[2].strip()
182 phrase_operator = match[3].strip()
183 #Needed if some operator in the wiki are not written in english
184 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
185 #hack around a bug where building=yes was imported with quotes into the wiki
186 phrase_type = re.sub(r'\"|"', '', phrase_type)
188 #blacklisting: disallow certain class/type combinations
190 phrase_class in self.black_list.keys() and
191 phrase_type in self.black_list[phrase_class]
194 #whitelisting: if class is in whitelist, allow only tags in the list
196 phrase_class in self.white_list.keys() and
197 phrase_type not in self.white_list[phrase_class]
201 #Check if the phrase already exists in the database.
203 (normalized_label, phrase_class, phrase_type, phrase_operator)
204 in self.words_phrases_to_delete
206 #Remove this phrase from the ones to delete as it still exist on the wiki.
207 self.words_phrases_still_exist.add(
208 (normalized_label, phrase_class, phrase_type, phrase_operator)
210 class_type_pairs.add((phrase_class, phrase_type))
211 self.statistics_handler.notify_one_phrase_ignored()
212 #Dont need to add this phrase as it already exists in the word table.
215 #sanity check, in case somebody added garbage in the wiki
216 if not self._check_sanity(lang, phrase_class, phrase_type):
217 self.statistics_handler.notify_one_phrase_invalid()
220 class_type_pairs.add((phrase_class, phrase_type))
222 self._process_amenity(
223 phrase_label, normalized_label, phrase_class,
224 phrase_type, phrase_operator
226 self.statistics_handler.notify_one_phrase_added()
228 return class_type_pairs
230 def _process_amenity(self, phrase_label, normalized_label,
231 phrase_class, phrase_type, phrase_operator):
232 # pylint: disable-msg=too-many-arguments
234 Add phrase lookup and corresponding class and
235 type to the word table based on the operator.
237 with self.db_connection.cursor() as db_cursor:
238 if phrase_operator == 'near':
239 db_cursor.execute("""SELECT getorcreate_amenityoperator(
240 make_standard_name(%s), %s, %s, %s, 'near')""",
241 (phrase_label, normalized_label, phrase_class, phrase_type))
242 elif phrase_operator == 'in':
243 db_cursor.execute("""SELECT getorcreate_amenityoperator(
244 make_standard_name(%s), %s, %s, %s, 'in')""",
245 (phrase_label, normalized_label, phrase_class, phrase_type))
247 db_cursor.execute("""SELECT getorcreate_amenity(
248 make_standard_name(%s), %s, %s, %s)""",
249 (phrase_label, normalized_label, phrase_class, phrase_type))
252 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
254 Create table place_classtype for each given pair.
255 Also create indexes on place_id and centroid.
257 LOG.warning('Create tables and indexes...')
259 sql_tablespace = self.config.TABLESPACE_AUX_DATA
261 sql_tablespace = ' TABLESPACE '+sql_tablespace
263 with self.db_connection.cursor() as db_cursor:
264 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
266 for pair in class_type_pairs:
267 phrase_class = pair[0]
268 phrase_type = pair[1]
270 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
272 if table_name in self.table_phrases_to_delete:
273 self.statistics_handler.notify_one_table_ignored()
274 #Remove this table from the ones to delete as it match a class/type
275 #still existing on the special phrases of the wiki.
276 self.table_phrases_to_delete.remove(table_name)
277 #So dont need to create the table and indexes.
281 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
284 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
286 #Grant access on read to the web user.
287 self._grant_access_to_webuser(phrase_class, phrase_type)
289 self.statistics_handler.notify_one_table_created()
291 with self.db_connection.cursor() as db_cursor:
292 db_cursor.execute("DROP INDEX idx_placex_classtype")
295 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
297 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
299 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
300 with self.db_connection.cursor() as db_cursor:
301 db_cursor.execute(SQL("""
302 CREATE TABLE IF NOT EXISTS {{}} {}
303 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
304 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
305 .format(Identifier(table_name), Literal(phrase_class),
306 Literal(phrase_type)))
309 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
311 Create indexes on centroid and place_id for the place_classtype table.
313 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
314 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
316 if not self.db_connection.index_exists(index_prefix + 'centroid'):
317 with self.db_connection.cursor() as db_cursor:
318 db_cursor.execute(SQL("""
319 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
320 .format(Identifier(index_prefix + 'centroid'),
321 Identifier(base_table)), sql_tablespace)
324 if not self.db_connection.index_exists(index_prefix + 'place_id'):
325 with self.db_connection.cursor() as db_cursor:
326 db_cursor.execute(SQL(
327 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
328 .format(Identifier(index_prefix + 'place_id'),
329 Identifier(base_table)))
332 def _grant_access_to_webuser(self, phrase_class, phrase_type):
334 Grant access on read to the table place_classtype for the webuser.
336 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
337 with self.db_connection.cursor() as db_cursor:
338 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
339 .format(Identifier(table_name),
340 Identifier(self.config.DATABASE_WEBUSER)))
342 def _remove_non_existent_phrases_from_db(self):
344 Remove special phrases which doesn't exist on the wiki anymore.
345 Delete from the word table and delete the place_classtype tables.
347 LOG.warning('Cleaning database...')
348 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
349 #Array containing all queries to execute. Contain tuples of format (query, parameters)
350 queries_parameters = []
352 #Delete phrases from the word table which are not on the wiki anymore.
353 for phrase_to_delete in self.words_phrases_to_delete:
354 self.statistics_handler.notify_one_phrase_deleted()
355 if phrase_to_delete[3] == '-':
357 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
359 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
360 queries_parameters.append((query, parameters))
363 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
365 parameters = (phrase_to_delete[0], phrase_to_delete[1],
366 phrase_to_delete[2], phrase_to_delete[3], )
367 queries_parameters.append((query, parameters))
369 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
370 for table in self.table_phrases_to_delete:
371 self.statistics_handler.notify_one_table_deleted()
372 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
373 queries_parameters.append((query, ()))
375 with self.db_connection.cursor() as db_cursor:
376 for query, parameters in queries_parameters:
377 db_cursor.execute(query, parameters)
379 def _convert_php_settings_if_needed(self, file_path):
381 Convert php settings file of special phrases to json file if it is still in php format.
383 if not isfile(file_path):
384 raise UsageError(str(file_path) + ' is not a valid file.')
386 file, extension = os.path.splitext(file_path)
387 json_file_path = Path(file + '.json').resolve()
389 if extension not in('.php', '.json'):
390 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
392 if extension == '.php' and not isfile(json_file_path):
394 subprocess.run(['/usr/bin/env', 'php', '-Cq',
395 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
396 file_path], check=True)
397 LOG.warning('special_phrase configuration file has been converted to json.')
398 return json_file_path
399 except subprocess.CalledProcessError:
400 LOG.error('Error while converting %s to json.', file_path)
403 return json_file_path