2 Functions to import special phrases into the database.
6 from os.path import isfile
7 from pathlib import Path
12 from icu import Transliterator
13 from psycopg2.sql import Identifier, Literal, SQL
15 from nominatim.tools.exec_utils import get_url
16 from nominatim.errors import UsageError
18 LOG = logging.getLogger()
19 class SpecialPhrasesImporter():
20 # pylint: disable-msg=too-many-instance-attributes
22 Class handling the process of special phrases importations.
24 def __init__(self, config, phplib_dir, db_connection) -> None:
25 self.db_connection = db_connection
27 self.phplib_dir = phplib_dir
28 self.black_list, self.white_list = self._load_white_and_black_lists()
29 #Compile the regex here to increase performances.
30 self.occurence_pattern = re.compile(
31 r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
33 self.sanity_check_pattern = re.compile(r'^\w+$')
34 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
35 self.config.TERM_NORMALIZATION)
36 #This set will contain all existing phrases from the word table which
37 #no longer exist on the wiki.
38 #It contain tuples with the following format: (normalized_word, class, type, operator)
39 self.words_phrases_to_delete = set()
40 #This set will contain the phrases which still exist from the wiki.
41 #It is used to prevent duplicates on the wiki by removing them from
42 #the word_phrases_to_delete only at the end.
43 self.words_phrases_still_exist = set()
44 #This set will contain all existing place_classtype tables which doesn't match any
45 #special phrases class/type on the wiki.
46 self.table_phrases_to_delete = set()
48 def import_from_wiki(self, languages=None):
50 Iterate through all specified languages and
51 extract corresponding special phrases from the wiki.
53 if languages is not None and not isinstance(languages, list):
54 raise TypeError('The \'languages\' argument should be of type list.')
56 self._fetch_existing_words_phrases()
57 self._fetch_existing_place_classtype_tables()
59 #Get all languages to process.
60 languages = self._load_languages() if not languages else languages
62 #Store pairs of class/type for further processing
63 class_type_pairs = set()
65 for lang in languages:
66 LOG.warning('Import phrases for lang: %s', lang)
67 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
68 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
70 self._create_place_classtype_table_and_indexes(class_type_pairs)
71 self._remove_non_existent_phrases_from_db()
72 self.db_connection.commit()
73 LOG.warning('Import done.')
75 def _fetch_existing_words_phrases(self):
77 Fetch existing special phrases from the word table.
78 Fill the word_phrases_to_delete set of the class.
80 #Only extract special phrases terms:
81 #If class=place and type=house then it is a housenumber term.
82 #If class=place and type=postcode then it is a postcode term.
84 SELECT word, class, type, operator FROM word
85 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
87 with self.db_connection.cursor() as db_cursor:
88 db_cursor.execute(SQL(word_query))
90 row[3] = '-' if row[3] is None else row[3]
91 self.words_phrases_to_delete.add(
92 (row[0], row[1], row[2], row[3])
95 def _fetch_existing_place_classtype_tables(self):
97 Fetch existing place_classtype tables.
98 Fill the table_phrases_to_delete set of the class.
102 FROM information_schema.tables
103 WHERE table_schema='public'
104 AND table_name like 'place_classtype_%';
106 with self.db_connection.cursor() as db_cursor:
107 db_cursor.execute(SQL(query))
108 for row in db_cursor:
109 self.table_phrases_to_delete.add(row[0])
111 def _load_white_and_black_lists(self):
113 Load white and black lists from phrases-settings.json.
115 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
117 if self.config.PHRASE_CONFIG:
118 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
120 with settings_path.open("r") as json_settings:
121 settings = json.load(json_settings)
122 return settings['blackList'], settings['whiteList']
124 def _load_languages(self):
126 Get list of all languages from env config file
127 or default if there is no languages configured.
128 The system will extract special phrases only from all specified languages.
130 default_languages = [
131 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
132 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
133 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
134 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
135 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
138 def _get_wiki_content(lang):
140 Request and return the wiki page's content
141 corresponding to special phrases for a given lang.
142 Requested URL Example :
143 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
145 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
148 def _check_sanity(self, lang, phrase_class, phrase_type):
150 Check sanity of given inputs in case somebody added garbage in the wiki.
151 If a bad class/type is detected the system will exit with an error.
153 type_matchs = self.sanity_check_pattern.findall(phrase_type)
154 class_matchs = self.sanity_check_pattern.findall(phrase_class)
156 if not class_matchs or not type_matchs:
157 LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
158 lang, phrase_class, phrase_type)
162 def _process_xml_content(self, xml_content, lang):
164 Process given xml content by extracting matching patterns.
165 Matching patterns are processed there and returned in a
166 set of class/type pairs.
168 #One match will be of format [label, class, type, operator, plural]
169 matches = self.occurence_pattern.findall(xml_content)
170 #Store pairs of class/type for further processing
171 class_type_pairs = set()
173 for match in matches:
174 phrase_label = match[0].strip()
175 normalized_label = self.transliterator.transliterate(phrase_label)
176 phrase_class = match[1].strip()
177 phrase_type = match[2].strip()
178 phrase_operator = match[3].strip()
179 #Needed if some operator in the wiki are not written in english
180 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
181 #hack around a bug where building=yes was imported with quotes into the wiki
182 phrase_type = re.sub(r'\"|"', '', phrase_type)
184 #blacklisting: disallow certain class/type combinations
186 phrase_class in self.black_list.keys() and
187 phrase_type in self.black_list[phrase_class]
190 #whitelisting: if class is in whitelist, allow only tags in the list
192 phrase_class in self.white_list.keys() and
193 phrase_type not in self.white_list[phrase_class]
197 #Check if the phrase already exists in the database.
199 (normalized_label, phrase_class, phrase_type, phrase_operator)
200 in self.words_phrases_to_delete
202 #Remove this phrase from the ones to delete as it still exist on the wiki.
203 self.words_phrases_still_exist.add(
204 (normalized_label, phrase_class, phrase_type, phrase_operator)
206 class_type_pairs.add((phrase_class, phrase_type))
207 #Dont need to add this phrase as it already exists in the word table.
210 #sanity check, in case somebody added garbage in the wiki
211 if not self._check_sanity(lang, phrase_class, phrase_type):
214 class_type_pairs.add((phrase_class, phrase_type))
216 self._process_amenity(
217 phrase_label, normalized_label, phrase_class,
218 phrase_type, phrase_operator
221 return class_type_pairs
223 def _process_amenity(self, phrase_label, normalized_label,
224 phrase_class, phrase_type, phrase_operator):
225 # pylint: disable-msg=too-many-arguments
227 Add phrase lookup and corresponding class and
228 type to the word table based on the operator.
230 with self.db_connection.cursor() as db_cursor:
231 if phrase_operator == 'near':
232 db_cursor.execute("""SELECT getorcreate_amenityoperator(
233 make_standard_name(%s), %s, %s, %s, 'near')""",
234 (phrase_label, normalized_label, phrase_class, phrase_type))
235 elif phrase_operator == 'in':
236 db_cursor.execute("""SELECT getorcreate_amenityoperator(
237 make_standard_name(%s), %s, %s, %s, 'in')""",
238 (phrase_label, normalized_label, phrase_class, phrase_type))
240 db_cursor.execute("""SELECT getorcreate_amenity(
241 make_standard_name(%s), %s, %s, %s)""",
242 (phrase_label, normalized_label, phrase_class, phrase_type))
245 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
247 Create table place_classtype for each given pair.
248 Also create indexes on place_id and centroid.
250 LOG.warning('Create tables and indexes...')
252 sql_tablespace = self.config.TABLESPACE_AUX_DATA
254 sql_tablespace = ' TABLESPACE '+sql_tablespace
256 with self.db_connection.cursor() as db_cursor:
257 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
259 for pair in class_type_pairs:
260 phrase_class = pair[0]
261 phrase_type = pair[1]
263 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
265 if table_name in self.table_phrases_to_delete:
266 #Remove this table from the ones to delete as it match a class/type
267 #still existing on the special phrases of the wiki.
268 self.table_phrases_to_delete.remove(table_name)
269 #So dont need to create the table and indexes.
273 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
276 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
278 #Grant access on read to the web user.
279 self._grant_access_to_webuser(phrase_class, phrase_type)
281 with self.db_connection.cursor() as db_cursor:
282 db_cursor.execute("DROP INDEX idx_placex_classtype")
285 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
287 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
289 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
290 with self.db_connection.cursor() as db_cursor:
291 db_cursor.execute(SQL("""
292 CREATE TABLE IF NOT EXISTS {{}} {}
293 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
294 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
295 .format(Identifier(table_name), Literal(phrase_class),
296 Literal(phrase_type)))
299 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
301 Create indexes on centroid and place_id for the place_classtype table.
303 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
304 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
306 if not self.db_connection.index_exists(index_prefix + 'centroid'):
307 with self.db_connection.cursor() as db_cursor:
308 db_cursor.execute(SQL("""
309 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
310 .format(Identifier(index_prefix + 'centroid'),
311 Identifier(base_table)), sql_tablespace)
314 if not self.db_connection.index_exists(index_prefix + 'place_id'):
315 with self.db_connection.cursor() as db_cursor:
316 db_cursor.execute(SQL(
317 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
318 .format(Identifier(index_prefix + 'place_id'),
319 Identifier(base_table)))
322 def _grant_access_to_webuser(self, phrase_class, phrase_type):
324 Grant access on read to the table place_classtype for the webuser.
326 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
327 with self.db_connection.cursor() as db_cursor:
328 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
329 .format(Identifier(table_name),
330 Identifier(self.config.DATABASE_WEBUSER)))
332 def _remove_non_existent_phrases_from_db(self):
334 Remove special phrases which doesn't exist on the wiki anymore.
335 Delete from the word table and delete the place_classtype tables.
337 LOG.warning('Cleaning database...')
338 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
339 #Array containing all queries to execute. Contain tuples of format (query, parameters)
340 queries_parameters = []
342 #Delete phrases from the word table which are not on the wiki anymore.
343 for phrase_to_delete in self.words_phrases_to_delete:
344 if phrase_to_delete[3] == '-':
346 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
348 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
349 queries_parameters.append((query, parameters))
352 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
354 parameters = (phrase_to_delete[0], phrase_to_delete[1],
355 phrase_to_delete[2], phrase_to_delete[3], )
356 queries_parameters.append((query, parameters))
358 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
359 for table in self.table_phrases_to_delete:
360 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
361 queries_parameters.append((query, ()))
363 with self.db_connection.cursor() as db_cursor:
364 for query, parameters in queries_parameters:
365 db_cursor.execute(query, parameters)
367 def _convert_php_settings_if_needed(self, file_path):
369 Convert php settings file of special phrases to json file if it is still in php format.
371 if not isfile(file_path):
372 raise UsageError(str(file_path) + ' is not a valid file.')
374 file, extension = os.path.splitext(file_path)
375 json_file_path = Path(file + '.json').resolve()
377 if extension not in('.php', '.json'):
378 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
380 if extension == '.php' and not isfile(json_file_path):
382 subprocess.run(['/usr/bin/env', 'php', '-Cq',
383 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
384 file_path], check=True)
385 LOG.warning('special_phrase configuration file has been converted to json.')
386 return json_file_path
387 except subprocess.CalledProcessError:
388 LOG.error('Error while converting %s to json.', file_path)
391 return json_file_path