2 Functions to import special phrases into the database.
6 from pathlib import Path
10 from os.path import isfile
11 from icu import Transliterator
12 from psycopg2.sql import Identifier, Literal, SQL
13 from nominatim.tools.exec_utils import get_url
14 from nominatim.errors import UsageError
16 LOG = logging.getLogger()
17 class SpecialPhrasesImporter():
18 # pylint: disable-msg=too-many-instance-attributes
19 # pylint: disable-msg=too-few-public-methods
21 Class handling the process of special phrases importations.
23 def __init__(self, config, phplib_dir, db_connection) -> None:
24 self.db_connection = db_connection
26 self.phplib_dir = phplib_dir
27 self.black_list, self.white_list = self._load_white_and_black_lists()
28 #Compile the regex here to increase performances.
29 self.occurence_pattern = re.compile(
30 r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
32 self.sanity_check_pattern = re.compile(r'^\w+$')
33 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
34 self.config.TERM_NORMALIZATION)
35 #This set will contain all existing phrases from the word table which
36 #no longer exist on the wiki.
37 #It contain tuples with the following format: (normalized_word, class, type, operator)
38 self.words_phrases_to_delete = set()
39 #This set will contain the phrases which still exist from the wiki.
40 #It is used to prevent duplicates on the wiki by removing them from
41 #the word_phrases_to_delete only at the end.
42 self.words_phrases_still_exist = set()
43 #This set will contain all existing place_classtype tables which doesn't match any
44 #special phrases class/type on the wiki.
45 self.table_phrases_to_delete = set()
47 def import_from_wiki(self, languages=None):
49 Iterate through all specified languages and
50 extract corresponding special phrases from the wiki.
52 if languages is not None and not isinstance(languages, list):
53 raise TypeError('The \'languages\' argument should be of type list.')
55 self._fetch_existing_words_phrases()
56 self._fetch_existing_place_classtype_tables()
58 #Get all languages to process.
59 languages = self._load_languages() if not languages else languages
61 #Store pairs of class/type for further processing
62 class_type_pairs = set()
64 for lang in languages:
65 LOG.warning('Import phrases for lang: %s', lang)
66 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
67 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
69 self._create_place_classtype_table_and_indexes(class_type_pairs)
70 self._remove_non_existent_phrases_from_db()
71 self.db_connection.commit()
72 LOG.warning('Import done.')
74 def _fetch_existing_words_phrases(self):
76 Fetch existing special phrases from the word table.
77 Fill the word_phrases_to_delete set of the class.
79 #Only extract special phrases terms:
80 #If class=place and type=house then it is a housenumber term.
81 #If class=place and type=postcode then it is a postcode term.
83 SELECT word, class, type, operator FROM word
84 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
86 with self.db_connection.cursor() as db_cursor:
87 db_cursor.execute(SQL(word_query))
89 row[3] = '-' if row[3] is None else row[3]
90 self.words_phrases_to_delete.add(
91 (row[0], row[1], row[2], row[3])
94 def _fetch_existing_place_classtype_tables(self):
96 Fetch existing place_classtype tables.
97 Fill the table_phrases_to_delete set of the class.
101 FROM information_schema.tables
102 WHERE table_schema='public'
103 AND table_name like 'place_classtype_%';
105 with self.db_connection.cursor() as db_cursor:
106 db_cursor.execute(SQL(query))
107 for row in db_cursor:
108 self.table_phrases_to_delete.add(row[0])
110 def _load_white_and_black_lists(self):
112 Load white and black lists from phrases-settings.json.
114 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
116 if self.config.PHRASE_CONFIG:
117 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
119 with open(settings_path, "r") as json_settings:
120 settings = json.load(json_settings)
121 return settings['blackList'], settings['whiteList']
123 def _load_languages(self):
125 Get list of all languages from env config file
126 or default if there is no languages configured.
127 The system will extract special phrases only from all specified languages.
129 default_languages = [
130 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
131 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
132 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
133 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
134 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
137 def _get_wiki_content(lang):
139 Request and return the wiki page's content
140 corresponding to special phrases for a given lang.
141 Requested URL Example :
142 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
144 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
147 def _check_sanity(self, lang, phrase_class, phrase_type):
149 Check sanity of given inputs in case somebody added garbage in the wiki.
150 If a bad class/type is detected the system will exit with an error.
152 type_matchs = self.sanity_check_pattern.findall(phrase_type)
153 class_matchs = self.sanity_check_pattern.findall(phrase_class)
155 if len(class_matchs) < 1 or len(type_matchs) < 1:
156 LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
157 lang, phrase_class, phrase_type)
161 def _process_xml_content(self, xml_content, lang):
163 Process given xml content by extracting matching patterns.
164 Matching patterns are processed there and returned in a
165 set of class/type pairs.
167 #One match will be of format [label, class, type, operator, plural]
168 matches = self.occurence_pattern.findall(xml_content)
169 #Store pairs of class/type for further processing
170 class_type_pairs = set()
172 for match in matches:
173 phrase_label = match[0].strip()
174 normalized_label = self.transliterator.transliterate(phrase_label)
175 phrase_class = match[1].strip()
176 phrase_type = match[2].strip()
177 phrase_operator = match[3].strip()
178 #Needed if some operator in the wiki are not written in english
179 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
180 #hack around a bug where building=yes was imported with quotes into the wiki
181 phrase_type = re.sub(r'\"|"', '', phrase_type)
183 #blacklisting: disallow certain class/type combinations
185 phrase_class in self.black_list.keys() and
186 phrase_type in self.black_list[phrase_class]
189 #whitelisting: if class is in whitelist, allow only tags in the list
191 phrase_class in self.white_list.keys() and
192 phrase_type not in self.white_list[phrase_class]
196 #Check if the phrase already exists in the database.
198 (normalized_label, phrase_class, phrase_type, phrase_operator)
199 in self.words_phrases_to_delete
201 #Remove this phrase from the ones to delete as it still exist on the wiki.
202 self.words_phrases_still_exist.add(
203 (normalized_label, phrase_class, phrase_type, phrase_operator)
205 class_type_pairs.add((phrase_class, phrase_type))
206 #Dont need to add this phrase as it already exists in the word table.
209 #sanity check, in case somebody added garbage in the wiki
210 if not self._check_sanity(lang, phrase_class, phrase_type):
213 class_type_pairs.add((phrase_class, phrase_type))
215 self._process_amenity(
216 phrase_label, normalized_label, phrase_class,
217 phrase_type, phrase_operator
220 return class_type_pairs
222 def _process_amenity(self, phrase_label, normalized_label,
223 phrase_class, phrase_type, phrase_operator):
224 # pylint: disable-msg=too-many-arguments
226 Add phrase lookup and corresponding class and
227 type to the word table based on the operator.
229 with self.db_connection.cursor() as db_cursor:
230 if phrase_operator == 'near':
231 db_cursor.execute("""SELECT getorcreate_amenityoperator(
232 make_standard_name(%s), %s, %s, %s, 'near')""",
233 (phrase_label, normalized_label, phrase_class, phrase_type))
234 elif phrase_operator == 'in':
235 db_cursor.execute("""SELECT getorcreate_amenityoperator(
236 make_standard_name(%s), %s, %s, %s, 'in')""",
237 (phrase_label, normalized_label, phrase_class, phrase_type))
239 db_cursor.execute("""SELECT getorcreate_amenity(
240 make_standard_name(%s), %s, %s, %s)""",
241 (phrase_label, normalized_label, phrase_class, phrase_type))
244 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
246 Create table place_classtype for each given pair.
247 Also create indexes on place_id and centroid.
249 LOG.warning('Create tables and indexes...')
251 sql_tablespace = self.config.TABLESPACE_AUX_DATA
253 sql_tablespace = ' TABLESPACE '+sql_tablespace
255 with self.db_connection.cursor() as db_cursor:
256 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
258 for pair in class_type_pairs:
259 phrase_class = pair[0]
260 phrase_type = pair[1]
262 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
264 if table_name in self.table_phrases_to_delete:
265 #Remove this table from the ones to delete as it match a class/type
266 #still existing on the special phrases of the wiki.
267 self.table_phrases_to_delete.remove(table_name)
268 #So dont need to create the table and indexes.
272 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
275 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
277 #Grant access on read to the web user.
278 self._grant_access_to_webuser(phrase_class, phrase_type)
280 with self.db_connection.cursor() as db_cursor:
281 db_cursor.execute("DROP INDEX idx_placex_classtype")
284 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
286 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
288 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
289 with self.db_connection.cursor() as db_cursor:
290 db_cursor.execute(SQL("""
291 CREATE TABLE IF NOT EXISTS {{}} {}
292 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
293 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
294 .format(Identifier(table_name), Literal(phrase_class),
295 Literal(phrase_type)))
298 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
300 Create indexes on centroid and place_id for the place_classtype table.
302 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
303 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
305 if not self.db_connection.index_exists(index_prefix + 'centroid'):
306 with self.db_connection.cursor() as db_cursor:
307 db_cursor.execute(SQL("""
308 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
309 .format(Identifier(index_prefix + 'centroid'),
310 Identifier(base_table)), sql_tablespace)
313 if not self.db_connection.index_exists(index_prefix + 'place_id'):
314 with self.db_connection.cursor() as db_cursor:
315 db_cursor.execute(SQL(
316 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
317 .format(Identifier(index_prefix + 'place_id'),
318 Identifier(base_table)))
321 def _grant_access_to_webuser(self, phrase_class, phrase_type):
323 Grant access on read to the table place_classtype for the webuser.
325 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
326 with self.db_connection.cursor() as db_cursor:
327 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
328 .format(Identifier(table_name),
329 Identifier(self.config.DATABASE_WEBUSER)))
331 def _remove_non_existent_phrases_from_db(self):
333 Remove special phrases which doesn't exist on the wiki anymore.
334 Delete from the word table and delete the place_classtype tables.
336 LOG.warning('Cleaning database...')
337 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
338 #Array containing all queries to execute. Contain tuples of format (query, parameters)
339 queries_parameters = []
341 #Delete phrases from the word table which are not on the wiki anymore.
342 for phrase_to_delete in self.words_phrases_to_delete:
343 if phrase_to_delete[3] == '-':
345 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
347 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
348 queries_parameters.append((query, parameters))
351 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
353 parameters = (phrase_to_delete[0], phrase_to_delete[1],
354 phrase_to_delete[2], phrase_to_delete[3], )
355 queries_parameters.append((query, parameters))
357 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
358 for table in self.table_phrases_to_delete:
359 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
360 queries_parameters.append((query, ()))
362 with self.db_connection.cursor() as db_cursor:
363 for query, parameters in queries_parameters:
364 db_cursor.execute(query, parameters)
366 def _convert_php_settings_if_needed(self, file_path):
368 Convert php settings file of special phrases to json file if it is still in php format.
370 if not isfile(file_path):
371 raise UsageError(str(file_path) + ' is not a valid file.')
373 file, extension = os.path.splitext(file_path)
374 json_file_path = Path(file + '.json').resolve()
376 if extension not in('.php', '.json'):
377 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
379 if extension == '.php' and not isfile(json_file_path):
381 subprocess.run(['/usr/bin/env', 'php', '-Cq',
382 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
383 file_path], check=True)
384 LOG.warning('special_phrase configuration file has been converted to json.')
385 return json_file_path
386 except subprocess.CalledProcessError:
387 LOG.error('Error while converting %s to json.', file_path)
390 return json_file_path