2 Functions to import special phrases into the database.
6 from pathlib import Path
10 from os.path import isfile
11 from icu import Transliterator
12 from psycopg2.sql import Identifier, Literal, SQL
13 from nominatim.tools.exec_utils import get_url
14 from nominatim.errors import UsageError
16 LOG = logging.getLogger()
17 class SpecialPhrasesImporter():
18 # pylint: disable-msg=too-many-instance-attributes
19 # pylint: disable-msg=too-few-public-methods
21 Class handling the process of special phrases importations.
23 def __init__(self, config, phplib_dir, db_connection) -> None:
24 self.db_connection = db_connection
26 self.phplib_dir = phplib_dir
27 self.black_list, self.white_list = self._load_white_and_black_lists()
28 #Compile the regex here to increase performances.
29 self.occurence_pattern = re.compile(
30 r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
32 self.sanity_check_pattern = re.compile(r'^\w+$')
33 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
34 self.config.TERM_NORMALIZATION)
35 #This set will contain all existing phrases from the word table which
36 #no longer exist on the wiki.
37 #It contain tuples with the following format: (normalized_word, class, type, operator)
38 self.words_phrases_to_delete = set()
39 #This set will contain the phrases which still exist from the wiki.
40 #It is used to prevent duplicates on the wiki by removing them from
41 #the word_phrases_to_delete only at the end.
42 self.words_phrases_still_exist = set()
43 #This set will contain all existing place_classtype tables which doesn't match any
44 #special phrases class/type on the wiki.
45 self.table_phrases_to_delete = set()
47 def import_from_wiki(self, languages=None):
49 Iterate through all specified languages and
50 extract corresponding special phrases from the wiki.
52 if languages is not None and not isinstance(languages, list):
53 raise TypeError('The \'languages\' argument should be of type list.')
55 self._fetch_existing_words_phrases()
56 self._fetch_existing_place_classtype_tables()
58 #Get all languages to process.
59 languages = self._load_languages() if not languages else languages
61 #Store pairs of class/type for further processing
62 class_type_pairs = set()
64 for lang in languages:
65 LOG.warning('Import phrases for lang: %s', lang)
66 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
67 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
69 self._create_place_classtype_table_and_indexes(class_type_pairs)
70 self._remove_non_existent_phrases_from_db()
71 self.db_connection.commit()
72 LOG.warning('Import done.')
74 def _fetch_existing_words_phrases(self):
76 Fetch existing special phrases from the word table.
77 Fill the word_phrases_to_delete set of the class.
79 #Only extract special phrases terms:
80 #If class=place and type=house then it is a housenumber term.
81 #If class=place and type=postcode then it is a postcode term.
83 SELECT word, class, type, operator FROM word
84 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
86 with self.db_connection.cursor() as db_cursor:
87 db_cursor.execute(SQL(word_query))
89 row[3] = '-' if row[3] is None else row[3]
90 self.words_phrases_to_delete.add(
91 (row[0], row[1], row[2], row[3])
94 def _fetch_existing_place_classtype_tables(self):
96 Fetch existing place_classtype tables.
97 Fill the table_phrases_to_delete set of the class.
101 FROM information_schema.tables
102 WHERE table_schema='public'
103 AND table_name like 'place_classtype_%';
105 with self.db_connection.cursor() as db_cursor:
106 db_cursor.execute(SQL(query))
107 for row in db_cursor:
108 self.table_phrases_to_delete.add(row[0])
110 def _load_white_and_black_lists(self):
112 Load white and black lists from phrases-settings.json.
114 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
116 if self.config.PHRASE_CONFIG:
117 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
119 with open(settings_path, "r") as json_settings:
120 settings = json.load(json_settings)
121 return settings['blackList'], settings['whiteList']
123 def _load_languages(self):
125 Get list of all languages from env config file
126 or default if there is no languages configured.
127 The system will extract special phrases only from all specified languages.
129 default_languages = [
130 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
131 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
132 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
133 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
134 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
137 def _get_wiki_content(lang):
139 Request and return the wiki page's content
140 corresponding to special phrases for a given lang.
141 Requested URL Example :
142 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
144 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
147 def _check_sanity(self, lang, phrase_class, phrase_type):
149 Check sanity of given inputs in case somebody added garbage in the wiki.
150 If a bad class/type is detected the system will exit with an error.
152 type_matchs = self.sanity_check_pattern.findall(phrase_type)
153 class_matchs = self.sanity_check_pattern.findall(phrase_class)
155 if len(class_matchs) < 1 or len(type_matchs) < 1:
156 raise UsageError("Bad class/type for language {}: {}={}".format(
157 lang, phrase_class, phrase_type))
159 def _process_xml_content(self, xml_content, lang):
161 Process given xml content by extracting matching patterns.
162 Matching patterns are processed there and returned in a
163 set of class/type pairs.
165 #One match will be of format [label, class, type, operator, plural]
166 matches = self.occurence_pattern.findall(xml_content)
167 #Store pairs of class/type for further processing
168 class_type_pairs = set()
170 for match in matches:
171 phrase_label = match[0].strip()
172 normalized_label = self.transliterator.transliterate(phrase_label)
173 phrase_class = match[1].strip()
174 phrase_type = match[2].strip()
175 phrase_operator = match[3].strip()
176 #Needed if some operator in the wiki are not written in english
177 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
178 #hack around a bug where building=yes was imported with quotes into the wiki
179 phrase_type = re.sub(r'\"|"', '', phrase_type)
181 #blacklisting: disallow certain class/type combinations
183 phrase_class in self.black_list.keys() and
184 phrase_type in self.black_list[phrase_class]
187 #whitelisting: if class is in whitelist, allow only tags in the list
189 phrase_class in self.white_list.keys() and
190 phrase_type not in self.white_list[phrase_class]
194 #Check if the phrase already exists in the database.
196 (normalized_label, phrase_class, phrase_type, phrase_operator)
197 in self.words_phrases_to_delete
199 #Remove this phrase from the ones to delete as it still exist on the wiki.
200 self.words_phrases_still_exist.add(
201 (normalized_label, phrase_class, phrase_type, phrase_operator)
203 class_type_pairs.add((phrase_class, phrase_type))
204 #Dont need to add this phrase as it already exists in the word table.
207 #sanity check, in case somebody added garbage in the wiki
208 self._check_sanity(lang, phrase_class, phrase_type)
210 class_type_pairs.add((phrase_class, phrase_type))
212 self._process_amenity(
213 phrase_label, normalized_label, phrase_class,
214 phrase_type, phrase_operator
217 return class_type_pairs
219 def _process_amenity(self, phrase_label, normalized_label,
220 phrase_class, phrase_type, phrase_operator):
221 # pylint: disable-msg=too-many-arguments
223 Add phrase lookup and corresponding class and
224 type to the word table based on the operator.
226 with self.db_connection.cursor() as db_cursor:
227 if phrase_operator == 'near':
228 db_cursor.execute("""SELECT getorcreate_amenityoperator(
229 make_standard_name(%s), %s, %s, %s, 'near')""",
230 (phrase_label, normalized_label, phrase_class, phrase_type))
231 elif phrase_operator == 'in':
232 db_cursor.execute("""SELECT getorcreate_amenityoperator(
233 make_standard_name(%s), %s, %s, %s, 'in')""",
234 (phrase_label, normalized_label, phrase_class, phrase_type))
236 db_cursor.execute("""SELECT getorcreate_amenity(
237 make_standard_name(%s), %s, %s, %s)""",
238 (phrase_label, normalized_label, phrase_class, phrase_type))
241 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
243 Create table place_classtype for each given pair.
244 Also create indexes on place_id and centroid.
246 LOG.warning('Create tables and indexes...')
248 sql_tablespace = self.config.TABLESPACE_AUX_DATA
250 sql_tablespace = ' TABLESPACE '+sql_tablespace
252 with self.db_connection.cursor() as db_cursor:
253 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
255 for pair in class_type_pairs:
256 phrase_class = pair[0]
257 phrase_type = pair[1]
259 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
261 if table_name in self.table_phrases_to_delete:
262 #Remove this table from the ones to delete as it match a class/type
263 #still existing on the special phrases of the wiki.
264 self.table_phrases_to_delete.remove(table_name)
265 #So dont need to create the table and indexes.
269 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
272 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
274 #Grant access on read to the web user.
275 self._grant_access_to_webuser(phrase_class, phrase_type)
277 with self.db_connection.cursor() as db_cursor:
278 db_cursor.execute("DROP INDEX idx_placex_classtype")
281 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
283 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
285 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
286 with self.db_connection.cursor() as db_cursor:
287 db_cursor.execute(SQL("""
288 CREATE TABLE IF NOT EXISTS {{}} {}
289 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
290 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
291 .format(Identifier(table_name), Literal(phrase_class),
292 Literal(phrase_type)))
295 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
297 Create indexes on centroid and place_id for the place_classtype table.
299 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
300 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
302 if not self.db_connection.index_exists(index_prefix + 'centroid'):
303 with self.db_connection.cursor() as db_cursor:
304 db_cursor.execute(SQL("""
305 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
306 .format(Identifier(index_prefix + 'centroid'),
307 Identifier(base_table)), sql_tablespace)
310 if not self.db_connection.index_exists(index_prefix + 'place_id'):
311 with self.db_connection.cursor() as db_cursor:
312 db_cursor.execute(SQL(
313 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
314 .format(Identifier(index_prefix + 'place_id'),
315 Identifier(base_table)))
318 def _grant_access_to_webuser(self, phrase_class, phrase_type):
320 Grant access on read to the table place_classtype for the webuser.
322 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
323 with self.db_connection.cursor() as db_cursor:
324 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
325 .format(Identifier(table_name),
326 Identifier(self.config.DATABASE_WEBUSER)))
328 def _remove_non_existent_phrases_from_db(self):
330 Remove special phrases which doesn't exist on the wiki anymore.
331 Delete from the word table and delete the place_classtype tables.
333 LOG.warning('Cleaning database...')
334 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
335 #Array containing all queries to execute. Contain tuples of format (query, parameters)
336 queries_parameters = []
338 #Delete phrases from the word table which are not on the wiki anymore.
339 for phrase_to_delete in self.words_phrases_to_delete:
340 if phrase_to_delete[3] == '-':
342 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
344 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
345 queries_parameters.append((query, parameters))
348 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
350 parameters = (phrase_to_delete[0], phrase_to_delete[1],
351 phrase_to_delete[2], phrase_to_delete[3], )
352 queries_parameters.append((query, parameters))
354 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
355 for table in self.table_phrases_to_delete:
356 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
357 queries_parameters.append((query, ()))
359 with self.db_connection.cursor() as db_cursor:
360 for query, parameters in queries_parameters:
361 db_cursor.execute(query, parameters)
363 def _convert_php_settings_if_needed(self, file_path):
365 Convert php settings file of special phrases to json file if it is still in php format.
367 if not isfile(file_path):
368 raise UsageError(str(file_path) + ' is not a valid file.')
370 file, extension = os.path.splitext(file_path)
371 json_file_path = Path(file + '.json').resolve()
373 if extension not in('.php', '.json'):
374 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
376 if extension == '.php' and not isfile(json_file_path):
378 subprocess.run(['/usr/bin/env', 'php', '-Cq',
379 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
380 file_path], check=True)
381 LOG.warning('special_phrase configuration file has been converted to json.')
382 return json_file_path
383 except subprocess.CalledProcessError:
384 LOG.error('Error while converting %s to json.', file_path)
387 return json_file_path