2 Functions to import special phrases into the database.
6 from os.path import isfile
7 from pathlib import Path
12 from psycopg2.sql import Identifier, Literal, SQL
14 from nominatim.tools.exec_utils import get_url
15 from nominatim.errors import UsageError
16 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
# Module-wide logger; getLogger() without a name returns the root logger.
LOG = logging.getLogger()


class SpecialPhrasesImporter():
    # pylint: disable-msg=too-many-instance-attributes
    """
        Class handling the process of special phrases importations.

        Phrases are read from the OSM wiki, one page per language. For each
        accepted class/type pair a `place_classtype_<class>_<type>` table is
        kept in the database; tables whose pair was not seen during the
        import are dropped at the end.
    """

    def __init__(self, config, phplib_dir, db_connection) -> None:
        """
            Set up the importer.

            `config` is the object the settings are read from
            (config_dir, PHRASE_CONFIG, LANGUAGES, TABLESPACE_AUX_DATA,
            DATABASE_WEBUSER), `phplib_dir` is the directory holding the
            PHP migration script and `db_connection` is the database
            connection used for all queries.
        """
        self.statistics_handler = SpecialPhrasesImporterStatistics()
        self.db_connection = db_connection
        # The configuration must be stored before the lists are loaded
        # because the loader reads it.
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        # Compile the regular expressions once, they are used for every phrase.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        # All phrases to hand over to the tokenizer, as tuples of
        # (label, class, type, operator).
        self.word_phrases = set()
        # Names of the existing place_classtype tables which do not match
        # any class/type pair found on the wiki.
        self.table_phrases_to_delete = set()

    def import_from_wiki(self, tokenizer, languages=None):
        """
            Iterate through all specified languages and
            extract corresponding special phrases from the wiki.

            `languages` must be a list or None; when it is empty or None
            the configured (or default) languages are used.
            Raises TypeError when `languages` is neither None nor a list.
        """
        if languages is not None and not isinstance(languages, list):
            raise TypeError('The \'languages\' argument should be of type list.')

        self._fetch_existing_place_classtype_tables()

        if not languages:
            languages = self._load_languages()

        # Class/type pairs collected over all languages.
        class_type_pairs = set()

        for lang in languages:
            LOG.warning('Importing phrases for lang: %s...', lang)
            # Called through the class on purpose: the helper is a static
            # method and callers may replace it on the class.
            wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
            class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
            self.statistics_handler.notify_current_lang_done(lang)

        self._create_place_classtype_table_and_indexes(class_type_pairs)
        self._remove_non_existent_tables_from_db()
        self.db_connection.commit()

        with tokenizer.name_analyzer() as analyzer:
            analyzer.update_special_phrases(self.word_phrases)

        LOG.warning('Import done.')
        self.statistics_handler.notify_import_done()

    def _fetch_existing_place_classtype_tables(self):
        """
            Fetch existing place_classtype tables.
            Fill the table_phrases_to_delete set of the class.
        """
        query = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema='public'
            AND table_name like 'place_classtype_%';
        """
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(query))
            for row in db_cursor:
                self.table_phrases_to_delete.add(row[0])

    def _load_white_and_black_lists(self):
        """
            Load white and black lists from phrase-settings.json.

            The file in the configuration directory is used unless
            PHRASE_CONFIG names a custom file.
            Returns the tuple (black list, white list).
        """
        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

        if self.config.PHRASE_CONFIG:
            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)

        # JSON is UTF-8 by specification, do not depend on the locale.
        with settings_path.open("r", encoding='utf-8') as json_settings:
            settings = json.load(json_settings)
        return settings['blackList'], settings['whiteList']

    def _load_languages(self):
        """
            Get list of all languages from env config file
            or default if there is no languages configured.
            The system will extract special phrases only from all specified languages.

            Whitespace around the configured codes is removed and empty
            entries are ignored.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

        configured = self.config.LANGUAGES
        if not configured:
            return default_languages

        # 'en, de' must not produce the wiki page name ' DE'.
        languages = [lang.strip() for lang in configured.split(',') if lang.strip()]
        return languages or default_languages

    @staticmethod
    def _get_wiki_content(lang):
        """
            Request and return the wiki page's content
            corresponding to special phrases for a given lang.
            Requested URL Example :
                https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        """
        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
        return get_url(url)

    def _check_sanity(self, lang, phrase_class, phrase_type):
        """
            Check sanity of given inputs in case somebody added garbage in the wiki.
            If a bad class/type is detected a warning is logged and
            False is returned, otherwise True is returned.
        """
        class_ok = self.sanity_check_pattern.match(phrase_class)
        type_ok = self.sanity_check_pattern.match(phrase_type)

        if class_ok and type_ok:
            return True

        LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
                    lang, phrase_class, phrase_type)
        return False

    def _process_xml_content(self, xml_content, lang):
        """
            Process given xml content by extracting matching patterns.
            Matching patterns are processed there and returned in a
            set of class/type pairs.

            Accepted phrases are also added to `word_phrases`.
        """
        # Store pairs of class/type for further processing.
        class_type_pairs = set()

        # One match has the format (label, class, type, operator, plural).
        for match in self.occurence_pattern.findall(xml_content):
            phrase_label, phrase_class, phrase_type, phrase_operator = (
                field.strip() for field in match[:4])

            # Needed if some operators in the wiki are not written in English.
            if phrase_operator not in ('near', 'in'):
                phrase_operator = '-'

            # Hack around a bug where building=yes was imported with quotes
            # into the wiki. Besides the literal character, the named and
            # numeric XML entities for a double quote are removed as well.
            # NOTE(review): assumes the export delivers quotes as entities.
            phrase_type = re.sub(r'"|&(?:quot|#34);', '', phrase_type)

            # Blacklisting: disallow certain class/type combinations.
            if phrase_type in self.black_list.get(phrase_class, ()):
                continue

            # Whitelisting: if class is in whitelist, allow only tags in the list.
            if (
                    phrase_class in self.white_list and
                    phrase_type not in self.white_list[phrase_class]
            ):
                continue

            # Sanity check, in case somebody added garbage in the wiki.
            if not self._check_sanity(lang, phrase_class, phrase_type):
                self.statistics_handler.notify_one_phrase_invalid()
                continue

            class_type_pairs.add((phrase_class, phrase_type))
            self.word_phrases.add((phrase_label, phrase_class,
                                   phrase_type, phrase_operator))

        return class_type_pairs

    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
        """
            Create table place_classtype for each given pair.
            Also create indexes on place_id and centroid.

            Pairs whose table already exists are only removed from
            `table_phrases_to_delete`.
        """
        LOG.warning('Create tables and indexes...')

        tablespace = self.config.TABLESPACE_AUX_DATA
        # Always hand a string to the helpers, even when nothing is configured.
        sql_tablespace = ' TABLESPACE ' + tablespace if tablespace else ''

        # Temporary index for the table creations below, dropped at the end.
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for phrase_class, phrase_type in class_type_pairs:
            table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

            if table_name in self.table_phrases_to_delete:
                self.statistics_handler.notify_one_table_ignored()
                # The table matches a class/type still present on the wiki:
                # keep it and skip the creation of table and indexes.
                self.table_phrases_to_delete.remove(table_name)
                continue

            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            # Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

            self.statistics_handler.notify_one_table_created()

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create table place_classtype of the given phrase_class/phrase_type
            if it doesn't exist.

            `sql_tablespace` is either empty or a ready-made
            ' TABLESPACE <name>' clause.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        query = SQL("""
                CREATE TABLE IF NOT EXISTS {} {}
                AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                WHERE class = {} AND type = {}""")
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(query.format(Identifier(table_name),
                                           SQL(sql_tablespace or ''),
                                           Literal(phrase_class),
                                           Literal(phrase_type)))

    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create indexes on centroid and place_id for the place_classtype table.

            An index is only created when `index_exists` reports it missing.
        """
        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        tablespace = SQL(sql_tablespace or '')

        index_definitions = (
            ('centroid', SQL('CREATE INDEX {} ON {} USING GIST (centroid) {}')),
            ('place_id', SQL('CREATE INDEX {} ON {} USING btree(place_id) {}')),
        )

        for column, query in index_definitions:
            index_name = index_prefix + column
            if self.db_connection.index_exists(index_name):
                continue
            with self.db_connection.cursor() as db_cursor:
                # The statement has no placeholders, so no query parameters
                # are passed to execute().
                db_cursor.execute(query.format(Identifier(index_name),
                                               Identifier(base_table),
                                               tablespace))

    def _grant_access_to_webuser(self, phrase_class, phrase_type):
        """
            Grant access on read to the table place_classtype for the webuser.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _remove_non_existent_tables_from_db(self):
        """
            Remove special phrases which don't exist on the wiki anymore.
            Delete the place_classtype tables.
        """
        LOG.warning('Cleaning database...')

        # Every table left in the set matched no class/type of the wiki.
        with self.db_connection.cursor() as db_cursor:
            for table in self.table_phrases_to_delete:
                db_cursor.execute(SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)))
                self.statistics_handler.notify_one_table_deleted()

    def _convert_php_settings_if_needed(self, file_path):
        """
            Convert php settings file of special phrases to json file if it is still in php format.

            Returns the resolved path of the json file, which sits next
            to the given file. The conversion is skipped when the json
            file already exists.
            Raises UsageError when the file does not exist or has an
            extension other than .php or .json; a failure of the
            conversion script is logged and re-raised.
        """
        if not isfile(file_path):
            raise UsageError(str(file_path) + ' is not a valid file.')

        base_path, extension = os.path.splitext(file_path)
        json_file_path = Path(base_path + '.json').resolve()

        if extension not in ('.php', '.json'):
            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

        if extension == '.php' and not isfile(json_file_path):
            try:
                subprocess.run(['/usr/bin/env', 'php', '-Cq',
                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
                                file_path], check=True)
            except subprocess.CalledProcessError:
                LOG.error('Error while converting %s to json.', file_path)
                raise
            LOG.warning('special_phrase configuration file has been converted to json.')

        return json_file_path