2 Functions to import special phrases into the database.
9 from os.path import isfile
10 from icu import Transliterator
11 from psycopg2.sql import Identifier, Literal, SQL
12 from nominatim.tools.exec_utils import get_url
# Logger shared by everything in this module; getLogger() called without a
# name returns the root logger.
LOG = logging.getLogger()
class SpecialPhrasesImporter():
    # pylint: disable-msg=too-many-instance-attributes
    # pylint: disable-msg=too-few-public-methods
    """
        Class handling the process of special phrases importations.

        Phrases are read from the OSM wiki, filtered through the configured
        black and white lists, added to the word table and backed by one
        place_classtype_<class>_<type> table per imported class/type pair.
    """
    def __init__(self, config, phplib_dir, db_connection) -> None:
        self.db_connection = db_connection
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        #Compile the regex here to increase performances.
        #Groups are, in order: label, class, type, operator, plural.
        self.occurence_pattern = re.compile(
            r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                             self.config.TERM_NORMALIZATION)

    def import_from_wiki(self, languages=None):
        """
            Iterate through all specified languages and
            extract corresponding special phrases from the wiki.

            `languages` must be a list or None. When it is None or empty,
            the languages returned by _load_languages() are used.
            Raises TypeError for any other kind of argument.
            The database transaction is committed once everything is done.
        """
        if languages is not None and not isinstance(languages, list):
            raise TypeError('languages argument should be of type list')

        #Get all languages to process.
        languages = self._load_languages() if not languages else languages

        #Class/type pairs found over all languages.
        class_type_pairs = set()

        for lang in languages:
            LOG.warning('Import phrases for lang: %s', lang)
            wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
            #The pairs of every page must be kept, otherwise no table,
            #index or grant is ever created below.
            class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))

        self._create_place_classtype_table_and_indexes(class_type_pairs)
        self.db_connection.commit()
        LOG.warning('Import done.')

    def _load_white_and_black_lists(self):
        """
            Load white and black lists from phrase-settings.json.

            The file is looked up in the configuration directory unless
            PHRASE_CONFIG names another one, which is converted from php
            to json first when necessary.
            Returns the tuple (black list, white list).
        """
        settings_path = os.path.join(str(self.config.config_dir), 'phrase-settings.json')

        if self.config.PHRASE_CONFIG:
            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)

        with open(settings_path, "r", encoding='utf-8') as json_settings:
            settings = json.load(json_settings)
        return settings['blackList'], settings['whiteList']

    def _load_languages(self):
        """
            Get list of all languages from env config file
            or default if there is no languages configured.
            The system will extract special phrases only from all specified languages.

            A string value is treated as a comma-separated list,
            any other non-empty value is returned unchanged.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

        configured = self.config.LANGUAGES
        if not configured:
            return default_languages

        if isinstance(configured, str):
            #Iterating over the raw string would yield single characters,
            #so split it into language codes first.
            codes = [code.strip() for code in configured.split(',') if code.strip()]
            return codes or default_languages

        return configured

    @staticmethod
    def _get_wiki_content(lang):
        """
            Request and return the wiki page's content
            corresponding to special phrases for a given lang.
            Requested URL Example :
                https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        """
        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
        return get_url(url)

    def _check_sanity(self, lang, phrase_class, phrase_type):
        """
            Check sanity of given inputs in case somebody added garbage in the wiki.

            Class and type must both consist of word characters only.
            A bad class/type is logged as an error.
            Returns True when the pair is sane and False otherwise, so that
            the caller can leave the entry out of the import.
        """
        if (self.sanity_check_pattern.search(phrase_class) is None
                or self.sanity_check_pattern.search(phrase_type) is None):
            LOG.error("Bad class/type for language %s: %s=%s", lang, phrase_class, phrase_type)
            return False

        return True

    def _process_xml_content(self, xml_content, lang):
        """
            Extract all phrases from the given wiki page content and
            add the accepted ones to the word table.

            Returns the set of (class, type) pairs that were accepted.
        """
        class_type_pairs = set()

        #One match will be of format [label, class, type, operator, plural]
        for match in self.occurence_pattern.findall(xml_content):
            phrase_label = match[0].strip()
            phrase_class = match[1].strip()
            phrase_type = match[2].strip()
            phrase_operator = match[3].strip()
            #hack around a bug where building=yes was imported with quotes into the wiki
            #The page comes as XML, so the quote may presumably also show up
            #as the &quot; entity; both forms are dropped.
            phrase_type = re.sub(r'"|&quot;', '', phrase_type)

            #sanity check, in case somebody added garbage in the wiki
            if not self._check_sanity(lang, phrase_class, phrase_type):
                continue

            #blacklisting: disallow certain class/type combinations
            if (
                    phrase_class in self.black_list and
                    phrase_type in self.black_list[phrase_class]
            ):
                continue

            #whitelisting: if class is in whitelist, allow only tags in the list
            if (
                    phrase_class in self.white_list and
                    phrase_type not in self.white_list[phrase_class]
            ):
                continue

            #add class/type to the set of pairs
            class_type_pairs.add((phrase_class, phrase_type))

            normalized_label = self.transliterator.transliterate(phrase_label)
            self._process_amenity(
                phrase_label, normalized_label, phrase_class,
                phrase_type, phrase_operator
            )

        return class_type_pairs

    def _process_amenity(self, phrase_label, normalized_label,
                         phrase_class, phrase_type, phrase_operator):
        # pylint: disable-msg=too-many-arguments
        """
            Add phrase lookup and corresponding class and
            type to the word table based on the operator.

            The operators 'near' and 'in' are stored together with the
            phrase, any other operator value is ignored.
        """
        params = (phrase_label, normalized_label, phrase_class, phrase_type)

        with self.db_connection.cursor() as db_cursor:
            if phrase_operator in ('near', 'in'):
                db_cursor.execute("""SELECT getorcreate_amenityoperator(
                                  make_standard_name(%s), %s, %s, %s, %s)""",
                                  params + (phrase_operator,))
            else:
                db_cursor.execute("""SELECT getorcreate_amenity(
                                  make_standard_name(%s), %s, %s, %s)""",
                                  params)

    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
        """
            Create table place_classtype for each given pair.
            Also create indexes on place_id and centroid.
        """
        LOG.warning('Create tables and indexes...')

        #An unset tablespace must become an empty string because the value
        #is formatted directly into the SQL statements.
        tablespace_name = self.config.TABLESPACE_AUX_DATA
        sql_tablespace = ' TABLESPACE ' + tablespace_name if tablespace_name else ''

        #Index on placex used while the tables are being filled.
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for phrase_class, phrase_type in class_type_pairs:
            #Table creation
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            #Indexes creation
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            #Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

        #The index is dropped again once all tables exist.
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create table place_classtype of the given phrase_class/phrase_type
            if it doesn't exist.

            sql_tablespace is either empty or a complete ' TABLESPACE <name>'
            clause. It is inserted verbatim into the statement.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        #The tablespace clause is filled in by str.format first; the doubled
        #braces survive that step as placeholders for psycopg2's SQL.format().
        statement = """
                    CREATE TABLE IF NOT EXISTS {{}} {}
                    AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                    WHERE class = {{}} AND type = {{}}""".format(sql_tablespace or '')

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(statement)
                              .format(Identifier(table_name), Literal(phrase_class),
                                      Literal(phrase_type)))

    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create indexes on centroid and place_id for the place_classtype table.
            An index that already exists is left alone.
        """
        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        sql_tablespace = sql_tablespace or ''

        #Index on centroid
        if not self.db_connection.index_exists(index_prefix + 'centroid'):
            with self.db_connection.cursor() as db_cursor:
                #The tablespace clause is already part of the statement text and
                #must not be handed to execute() as query parameters as well.
                db_cursor.execute(SQL("""
                    CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
                                  .format(Identifier(index_prefix + 'centroid'),
                                          Identifier(base_table)))

        #Index on place_id
        if not self.db_connection.index_exists(index_prefix + 'place_id'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL(
                    """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
                                  .format(Identifier(index_prefix + 'place_id'),
                                          Identifier(base_table)))

    def _grant_access_to_webuser(self, phrase_class, phrase_type):
        """
            Grant access on read to the table place_classtype for the webuser.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _convert_php_settings_if_needed(self, file_path):
        """
            Convert php settings file of special phrases to json file if it is still in php format.

            Returns the path of the json file. The conversion only runs for
            a '.php' file that has no json file next to it yet.
            A failing conversion is logged and the error is raised again.
        """
        base_path, extension = os.path.splitext(file_path)
        json_file_path = base_path + '.json'

        if extension != '.php' or isfile(json_file_path):
            return json_file_path

        try:
            subprocess.run(['/usr/bin/env', 'php', '-Cq',
                            self.phplib_dir / 'migration/phraseSettingsToJson.php',
                            file_path], check=True)
        except subprocess.CalledProcessError:
            LOG.error('Error while converting %s to json.', file_path)
            raise

        LOG.warning('special_phrase configuration file has been converted to json.')
        return json_file_path