"""Functions to import special phrases into the database."""
import json
import logging
import os
import re
import subprocess
from os.path import isfile
from pathlib import Path

from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL

from nominatim.errors import UsageError
from nominatim.tools.exec_utils import get_url
# Module-wide logger. Calling getLogger() without a name returns the root
# logger, so messages follow whatever logging setup the application installed.
LOG = logging.getLogger()
class SpecialPhrasesImporter():
    # pylint: disable-msg=too-many-instance-attributes
    # pylint: disable-msg=too-few-public-methods
    """
    Import special phrases from the OSM wiki into the database.

    For every processed language the matching wiki page is downloaded,
    its phrase rows are extracted with a regular expression, each phrase
    is registered through the ``getorcreate_amenity*`` SQL functions and a
    ``place_classtype_<class>_<type>`` table (plus indexes) is created for
    every class/type pair that was seen.
    """

    def __init__(self, config, phplib_dir, db_connection) -> None:
        """
        Store the collaborators and precompute everything reused per phrase.

        config        -- configuration object; the attributes read by this
                         class are config_dir, PHRASE_CONFIG, LANGUAGES,
                         TERM_NORMALIZATION, TABLESPACE_AUX_DATA and
                         DATABASE_WEBUSER.
        phplib_dir    -- path object of the directory holding the PHP
                         migration script.
        db_connection -- open database connection; it must offer cursor(),
                         commit() and index_exists().
        """
        self.db_connection = db_connection
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        # Compile the regexes once: they are applied to every wiki row.
        # Groups of a row: label, class, type, operator, plural flag.
        self.occurence_pattern = re.compile(
            r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                             self.config.TERM_NORMALIZATION)

    def import_from_wiki(self, languages=None):
        """
        Iterate through all specified languages and
        extract corresponding special phrases from the wiki.

        languages -- optional list of language codes. When it is None or
                     empty, the configured (or default) languages are used.

        Raises TypeError when 'languages' is given but is not a list.
        The database transaction is committed once all languages are done.
        """
        if languages is not None and not isinstance(languages, list):
            raise TypeError('The \'languages\' argument should be of type list.')

        # Get all languages to process.
        languages = self._load_languages() if not languages else languages

        # Store pairs of class/type for further processing.
        class_type_pairs = set()

        for lang in languages:
            LOG.warning('Import phrases for lang: %s', lang)
            wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
            class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))

        self._create_place_classtype_table_and_indexes(class_type_pairs)
        self.db_connection.commit()
        LOG.warning('Import done.')

    def _load_white_and_black_lists(self):
        """
        Load white and black lists from phrase-settings.json.

        The file from the configuration directory is used unless
        PHRASE_CONFIG points to a custom settings file.
        Returns the tuple (black list, white list).
        """
        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

        if self.config.PHRASE_CONFIG:
            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)

        # Read explicitly as UTF-8 so the result does not depend on the
        # locale of the machine running the import.
        with open(settings_path, "r", encoding="utf-8") as json_settings:
            settings = json.load(json_settings)

        return settings['blackList'], settings['whiteList']

    def _load_languages(self):
        """
        Get list of all languages from env config file
        or default if there is no languages configured.
        The system will extract special phrases only from all specified languages.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
        return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages

    @staticmethod
    def _get_wiki_content(lang):
        """
        Request and return the wiki page's content
        corresponding to special phrases for a given lang.
        Requested URL Example :
            https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        """
        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
        return get_url(url)

    def _check_sanity(self, lang, phrase_class, phrase_type):
        """
        Check sanity of given inputs in case somebody added garbage in the wiki.
        If a bad class/type is detected the system will exit with an error.

        Raises UsageError when class or type is not made of word
        characters only.
        """
        # fullmatch() (rather than findall()) also rejects a trailing
        # newline, which '$' alone would let through. Class and type end
        # up in table and index names, so be strict.
        class_ok = self.sanity_check_pattern.fullmatch(phrase_class) is not None
        type_ok = self.sanity_check_pattern.fullmatch(phrase_type) is not None

        if not (class_ok and type_ok):
            raise UsageError("Bad class/type for language {}: {}={}".format(
                lang, phrase_class, phrase_type))

    def _process_xml_content(self, xml_content, lang):
        """
        Process given xml content by extracting matching patterns.
        Matching patterns are processed there and returned in a
        set of class/type pairs.
        """
        # Store pairs of class/type for further processing.
        class_type_pairs = set()

        # One match is of format (label, class, type, operator, plural).
        for match in self.occurence_pattern.findall(xml_content):
            phrase_label = match[0].strip()
            normalized_label = self.transliterator.transliterate(phrase_label)
            phrase_class = match[1].strip()
            phrase_type = match[2].strip()
            phrase_operator = match[3].strip()
            # Hack around a bug where building=yes was imported with quotes into the wiki.
            phrase_type = re.sub(r'\"|"', '', phrase_type)

            # Sanity check, in case somebody added garbage in the wiki.
            self._check_sanity(lang, phrase_class, phrase_type)

            # Blacklisting: disallow certain class/type combinations.
            if (
                    phrase_class in self.black_list and
                    phrase_type in self.black_list[phrase_class]
            ):
                continue

            # Whitelisting: if class is in whitelist, allow only tags in the list.
            if (
                    phrase_class in self.white_list and
                    phrase_type not in self.white_list[phrase_class]
            ):
                continue

            # Remember the pair so that its table gets created later on.
            class_type_pairs.add((phrase_class, phrase_type))

            self._process_amenity(
                phrase_label, normalized_label, phrase_class,
                phrase_type, phrase_operator
            )

        return class_type_pairs

    def _process_amenity(self, phrase_label, normalized_label,
                         phrase_class, phrase_type, phrase_operator):
        # pylint: disable-msg=too-many-arguments
        """
        Add phrase lookup and corresponding class and
        type to the word table based on the operator.

        The operators 'near' and 'in' are handed to
        getorcreate_amenityoperator(); any other operator value registers
        the phrase through getorcreate_amenity().
        """
        phrase_params = (phrase_label, normalized_label, phrase_class, phrase_type)

        with self.db_connection.cursor() as db_cursor:
            if phrase_operator in ('near', 'in'):
                # The operator is bound as a parameter; the database
                # receives the same literal the two former branches
                # hard-coded.
                db_cursor.execute("""SELECT getorcreate_amenityoperator(
                                  make_standard_name(%s), %s, %s, %s, %s)""",
                                  phrase_params + (phrase_operator,))
            else:
                db_cursor.execute("""SELECT getorcreate_amenity(
                                  make_standard_name(%s), %s, %s, %s)""",
                                  phrase_params)

    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
        """
        Create table place_classtype for each given pair.
        Also create indexes on place_id and centroid.

        A temporary index on placex (class, type) is created before the
        loop and dropped again afterwards.
        """
        LOG.warning('Create tables and indexes...')

        sql_tablespace = self.config.TABLESPACE_AUX_DATA
        if sql_tablespace:
            sql_tablespace = ' TABLESPACE ' + sql_tablespace
        else:
            # Normalise an unset value so the helpers always get a string.
            sql_tablespace = ''

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for phrase_class, phrase_type in class_type_pairs:
            # Table creation.
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            # Indexes creation.
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            # Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
        """
        Create table place_classtype of the given phrase_class/phrase_type
        if it doesn't exist.

        sql_tablespace -- either an empty string or a ready-made
                          ' TABLESPACE <name>' clause, inserted verbatim.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            # The tablespace clause is composed as an SQL fragment instead
            # of going through str.format(), so no brace escaping is needed.
            db_cursor.execute(SQL("""
                    CREATE TABLE IF NOT EXISTS {} {}
                    AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                    WHERE class = {} AND type = {}""")
                              .format(Identifier(table_name), SQL(sql_tablespace),
                                      Literal(phrase_class), Literal(phrase_type)))

    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
        """
        Create indexes on centroid and place_id for the place_classtype table.

        An index is only created when the connection reports that it does
        not exist yet.
        """
        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

        # Index on centroid.
        if not self.db_connection.index_exists(index_prefix + 'centroid'):
            with self.db_connection.cursor() as db_cursor:
                # Only the composed statement is passed to execute(): the
                # query has no placeholders, so there is nothing to bind.
                db_cursor.execute(SQL("""
                    CREATE INDEX {} ON {} USING GIST (centroid) {}""")
                                  .format(Identifier(index_prefix + 'centroid'),
                                          Identifier(base_table), SQL(sql_tablespace)))

        # Index on place_id.
        if not self.db_connection.index_exists(index_prefix + 'place_id'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL(
                    """CREATE INDEX {} ON {} USING btree(place_id) {}""")
                                  .format(Identifier(index_prefix + 'place_id'),
                                          Identifier(base_table), SQL(sql_tablespace)))

    def _grant_access_to_webuser(self, phrase_class, phrase_type):
        """
        Grant access on read to the table place_classtype for the webuser.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _convert_php_settings_if_needed(self, file_path):
        """
        Convert php settings file of special phrases to json file if it is
        still in php format.

        Returns the resolved path of the json settings file.
        Raises UsageError when file_path is not an existing file or has an
        extension other than .php/.json. A failing conversion is logged
        and the subprocess.CalledProcessError is propagated.
        """
        if not isfile(file_path):
            raise UsageError(str(file_path) + ' is not a valid file.')

        base_path, extension = os.path.splitext(file_path)
        json_file_path = Path(base_path + '.json').resolve()

        if extension not in ('.php', '.json'):
            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

        # Convert only when no json twin exists yet next to the php file.
        if extension == '.php' and not isfile(json_file_path):
            try:
                subprocess.run(['/usr/bin/env', 'php', '-Cq',
                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
                                file_path], check=True)
            except subprocess.CalledProcessError:
                LOG.error('Error while converting %s to json.', file_path)
                raise
            LOG.warning('special_phrase configuration file has been converted to json.')

        return json_file_path