X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/17cb59efbd5ebec4ee44cf56999e443a7993b050..c495ce097916063327460cfa2cc6e3fcccac4df2:/nominatim/tools/special_phrases.py diff --git a/nominatim/tools/special_phrases.py b/nominatim/tools/special_phrases.py index a70d3047..fd46a18d 100644 --- a/nominatim/tools/special_phrases.py +++ b/nominatim/tools/special_phrases.py @@ -2,200 +2,277 @@ Functions to import special phrases into the database. """ import logging +import os +from pathlib import Path import re -import sys +import subprocess +import json +from os.path import isfile +from icu import Transliterator from psycopg2.sql import Identifier, Literal, SQL -from settings.phrase_settings import BLACK_LIST, WHITE_LIST from nominatim.tools.exec_utils import get_url +from nominatim.errors import UsageError LOG = logging.getLogger() - -def import_from_wiki(config, db_connection, languages=None): +class SpecialPhrasesImporter(): + # pylint: disable-msg=too-many-instance-attributes + # pylint: disable-msg=too-few-public-methods """ - Iterate through all specified languages and - extract corresponding special phrases from the wiki. + Class handling the process of special phrases importations. """ - #Compile the match regex to increase performance for the following loop. - occurence_pattern = re.compile( - r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])' - ) - sanity_check_pattern = re.compile(r'^\w+$') - - languages = _get_languages(config) if not languages else languages - - #array for pairs of class/type - pairs = dict() - for lang in languages: - LOG.warning('Import phrases for lang: %s', lang) - wiki_page_xml_content = _get_wiki_content(lang) + def __init__(self, config, phplib_dir, db_connection) -> None: + self.db_connection = db_connection + self.config = config + self.phplib_dir = phplib_dir + self.black_list, self.white_list = self._load_white_and_black_lists() + #Compile the regex here to increase performances. + self.occurence_pattern = re.compile( + r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])' + ) + self.sanity_check_pattern = re.compile(r'^\w+$') + self.transliterator = Transliterator.createFromRules("special-phrases normalizer", + self.config.TERM_NORMALIZATION) + + def import_from_wiki(self, languages=None): + """ + Iterate through all specified languages and + extract corresponding special phrases from the wiki. + """ + if languages is not None and not isinstance(languages, list): + raise TypeError('The \'languages\' argument should be of type list.') + + #Get all languages to process. + languages = self._load_languages() if not languages else languages + + #Store pairs of class/type for further processing + class_type_pairs = set() + + for lang in languages: + LOG.warning('Import phrases for lang: %s', lang) + wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang) + class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) + + self._create_place_classtype_table_and_indexes(class_type_pairs) + self.db_connection.commit() + LOG.warning('Import done.') + + def _load_white_and_black_lists(self): + """ + Load white and black lists from phrases-settings.json. + """ + settings_path = (self.config.config_dir / 'phrase-settings.json').resolve() + + if self.config.PHRASE_CONFIG: + settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG) + + with open(settings_path, "r") as json_settings: + settings = json.load(json_settings) + return settings['blackList'], settings['whiteList'] + + def _load_languages(self): + """ + Get list of all languages from env config file + or default if there is no languages configured. + The system will extract special phrases only from all specified languages. + """ + default_languages = [ + 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', + 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', + 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', + 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] + return self.config.LANGUAGES or default_languages + + @staticmethod + def _get_wiki_content(lang): + """ + Request and return the wiki page's content + corresponding to special phrases for a given lang. + Requested URL Example : + https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN + """ + url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long + return get_url(url) + + def _check_sanity(self, lang, phrase_class, phrase_type): + """ + Check sanity of given inputs in case somebody added garbage in the wiki. + If a bad class/type is detected the system will exit with an error. + """ + type_matchs = self.sanity_check_pattern.findall(phrase_type) + class_matchs = self.sanity_check_pattern.findall(phrase_class) + + if len(class_matchs) < 1 or len(type_matchs) < 1: + raise UsageError("Bad class/type for language {}: {}={}".format( + lang, phrase_class, phrase_type)) + + def _process_xml_content(self, xml_content, lang): + """ + Process given xml content by extracting matching patterns. + Matching patterns are processed there and returned in a + set of class/type pairs. + """ #One match will be of format [label, class, type, operator, plural] - matches = occurence_pattern.findall(wiki_page_xml_content) + matches = self.occurence_pattern.findall(xml_content) + #Store pairs of class/type for further processing + class_type_pairs = set() for match in matches: phrase_label = match[0].strip() + normalized_label = self.transliterator.transliterate(phrase_label) phrase_class = match[1].strip() phrase_type = match[2].strip() phrase_operator = match[3].strip() - #hack around a bug where building=yes was imported withq quotes into the wiki + #hack around a bug where building=yes was imported with quotes into the wiki phrase_type = re.sub(r'\"|"', '', phrase_type) #sanity check, in case somebody added garbage in the wiki - _check_sanity(lang, phrase_class, phrase_type, sanity_check_pattern) + self._check_sanity(lang, phrase_class, phrase_type) #blacklisting: disallow certain class/type combinations - if phrase_class in BLACK_LIST.keys() and phrase_type in BLACK_LIST[phrase_class]: + if ( + phrase_class in self.black_list.keys() and + phrase_type in self.black_list[phrase_class] + ): continue #whitelisting: if class is in whitelist, allow only tags in the list - if phrase_class in WHITE_LIST.keys() and phrase_type not in WHITE_LIST[phrase_class]: + if ( + phrase_class in self.white_list.keys() and + phrase_type not in self.white_list[phrase_class] + ): continue #add class/type to the pairs dict - pairs[f'{phrase_class}|{phrase_type}'] = (phrase_class, phrase_type) + class_type_pairs.add((phrase_class, phrase_type)) - _process_amenity( - db_connection, phrase_label, phrase_class, phrase_type, phrase_operator + self._process_amenity( + phrase_label, normalized_label, phrase_class, + phrase_type, phrase_operator ) - _create_place_classtype_table_and_indexes(db_connection, config, pairs) - db_connection.commit() - LOG.warning('Import done.') - - -def _get_languages(config): - """ - Get list of all languages from env config file - or default if there is no languages configured. - The system will extract special phrases only from all specified languages. - """ - default_languages = [ - 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es', - 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', - 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', - 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - return config.LANGUAGES or default_languages - - -def _get_wiki_content(lang): - """ - Request and return the wiki page's content - corresponding to special phrases for a given lang. - Requested URL Example : - https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN - """ - url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long - return get_url(url) - - -def _check_sanity(lang, phrase_class, phrase_type, pattern): - """ - Check sanity of given inputs in case somebody added garbage in the wiki. - If a bad class/type is detected the system will exit with an error. - """ - try: - if len(pattern.findall(phrase_class)) < 1 or len(pattern.findall(phrase_type)) < 1: - sys.exit() - except SystemExit: - LOG.error("Bad class/type for language %s: %s=%s", lang, phrase_class, phrase_type) - raise - - -def _process_amenity(db_connection, phrase_label, phrase_class, phrase_type, phrase_operator): - """ - Add phrase lookup and corresponding class and type to the word table based on the operator. - """ - with db_connection.cursor() as db_cursor: - if phrase_operator == 'near': - db_cursor.execute("""SELECT getorcreate_amenityoperator( - make_standard_name(%s), %s, %s, 'near')""", - (phrase_label, phrase_class, phrase_type)) - elif phrase_operator == 'in': - db_cursor.execute("""SELECT getorcreate_amenityoperator( - make_standard_name(%s), %s, %s, 'in')""", - (phrase_label, phrase_class, phrase_type)) + return class_type_pairs + + def _process_amenity(self, phrase_label, normalized_label, + phrase_class, phrase_type, phrase_operator): + # pylint: disable-msg=too-many-arguments + """ + Add phrase lookup and corresponding class and + type to the word table based on the operator. + """ + with self.db_connection.cursor() as db_cursor: + if phrase_operator == 'near': + db_cursor.execute("""SELECT getorcreate_amenityoperator( + make_standard_name(%s), %s, %s, %s, 'near')""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + elif phrase_operator == 'in': + db_cursor.execute("""SELECT getorcreate_amenityoperator( + make_standard_name(%s), %s, %s, %s, 'in')""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + else: + db_cursor.execute("""SELECT getorcreate_amenity( + make_standard_name(%s), %s, %s, %s)""", + (phrase_label, normalized_label, phrase_class, phrase_type)) + + + def _create_place_classtype_table_and_indexes(self, class_type_pairs): + """ + Create table place_classtype for each given pair. + Also create indexes on place_id and centroid. + """ + LOG.warning('Create tables and indexes...') + + sql_tablespace = self.config.TABLESPACE_AUX_DATA + if sql_tablespace: + sql_tablespace = ' TABLESPACE '+sql_tablespace + + with self.db_connection.cursor() as db_cursor: + db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") + + for pair in class_type_pairs: + phrase_class = pair[0] + phrase_type = pair[1] + + #Table creation + self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type) + + #Indexes creation + self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type) + + #Grant access on read to the web user. + self._grant_access_to_webuser(phrase_class, phrase_type) + + with self.db_connection.cursor() as db_cursor: + db_cursor.execute("DROP INDEX idx_placex_classtype") + + + def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type): + """ + Create table place_classtype of the given phrase_class/phrase_type if doesn't exit. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(""" + CREATE TABLE IF NOT EXISTS {{}} {} + AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex + WHERE class = {{}} AND type = {{}}""".format(sql_tablespace)) + .format(Identifier(table_name), Literal(phrase_class), + Literal(phrase_type))) + + + def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type): + """ + Create indexes on centroid and place_id for the place_classtype table. + """ + index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type) + base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + #Index on centroid + if not self.db_connection.index_exists(index_prefix + 'centroid'): + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(""" + CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace)) + .format(Identifier(index_prefix + 'centroid'), + Identifier(base_table)), sql_tablespace) + + #Index on place_id + if not self.db_connection.index_exists(index_prefix + 'place_id'): + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL( + """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace)) + .format(Identifier(index_prefix + 'place_id'), + Identifier(base_table))) + + + def _grant_access_to_webuser(self, phrase_class, phrase_type): + """ + Grant access on read to the table place_classtype for the webuser. + """ + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""") + .format(Identifier(table_name), + Identifier(self.config.DATABASE_WEBUSER))) + + def _convert_php_settings_if_needed(self, file_path): + """ + Convert php settings file of special phrases to json file if it is still in php format. + """ + if not isfile(file_path): + raise UsageError(str(file_path) + ' is not a valid file.') + + file, extension = os.path.splitext(file_path) + json_file_path = Path(file + '.json').resolve() + + if extension not in('.php', '.json'): + raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.') + + if extension == '.php' and not isfile(json_file_path): + try: + subprocess.run(['/usr/bin/env', 'php', '-Cq', + (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(), + file_path], check=True) + LOG.warning('special_phrase configuration file has been converted to json.') + return json_file_path + except subprocess.CalledProcessError: + LOG.error('Error while converting %s to json.', file_path) + raise else: - db_cursor.execute("""SELECT getorcreate_amenity( - make_standard_name(%s), %s, %s)""", - (phrase_label, phrase_class, phrase_type)) - - -def _create_place_classtype_table_and_indexes(db_connection, config, pairs): - """ - Create table place_classtype for each given pair. - Also create indexes on place_id and centroid. - """ - LOG.warning('Create tables and indexes...') - - sql_tablespace = config.TABLESPACE_AUX_DATA - if sql_tablespace: - sql_tablespace = ' TABLESPACE '+sql_tablespace - - with db_connection.cursor() as db_cursor: - db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)") - - for _, pair in pairs.items(): - phrase_class = pair[0] - phrase_type = pair[1] - - #Table creation - _create_place_classtype_table( - db_connection, sql_tablespace, phrase_class, phrase_type - ) - - #Indexes creation - _create_place_classtype_indexes( - db_connection, sql_tablespace, phrase_class, phrase_type - ) - - #Grant access on read to the web user. - _grant_access_to_webuser( - db_connection, config, phrase_class, phrase_type - ) - - with db_connection.cursor() as db_cursor: - db_cursor.execute("DROP INDEX idx_placex_classtype") - - -def _create_place_classtype_table(db_connection, sql_tablespace, phrase_class, phrase_type): - """ - Create table place_classtype of the given phrase_class/phrase_type if doesn't exit. - """ - with db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(f""" - CREATE TABLE IF NOT EXISTS {{}} {sql_tablespace} - AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex - WHERE class = {{}} AND type = {{}}""") - .format(Identifier(f'place_classtype_{phrase_class}_{phrase_type}'), - Literal(phrase_class), Literal(phrase_type))) - - -def _create_place_classtype_indexes(db_connection, sql_tablespace, phrase_class, phrase_type): - """ - Create indexes on centroid and place_id for the place_classtype table. - """ - #Index on centroid - if not db_connection.index_exists(f'idx_place_classtype_{phrase_class}_{phrase_type}_centroid'): - with db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(f""" - CREATE INDEX {{}} ON {{}} USING GIST (centroid) {sql_tablespace}""") - .format(Identifier( - f"""idx_place_classtype_{phrase_class}_{phrase_type}_centroid"""), - Identifier(f'place_classtype_{phrase_class}_{phrase_type}'))) - - #Index on place_id - if not db_connection.index_exists(f'idx_place_classtype_{phrase_class}_{phrase_type}_place_id'): - with db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(f""" - CREATE INDEX {{}} ON {{}} USING btree(place_id) {sql_tablespace}""") - .format(Identifier( - f"""idx_place_classtype_{phrase_class}_{phrase_type}_place_id"""), - Identifier(f'place_classtype_{phrase_class}_{phrase_type}'))) - - -def _grant_access_to_webuser(db_connection, config, phrase_class, phrase_type): - """ - Grant access on read to the table place_classtype for the webuser. - """ - with db_connection.cursor() as db_cursor: - db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""") - .format(Identifier(f'place_classtype_{phrase_class}_{phrase_type}'), - Identifier(config.DATABASE_WEBUSER))) + return json_file_path