X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2c19bd5ea38fb7474d0ca530fdab7ae76cee965b..fb3353b854f40b8fd97ea942d3d9814e80e14779:/nominatim/tools/special_phrases.py diff --git a/nominatim/tools/special_phrases.py b/nominatim/tools/special_phrases.py index 3faeefbd..0c1258fe 100644 --- a/nominatim/tools/special_phrases.py +++ b/nominatim/tools/special_phrases.py @@ -3,6 +3,7 @@ """ import logging import os +from pathlib import Path import re import subprocess import json @@ -10,6 +11,7 @@ from os.path import isfile from icu import Transliterator from psycopg2.sql import Identifier, Literal, SQL from nominatim.tools.exec_utils import get_url +from nominatim.errors import UsageError LOG = logging.getLogger() class SpecialPhrasesImporter(): @@ -30,6 +32,17 @@ class SpecialPhrasesImporter(): self.sanity_check_pattern = re.compile(r'^\w+$') self.transliterator = Transliterator.createFromRules("special-phrases normalizer", self.config.TERM_NORMALIZATION) + #This set will contain all existing phrases from the word table which + #no longer exist on the wiki. + #It contain tuples with the following format: (normalized_word, class, type, operator) + self.words_phrases_to_delete = set() + #This set will contain the phrases which still exist from the wiki. + #It is used to prevent duplicates on the wiki by removing them from + #the word_phrases_to_delete only at the end. + self.words_phrases_still_exist = set() + #This set will contain all existing place_classtype tables which doesn't match any + #special phrases class/type on the wiki. + self.table_phrases_to_delete = set() def import_from_wiki(self, languages=None): """ @@ -37,28 +50,68 @@ class SpecialPhrasesImporter(): extract corresponding special phrases from the wiki. """ if languages is not None and not isinstance(languages, list): - raise TypeError('languages argument should be of type list') + raise TypeError('The \'languages\' argument should be of type list.') + + self._fetch_existing_words_phrases() + self._fetch_existing_place_classtype_tables() #Get all languages to process. languages = self._load_languages() if not languages else languages - #array for pairs of class/type + #Store pairs of class/type for further processing class_type_pairs = set() for lang in languages: LOG.warning('Import phrases for lang: %s', lang) wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang) - self._process_xml_content(wiki_page_xml_content, lang) + class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) self._create_place_classtype_table_and_indexes(class_type_pairs) + self._remove_non_existent_phrases_from_db() self.db_connection.commit() LOG.warning('Import done.') + def _fetch_existing_words_phrases(self): + """ + Fetch existing special phrases from the word table. + Fill the word_phrases_to_delete set of the class. + """ + #Only extract special phrases terms: + #If class=place and type=house then it is a housenumber term. + #If class=place and type=postcode then it is a postcode term. + word_query = """ + SELECT word, class, type, operator FROM word + WHERE class != 'place' OR (type != 'house' AND type != 'postcode') + """ + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(word_query)) + for row in db_cursor: + row[3] = '-' if row[3] is None else row[3] + self.words_phrases_to_delete.add( + (row[0], row[1], row[2], row[3]) + ) + + def _fetch_existing_place_classtype_tables(self): + """ + Fetch existing place_classtype tables. + Fill the table_phrases_to_delete set of the class. + """ + query = """ + SELECT table_name + FROM information_schema.tables + WHERE table_schema='public' + AND table_name like 'place_classtype_%'; + """ + with self.db_connection.cursor() as db_cursor: + db_cursor.execute(SQL(query)) + for row in db_cursor: + self.table_phrases_to_delete.add(row[0]) + def _load_white_and_black_lists(self): """ Load white and black lists from phrases-settings.json. """ - settings_path = str(self.config.config_dir)+'/phrase-settings.json' + settings_path = (self.config.config_dir / 'phrase-settings.json').resolve() if self.config.PHRASE_CONFIG: settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG) @@ -78,7 +131,7 @@ class SpecialPhrasesImporter(): 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu', 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl', 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi'] - return self.config.LANGUAGES or default_languages + return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages @staticmethod def _get_wiki_content(lang): @@ -100,11 +153,18 @@ class SpecialPhrasesImporter(): class_matchs = self.sanity_check_pattern.findall(phrase_class) if len(class_matchs) < 1 or len(type_matchs) < 1: - LOG.error("Bad class/type for language %s: %s=%s", lang, phrase_class, phrase_type) + raise UsageError("Bad class/type for language {}: {}={}".format( + lang, phrase_class, phrase_type)) def _process_xml_content(self, xml_content, lang): + """ + Process given xml content by extracting matching patterns. + Matching patterns are processed there and returned in a + set of class/type pairs. + """ #One match will be of format [label, class, type, operator, plural] matches = self.occurence_pattern.findall(xml_content) + #Store pairs of class/type for further processing class_type_pairs = set() for match in matches: @@ -113,12 +173,11 @@ class SpecialPhrasesImporter(): phrase_class = match[1].strip() phrase_type = match[2].strip() phrase_operator = match[3].strip() + #Needed if some operator in the wiki are not written in english + phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator #hack around a bug where building=yes was imported with quotes into the wiki phrase_type = re.sub(r'\"|"', '', phrase_type) - #sanity check, in case somebody added garbage in the wiki - self._check_sanity(lang, phrase_class, phrase_type) - #blacklisting: disallow certain class/type combinations if ( phrase_class in self.black_list.keys() and @@ -132,7 +191,22 @@ class SpecialPhrasesImporter(): ): continue - #add class/type to the pairs dict + #Check if the phrase already exists in the database. + if ( + (normalized_label, phrase_class, phrase_type, phrase_operator) + in self.words_phrases_to_delete + ): + #Remove this phrase from the ones to delete as it still exist on the wiki. + self.words_phrases_still_exist.add( + (normalized_label, phrase_class, phrase_type, phrase_operator) + ) + class_type_pairs.add((phrase_class, phrase_type)) + #Dont need to add this phrase as it already exists in the word table. + continue + + #sanity check, in case somebody added garbage in the wiki + self._check_sanity(lang, phrase_class, phrase_type) + class_type_pairs.add((phrase_class, phrase_type)) self._process_amenity( @@ -182,6 +256,15 @@ class SpecialPhrasesImporter(): phrase_class = pair[0] phrase_type = pair[1] + table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) + + if table_name in self.table_phrases_to_delete: + #Remove this table from the ones to delete as it match a class/type + #still existing on the special phrases of the wiki. + self.table_phrases_to_delete.remove(table_name) + #So dont need to create the table and indexes. + continue + #Table creation self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type) @@ -242,16 +325,58 @@ class SpecialPhrasesImporter(): .format(Identifier(table_name), Identifier(self.config.DATABASE_WEBUSER))) + def _remove_non_existent_phrases_from_db(self): + """ + Remove special phrases which doesn't exist on the wiki anymore. + Delete from the word table and delete the place_classtype tables. + """ + LOG.warning('Cleaning database...') + self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist + #Array containing all queries to execute. Contain tuples of format (query, parameters) + queries_parameters = [] + + #Delete phrases from the word table which are not on the wiki anymore. + for phrase_to_delete in self.words_phrases_to_delete: + if phrase_to_delete[3] == '-': + query = """ + DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null + """ + parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], ) + queries_parameters.append((query, parameters)) + else: + query = """ + DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s + """ + parameters = (phrase_to_delete[0], phrase_to_delete[1], + phrase_to_delete[2], phrase_to_delete[3], ) + queries_parameters.append((query, parameters)) + + #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore + for table in self.table_phrases_to_delete: + query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)) + queries_parameters.append((query, ())) + + with self.db_connection.cursor() as db_cursor: + for query, parameters in queries_parameters: + db_cursor.execute(query, parameters) + def _convert_php_settings_if_needed(self, file_path): """ Convert php settings file of special phrases to json file if it is still in php format. """ + if not isfile(file_path): + raise UsageError(str(file_path) + ' is not a valid file.') + file, extension = os.path.splitext(file_path) - json_file_path = file + '.json' + json_file_path = Path(file + '.json').resolve() + + if extension not in('.php', '.json'): + raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.') + if extension == '.php' and not isfile(json_file_path): try: subprocess.run(['/usr/bin/env', 'php', '-Cq', - self.phplib_dir / 'migration/phraseSettingsToJson.php', + (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(), file_path], check=True) LOG.warning('special_phrase configuration file has been converted to json.') return json_file_path