X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/1b68152fb28f5fc146fa7e379efe8f3a5e511b26..bf864b2c54390ca47121b49476c34bd7813bc7a9:/nominatim/tools/special_phrases/special_phrases_importer.py diff --git a/nominatim/tools/special_phrases/special_phrases_importer.py b/nominatim/tools/special_phrases/special_phrases_importer.py index f4eec260..9649f94b 100644 --- a/nominatim/tools/special_phrases/special_phrases_importer.py +++ b/nominatim/tools/special_phrases/special_phrases_importer.py @@ -9,11 +9,11 @@ import re import subprocess import json -from icu import Transliterator from psycopg2.sql import Identifier, Literal, SQL from nominatim.tools.exec_utils import get_url from nominatim.errors import UsageError +from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics LOG = logging.getLogger() class SpecialPhrasesImporter(): @@ -22,6 +22,7 @@ class SpecialPhrasesImporter(): Class handling the process of special phrases importations. """ def __init__(self, config, phplib_dir, db_connection) -> None: + self.statistics_handler = SpecialPhrasesImporterStatistics() self.db_connection = db_connection self.config = config self.phplib_dir = phplib_dir @@ -31,21 +32,14 @@ class SpecialPhrasesImporter(): r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])' ) self.sanity_check_pattern = re.compile(r'^\w+$') - self.transliterator = Transliterator.createFromRules("special-phrases normalizer", - self.config.TERM_NORMALIZATION) - #This set will contain all existing phrases from the word table which - #no longer exist on the wiki. - #It contain tuples with the following format: (normalized_word, class, type, operator) - self.words_phrases_to_delete = set() - #This set will contain the phrases which still exist from the wiki. - #It is used to prevent duplicates on the wiki by removing them from - #the word_phrases_to_delete only at the end. 
- self.words_phrases_still_exist = set() + # This set will contain all existing phrases to be added. + # It contains tuples with the following format: (label, class, type, operator) + self.word_phrases = set() #This set will contain all existing place_classtype tables which doesn't match any #special phrases class/type on the wiki. self.table_phrases_to_delete = set() - def import_from_wiki(self, languages=None): + def import_from_wiki(self, tokenizer, languages=None): """ Iterate through all specified languages and extract corresponding special phrases from the wiki. @@ -53,7 +47,6 @@ class SpecialPhrasesImporter(): if languages is not None and not isinstance(languages, list): raise TypeError('The \'languages\' argument should be of type list.') - self._fetch_existing_words_phrases() self._fetch_existing_place_classtype_tables() #Get all languages to process. @@ -63,34 +56,21 @@ class SpecialPhrasesImporter(): class_type_pairs = set() for lang in languages: - LOG.warning('Import phrases for lang: %s', lang) + LOG.warning('Importing phrases for lang: %s...', lang) wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang) class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang)) + self.statistics_handler.notify_current_lang_done(lang) self._create_place_classtype_table_and_indexes(class_type_pairs) - self._remove_non_existent_phrases_from_db() + self._remove_non_existent_tables_from_db() self.db_connection.commit() + + with tokenizer.name_analyzer() as analyzer: + analyzer.update_special_phrases(self.word_phrases) + LOG.warning('Import done.') + self.statistics_handler.notify_import_done() - def _fetch_existing_words_phrases(self): - """ - Fetch existing special phrases from the word table. - Fill the word_phrases_to_delete set of the class. - """ - #Only extract special phrases terms: - #If class=place and type=house then it is a housenumber term. - #If class=place and type=postcode then it is a postcode term. 
- word_query = """ - SELECT word, class, type, operator FROM word - WHERE class != 'place' OR (type != 'house' AND type != 'postcode') - """ - with self.db_connection.cursor() as db_cursor: - db_cursor.execute(SQL(word_query)) - for row in db_cursor: - row[3] = '-' if row[3] is None else row[3] - self.words_phrases_to_delete.add( - (row[0], row[1], row[2], row[3]) - ) def _fetch_existing_place_classtype_tables(self): """ @@ -172,7 +152,6 @@ class SpecialPhrasesImporter(): for match in matches: phrase_label = match[0].strip() - normalized_label = self.transliterator.transliterate(phrase_label) phrase_class = match[1].strip() phrase_type = match[2].strip() phrase_operator = match[3].strip() @@ -194,53 +173,18 @@ class SpecialPhrasesImporter(): ): continue - #Check if the phrase already exists in the database. - if ( - (normalized_label, phrase_class, phrase_type, phrase_operator) - in self.words_phrases_to_delete - ): - #Remove this phrase from the ones to delete as it still exist on the wiki. - self.words_phrases_still_exist.add( - (normalized_label, phrase_class, phrase_type, phrase_operator) - ) - class_type_pairs.add((phrase_class, phrase_type)) - #Dont need to add this phrase as it already exists in the word table. - continue - #sanity check, in case somebody added garbage in the wiki if not self._check_sanity(lang, phrase_class, phrase_type): + self.statistics_handler.notify_one_phrase_invalid() continue class_type_pairs.add((phrase_class, phrase_type)) - self._process_amenity( - phrase_label, normalized_label, phrase_class, - phrase_type, phrase_operator - ) + self.word_phrases.add((phrase_label, phrase_class, + phrase_type, phrase_operator)) return class_type_pairs - def _process_amenity(self, phrase_label, normalized_label, - phrase_class, phrase_type, phrase_operator): - # pylint: disable-msg=too-many-arguments - """ - Add phrase lookup and corresponding class and - type to the word table based on the operator. 
- """ - with self.db_connection.cursor() as db_cursor: - if phrase_operator == 'near': - db_cursor.execute("""SELECT getorcreate_amenityoperator( - make_standard_name(%s), %s, %s, %s, 'near')""", - (phrase_label, normalized_label, phrase_class, phrase_type)) - elif phrase_operator == 'in': - db_cursor.execute("""SELECT getorcreate_amenityoperator( - make_standard_name(%s), %s, %s, %s, 'in')""", - (phrase_label, normalized_label, phrase_class, phrase_type)) - else: - db_cursor.execute("""SELECT getorcreate_amenity( - make_standard_name(%s), %s, %s, %s)""", - (phrase_label, normalized_label, phrase_class, phrase_type)) - def _create_place_classtype_table_and_indexes(self, class_type_pairs): """ @@ -263,6 +207,7 @@ class SpecialPhrasesImporter(): table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type) if table_name in self.table_phrases_to_delete: + self.statistics_handler.notify_one_table_ignored() #Remove this table from the ones to delete as it match a class/type #still existing on the special phrases of the wiki. self.table_phrases_to_delete.remove(table_name) @@ -278,6 +223,8 @@ class SpecialPhrasesImporter(): #Grant access on read to the web user. self._grant_access_to_webuser(phrase_class, phrase_type) + self.statistics_handler.notify_one_table_created() + with self.db_connection.cursor() as db_cursor: db_cursor.execute("DROP INDEX idx_placex_classtype") @@ -329,34 +276,18 @@ class SpecialPhrasesImporter(): .format(Identifier(table_name), Identifier(self.config.DATABASE_WEBUSER))) - def _remove_non_existent_phrases_from_db(self): + def _remove_non_existent_tables_from_db(self): """ Remove special phrases which doesn't exist on the wiki anymore. - Delete from the word table and delete the place_classtype tables. + Delete the place_classtype tables. """ LOG.warning('Cleaning database...') - self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist #Array containing all queries to execute. 
Contain tuples of format (query, parameters) queries_parameters = [] - #Delete phrases from the word table which are not on the wiki anymore. - for phrase_to_delete in self.words_phrases_to_delete: - if phrase_to_delete[3] == '-': - query = """ - DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null - """ - parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], ) - queries_parameters.append((query, parameters)) - else: - query = """ - DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s - """ - parameters = (phrase_to_delete[0], phrase_to_delete[1], - phrase_to_delete[2], phrase_to_delete[3], ) - queries_parameters.append((query, parameters)) - #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore for table in self.table_phrases_to_delete: + self.statistics_handler.notify_one_table_deleted() query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table)) queries_parameters.append((query, ()))