"""
import logging
import os
+from pathlib import Path
import re
import subprocess
import json
from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url
+from nominatim.errors import UsageError
LOG = logging.getLogger()
class SpecialPhrasesImporter():
self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
self.config.TERM_NORMALIZATION)
+ #This set will contain all existing phrases from the word table which
+ #no longer exist on the wiki.
+ #It contains tuples with the following format: (normalized_word, class, type, operator)
+ self.words_phrases_to_delete = set()
+ #This set will contain the phrases which still exist on the wiki.
+ #It is used to cope with duplicated phrases on the wiki: its content is
+ #removed from words_phrases_to_delete only at the end of the import.
+ self.words_phrases_still_exist = set()
+ #This set will contain all existing place_classtype tables which don't match any
+ #special phrases class/type on the wiki.
+ self.table_phrases_to_delete = set()
def import_from_wiki(self, languages=None):
"""
extract corresponding special phrases from the wiki.
"""
if languages is not None and not isinstance(languages, list):
- raise TypeError('languages argument should be of type list')
+ raise TypeError('The \'languages\' argument should be of type list.')
+
+ self._fetch_existing_words_phrases()
+ self._fetch_existing_place_classtype_tables()
#Get all languages to process.
languages = self._load_languages() if not languages else languages
- #array for pairs of class/type
+ #Store pairs of class/type for further processing
class_type_pairs = set()
for lang in languages:
LOG.warning('Import phrases for lang: %s', lang)
wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
- self._process_xml_content(wiki_page_xml_content, lang)
+ class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
self._create_place_classtype_table_and_indexes(class_type_pairs)
+ self._remove_non_existent_phrases_from_db()
self.db_connection.commit()
LOG.warning('Import done.')
+ def _fetch_existing_words_phrases(self):
+ """
+ Fetch existing special phrases from the word table.
+ Fill the word_phrases_to_delete set of the class.
+ """
+ #Only extract special phrases terms:
+ #If class=place and type=house then it is a housenumber term.
+ #If class=place and type=postcode then it is a postcode term.
+ word_query = """
+ SELECT word, class, type, operator FROM word
+ WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
+ """
+ with self.db_connection.cursor() as db_cursor:
+ db_cursor.execute(SQL(word_query))
+ for row in db_cursor:
+ row[3] = '-' if row[3] is None else row[3]
+ self.words_phrases_to_delete.add(
+ (row[0], row[1], row[2], row[3])
+ )
+
+ def _fetch_existing_place_classtype_tables(self):
+ """
+ Fetch existing place_classtype tables.
+ Fill the table_phrases_to_delete set of the class.
+ """
+ query = """
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_schema='public'
+ AND table_name like 'place_classtype_%';
+ """
+ with self.db_connection.cursor() as db_cursor:
+ db_cursor.execute(SQL(query))
+ for row in db_cursor:
+ self.table_phrases_to_delete.add(row[0])
+
def _load_white_and_black_lists(self):
"""
Load white and black lists from phrases-settings.json.
"""
- settings_path = str(self.config.config_dir)+'/phrase-settings.json'
+ settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
if self.config.PHRASE_CONFIG:
settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
- return self.config.LANGUAGES or default_languages
+ return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
class_matchs = self.sanity_check_pattern.findall(phrase_class)
if len(class_matchs) < 1 or len(type_matchs) < 1:
- LOG.error("Bad class/type for language %s: %s=%s", lang, phrase_class, phrase_type)
+ raise UsageError("Bad class/type for language {}: {}={}".format(
+ lang, phrase_class, phrase_type))
def _process_xml_content(self, xml_content, lang):
+ """
+ Process given xml content by extracting matching patterns.
+ Matching patterns are processed there and returned in a
+ set of class/type pairs.
+ """
#One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml_content)
+ #Store pairs of class/type for further processing
class_type_pairs = set()
for match in matches:
phrase_class = match[1].strip()
phrase_type = match[2].strip()
phrase_operator = match[3].strip()
+ #Needed if some operator in the wiki are not written in english
+ phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
#hack around a bug where building=yes was imported with quotes into the wiki
phrase_type = re.sub(r'\"|"', '', phrase_type)
- #sanity check, in case somebody added garbage in the wiki
- self._check_sanity(lang, phrase_class, phrase_type)
-
#blacklisting: disallow certain class/type combinations
if (
phrase_class in self.black_list.keys() and
):
continue
- #add class/type to the pairs dict
+ #Check if the phrase already exists in the database.
+ if (
+ (normalized_label, phrase_class, phrase_type, phrase_operator)
+ in self.words_phrases_to_delete
+ ):
+ #Remove this phrase from the ones to delete as it still exist on the wiki.
+ self.words_phrases_still_exist.add(
+ (normalized_label, phrase_class, phrase_type, phrase_operator)
+ )
+ class_type_pairs.add((phrase_class, phrase_type))
+ #Dont need to add this phrase as it already exists in the word table.
+ continue
+
+ #sanity check, in case somebody added garbage in the wiki
+ self._check_sanity(lang, phrase_class, phrase_type)
+
class_type_pairs.add((phrase_class, phrase_type))
self._process_amenity(
phrase_class = pair[0]
phrase_type = pair[1]
+ table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
+
+ if table_name in self.table_phrases_to_delete:
+ #Remove this table from the ones to delete as it match a class/type
+ #still existing on the special phrases of the wiki.
+ self.table_phrases_to_delete.remove(table_name)
+ #So dont need to create the table and indexes.
+ continue
+
#Table creation
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
+ def _remove_non_existent_phrases_from_db(self):
+ """
+ Remove special phrases which doesn't exist on the wiki anymore.
+ Delete from the word table and delete the place_classtype tables.
+ """
+ LOG.warning('Cleaning database...')
+ self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
+ #Array containing all queries to execute. Contain tuples of format (query, parameters)
+ queries_parameters = []
+
+ #Delete phrases from the word table which are not on the wiki anymore.
+ for phrase_to_delete in self.words_phrases_to_delete:
+ if phrase_to_delete[3] == '-':
+ query = """
+ DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
+ """
+ parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
+ queries_parameters.append((query, parameters))
+ else:
+ query = """
+ DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
+ """
+ parameters = (phrase_to_delete[0], phrase_to_delete[1],
+ phrase_to_delete[2], phrase_to_delete[3], )
+ queries_parameters.append((query, parameters))
+
+ #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
+ for table in self.table_phrases_to_delete:
+ query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
+ queries_parameters.append((query, ()))
+
+ with self.db_connection.cursor() as db_cursor:
+ for query, parameters in queries_parameters:
+ db_cursor.execute(query, parameters)
+
def _convert_php_settings_if_needed(self, file_path):
"""
Convert php settings file of special phrases to json file if it is still in php format.
"""
+ if not isfile(file_path):
+ raise UsageError(str(file_path) + ' is not a valid file.')
+
file, extension = os.path.splitext(file_path)
- json_file_path = file + '.json'
+ json_file_path = Path(file + '.json').resolve()
+
+ if extension not in('.php', '.json'):
+ raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
+
if extension == '.php' and not isfile(json_file_path):
try:
subprocess.run(['/usr/bin/env', 'php', '-Cq',
- self.phplib_dir / 'migration/phraseSettingsToJson.php',
+ (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
file_path], check=True)
LOG.warning('special_phrase configuration file has been converted to json.')
return json_file_path