self.black_list, self.white_list = self._load_white_and_black_lists()
#Compile the regex here to improve performance.
self.occurence_pattern = re.compile(
- r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
+ r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.sanity_check_pattern = re.compile(r'^\w+$')
self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
#This set will contain all existing place_classtype tables which don't match any
#special phrases class/type on the wiki.
self.table_phrases_to_delete = set()
- self.table_phrases = set()
def import_from_wiki(self, languages=None):
"""
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.')
- self.fetch_existing_words_phrases()
- self.fetch_existing_place_classtype_tables()
+ self._fetch_existing_words_phrases()
+ self._fetch_existing_place_classtype_tables()
#Get all languages to process.
languages = self._load_languages() if not languages else languages
class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
self._create_place_classtype_table_and_indexes(class_type_pairs)
- self.remove_non_existent_phrases_from_db()
+ self._remove_non_existent_phrases_from_db()
self.db_connection.commit()
LOG.warning('Import done.')
- def fetch_existing_words_phrases(self):
+ def _fetch_existing_words_phrases(self):
"""
Fetch existing special phrases from the word table.
Fill the word_phrases_to_delete set of the class.
(row[0], row[1], row[2], row[3])
)
- def fetch_existing_place_classtype_tables(self):
+ def _fetch_existing_place_classtype_tables(self):
"""
Fetch existing place_classtype tables.
Fill the table_phrases_to_delete set of the class.
if self.config.PHRASE_CONFIG:
settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
- with open(settings_path, "r") as json_settings:
+ with settings_path.open("r") as json_settings:
settings = json.load(json_settings)
return settings['blackList'], settings['whiteList']
class_matchs = self.sanity_check_pattern.findall(phrase_class)
if len(class_matchs) < 1 or len(type_matchs) < 1:
- raise UsageError("Bad class/type for language {}: {}={}".format(
- lang, phrase_class, phrase_type))
+ LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
+ lang, phrase_class, phrase_type)
+ return False
+ return True
def _process_xml_content(self, xml_content, lang):
"""
continue
#sanity check, in case somebody added garbage in the wiki
- self._check_sanity(lang, phrase_class, phrase_type)
+ if not self._check_sanity(lang, phrase_class, phrase_type):
+ continue
class_type_pairs.add((phrase_class, phrase_type))
"""
index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
- #Index on centroidself.table_phrases_to_delete.add(row)
+ #Index on centroid
if not self.db_connection.index_exists(index_prefix + 'centroid'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("""
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
- def remove_non_existent_phrases_from_db(self):
+ def _remove_non_existent_phrases_from_db(self):
"""
Remove special phrases which no longer exist on the wiki.
Delete from the word table and delete the place_classtype tables.