X-Git-Url: https://git.openstreetmap.org./osqa.git/blobdiff_plain/23b2a2bfa95f12d0333a82c6e3d34456698522b7..7be81dfc799101c3d7e646d76b07ac30fc39bf61:/forum_modules/exporter/importer.py

diff --git a/forum_modules/exporter/importer.py b/forum_modules/exporter/importer.py
index 35b42b1..c6b60ab 100644
--- a/forum_modules/exporter/importer.py
+++ b/forum_modules/exporter/importer.py
@@ -1,13 +1,39 @@
-import os, tarfile, datetime
+from __future__ import with_statement
+
+import os, tarfile, datetime, ConfigParser, logging
+
+from django.utils.translation import ugettext as _
+from django.core.cache import cache
+
+from south.db import db
 
 from xml.sax import make_parser
 from xml.sax.handler import ContentHandler, ErrorHandler
 
-from exporter import TMP_FOLDER, DATETIME_FORMAT, DATE_FORMAT
+from forum.templatetags.extra_tags import diff_date
+
+from exporter import TMP_FOLDER, DATETIME_FORMAT, DATE_FORMAT, META_INF_SECTION, CACHE_KEY
 from orm import orm
 
+import commands, settings
 
 NO_DEFAULT = object()
 
+import string
+
+class SafeReader():
+    def __init__(self, loc):
+        self.base = open(loc)
+
+    def read(self, *args):
+        return "".join(c for c in self.base.read(*args) if c in string.printable)
+
+    def readline(self, *args):
+        return "".join(c for c in self.base.readline(*args) if c in string.printable)
+
+    def close(self):
+        self.base.close()
+
+
 class ContentElement():
     def __init__(self, content):
         self._content = content
@@ -52,14 +78,14 @@ class RowElement(ContentElement):
         self.name = name.lower()
         self.parent = parent
         self.attrs = dict([(k.lower(), ContentElement(v)) for k, v in attrs.items()])
-        self._content = ''
+        self._content = u''
         self.sub_elements = {}
 
         if parent:
             parent.add(self)
 
     def add_to_content(self, ch):
-        self._content += ch
+        self._content += unicode(ch)
 
     def add(self, sub):
         curr = self.sub_elements.get(sub.name, None)
@@ -121,11 +147,12 @@ class RowElement(ContentElement):
 
 class TableHandler(ContentHandler):
 
-    def __init__(self, root_name, row_name, callback, callback_args = []):
+    def __init__(self, root_name, row_name, callback, callback_args = [], ping = None):
         self.root_name = root_name.lower()
         self.row_name = row_name.lower()
         self.callback = callback
         self.callback_args = callback_args
+        self.ping = ping
 
         self._reset()
 
@@ -154,6 +181,9 @@ class TableHandler(ContentHandler):
             pass
         elif name == self.row_name:
             self.callback(self.curr_element, *self.callback_args)
+            if self.ping:
+                self.ping()
+
             self._reset()
         else:
             self.curr_element = self.curr_element.parent
@@ -169,119 +199,272 @@ class SaxErrorHandler(ErrorHandler):
     def warning(self, e):
         raise e
 
+def disable_triggers():
+    if db.backend_name == "postgres":
+        db.start_transaction()
+        db.execute_many(commands.PG_DISABLE_TRIGGERS)
+        db.commit_transaction()
+
+def enable_triggers():
+    if db.backend_name == "postgres":
+        db.start_transaction()
+        db.execute_many(commands.PG_ENABLE_TRIGGERS)
+        db.commit_transaction()
+
+def reset_sequences():
+    if db.backend_name == "postgres":
+        db.start_transaction()
+        db.execute_many(commands.PG_SEQUENCE_RESETS)
+        db.commit_transaction()
+
+def reset_fts_indexes():
+    pass
+
 FILE_HANDLERS = []
 
-def start_import(fname, user):
+def start_import(fname, tag_merge, user):
+
+    start_time = datetime.datetime.now()
+    steps = [s for s in FILE_HANDLERS]
+
+    with open(os.path.join(TMP_FOLDER, 'backup.inf'), 'r') as inffile:
+        inf = ConfigParser.SafeConfigParser()
+        inf.readfp(inffile)
+
+    state = dict([(s['id'], {
+        'status': _('Queued'), 'count': int(inf.get(META_INF_SECTION, s['id'])), 'parsed': 0
+    }) for s in steps] + [
+        ('overall', {
+            'status': _('Starting'), 'count': int(inf.get(META_INF_SECTION, 'overall')), 'parsed': 0
+        })
+    ])
+
+    full_state = dict(running=True, state=state, time_started="")
+
+    def set_state():
+        full_state['time_started'] = diff_date(start_time)
+        cache.set(CACHE_KEY, full_state)
+
+    set_state()
+
+    def ping_state(name):
+        state[name]['parsed'] += 1
+        state['overall']['parsed'] += 1
+        set_state()
+
+    data = {
+        'is_merge': True,
+        'tag_merge': tag_merge
+    }
+
+    def run(fn, name, title):
+        def ping():
+            ping_state(name)
+
+        state['overall']['status'] = _('Importing %s') % title
+        state[name]['status'] = _('Importing')
+
+        fn(TMP_FOLDER, user, ping, data)
+
+        state[name]['status'] = _('Done')
+
+        set_state()
+
+        return fname
+
     #dump = tarfile.open(fname, 'r')
     #dump.extractall(TMP_FOLDER)
 
-    for h in FILE_HANDLERS:
-        h(TMP_FOLDER, user)
+    try:
+
+        disable_triggers()
+        db.start_transaction()
+
+        for h in FILE_HANDLERS:
+            run(h['fn'], h['id'], h['name'])
+
+        db.commit_transaction()
+        enable_triggers()
 
-def file_handler(file_name, root_tag, el_tag, args_handler=None, pre_callback=None, post_callback=None):
+        settings.MERGE_MAPPINGS.set_value(dict(merged_nodes=data['nodes_map'], merged_users=data['users_map']))
+
+        reset_sequences()
+    except Exception, e:
+        full_state['running'] = False
+        full_state['errors'] = "%s: %s" % (e.__class__.__name__, unicode(e))
+        set_state()
+
+        import traceback
+        logging.error("Error executing xml import: \n %s" % (traceback.format_exc()))
+
+def file_handler(file_name, root_tag, el_tag, name, args_handler=None, pre_callback=None, post_callback=None):
     def decorator(fn):
-        def decorated(location, current_user):
+        def decorated(location, current_user, ping, data):
             if pre_callback:
-                pre_callback(current_user)
+                pre_callback(current_user, data)
 
             if (args_handler):
-                args = args_handler(current_user)
+                args = args_handler(current_user, data)
             else:
                 args = []
 
             parser = make_parser()
-            handler = TableHandler(root_tag, el_tag, fn, args)
+            handler = TableHandler(root_tag, el_tag, fn, args, ping)
             parser.setContentHandler(handler)
             #parser.setErrorHandler(SaxErrorHandler())
 
-            parser.parse(os.path.join(location, file_name))
+            parser.parse(SafeReader(os.path.join(location, file_name)))
 
             if post_callback:
                 post_callback()
 
-        FILE_HANDLERS.append(decorated)
+        FILE_HANDLERS.append(dict(id=root_tag, name=name, fn=decorated))
         return decorated
 
     return decorator
 
+def verify_existence(row):
+    try:
+        return orm.User.objects.get(email=row.getc('email'))
+    except:
+        for key in row.get('authKeys').get_list('key'):
+            key = key.getc('key')
 
-@file_handler('users.xml', 'users', 'user', args_handler=lambda u: [u])
-def user_import(row, current_user):
-    if str(current_user.id) == row.getc('id'):
-        return
+            if not ("google.com" in key or "yahoo.com" in key):
+                try:
+                    return orm.AuthKeyUserAssociation.objects.get(key=key).user
+                except:
+                    pass
+
+    return None
+
+def user_import_pre_callback(user, data):
+    data['users_map'] = {}
+
+@file_handler('users.xml', 'users', 'user', _('Users'), pre_callback=user_import_pre_callback, args_handler=lambda u, d: [u, d['is_merge'], d['users_map']])
+def user_import(row, current_user, is_merge, users_map):
+    existent = is_merge and verify_existence(row) or None
 
     roles = row.get('roles').get_listc('role')
     valid_email = row.get('email').get_attr('validated').as_bool()
     badges = row.get('badges')
 
-    user = orm.User(
-        id           = row.getc('id'),
-        username     = row.getc('username'),
-        password     = row.getc('password'),
-        email        = row.getc('email'),
-        email_isvalid= valid_email,
-        is_superuser = 'superuser' in roles,
-        is_staff     = 'moderator' in roles,
-        is_active    = True,
-        date_joined  = row.get('joindate').as_datetime(),
-        about        = row.getc('bio'),
-        date_of_birth = row.get('birthdate').as_date(None),
-        website      = row.getc('website'),
-        reputation   = row.get('reputation').as_int(),
-        gold         = badges.get_attr('gold').as_int(),
-        silver       = badges.get_attr('silver').as_int(),
-        bronze       = badges.get_attr('bronze').as_int(),
-        real_name    = row.getc('realname'),
-        location     = row.getc('location'),
-    )
+    if existent:
+        user = existent
+
+        user.reputation += row.get('reputation').as_int()
+        user.gold += badges.get_attr('gold').as_int()
+        user.silver += badges.get_attr('silver').as_int()
+        user.bronze += badges.get_attr('bronze').as_int()
+
+    else:
+        username = row.getc('username')
+
+        if is_merge:
+            username_count = 0
+
+            while orm.User.objects.filter(username=username).count():
+                username_count += 1
+                username = "%s %s" % (row.getc('username'), username_count)
+
+        user = orm.User(
+            id           = (not is_merge) and row.getc('id') or None,
+            username     = username,
+            password     = row.getc('password'),
+            email        = row.getc('email'),
+            email_isvalid= valid_email,
+            is_superuser = (not is_merge) and 'superuser' in roles,
+            is_staff     = ('moderator' in roles) or (is_merge and 'superuser' in roles),
+            is_active    = row.get('active').as_bool(),
+            date_joined  = row.get('joindate').as_datetime(),
+            about        = row.getc('bio'),
+            date_of_birth = row.get('birthdate').as_date(None),
+            website      = row.getc('website'),
+            reputation   = row.get('reputation').as_int(),
+            gold         = badges.get_attr('gold').as_int(),
+            silver       = badges.get_attr('silver').as_int(),
+            bronze       = badges.get_attr('bronze').as_int(),
+            real_name    = row.getc('realname'),
+            location     = row.getc('location'),
+        )
 
     user.save()
 
+    users_map[row.get('id').as_int()] = user.id
+
     authKeys = row.get('authKeys')
 
     for key in authKeys.get_list('key'):
-        orm.AuthKeyUserAssociation(user=user, key=key.getc('key'), provider=key.getc('provider')).save()
+        if (not is_merge) or orm.AuthKeyUserAssociation.objects.filter(key=key.getc('key')).count() == 0:
+            orm.AuthKeyUserAssociation(user=user, key=key.getc('key'), provider=key.getc('provider')).save()
 
-    notifications = row.get('notifications')
+    if not existent:
+        notifications = row.get('notifications')
 
-    attributes = dict([(str(k), v.as_bool() and 'i' or 'n') for k, v in notifications.get('notify').attrs.items()])
-    attributes.update(dict([(str(k), v.as_bool()) for k, v in notifications.get('autoSubscribe').attrs.items()]))
-    attributes.update(dict([(str("notify_%s" % k), v.as_bool()) for k, v in notifications.get('notifyOnSubscribed').attrs.items()]))
+        attributes = dict([(str(k), v.as_bool() and 'i' or 'n') for k, v in notifications.get('notify').attrs.items()])
+        attributes.update(dict([(str(k), v.as_bool()) for k, v in notifications.get('autoSubscribe').attrs.items()]))
+        attributes.update(dict([(str("notify_%s" % k), v.as_bool()) for k, v in notifications.get('notifyOnSubscribed').attrs.items()]))
 
-    orm.SubscriptionSettings(user=user, enable_notifications=notifications.get_attr('enabled').as_bool(), **attributes).save()
+        ss = orm.SubscriptionSettings(user=user, enable_notifications=notifications.get_attr('enabled').as_bool(), **attributes)
 
-def pre_tag_import(user):
-    tag_import.tag_mappings={}
+        if current_user.id == row.get('id').as_int():
+            ss.id = current_user.subscription_settings.id
 
+        ss.save()
+
 
-@file_handler('tags.xml', 'tags', 'tag', pre_callback=pre_tag_import)
-def tag_import(row):
-    tag = orm.Tag(name=row.getc('name'), used_count=row.get('used').as_int(), created_by_id=row.get('author').as_int())
-    tag.save()
-    tag_import.tag_mappings[tag.name] = tag
+def pre_tag_import(user, data):
+    data['tag_mappings'] = dict([ (t.name, t) for t in orm.Tag.objects.all() ])
+
+
+@file_handler('tags.xml', 'tags', 'tag', _('Tags'), pre_callback=pre_tag_import, args_handler=lambda u, d: [d['is_merge'], d['tag_merge'], d['users_map'], d['tag_mappings']])
+def tag_import(row, is_merge, tag_merge, users_map, tag_mappings):
+    created_by = row.get('author').as_int()
+    created_by = users_map.get(created_by, created_by)
+    tag_name = row.getc('name')
+    tag_name = tag_merge and tag_merge.get(tag_name, tag_name) or tag_name
 
-def post_node_import():
-    tag_import.tag_mappings = None
+    if is_merge and tag_name in tag_mappings:
+        tag = tag_mappings[tag_name]
+        tag.used_count += row.get('used').as_int()
+    else:
+        tag = orm.Tag(name=tag_name, used_count=row.get('used').as_int(), created_by_id=created_by)
+        tag_mappings[tag.name] = tag
 
-@file_handler('nodes.xml', 'nodes', 'node', args_handler=lambda u: [tag_import.tag_mappings], post_callback=post_node_import)
-def node_import(row, tags):
+    tag.save()
+
+def pre_node_import(user, data):
+    data['nodes_map'] = {}
+
+@file_handler('nodes.xml', 'nodes', 'node', _('Nodes'), pre_callback=pre_node_import,
+              args_handler=lambda u, d: [d['is_merge'], d['tag_merge'], d['tag_mappings'], d['nodes_map'], d['users_map']])
+def node_import(row, is_merge, tag_merge, tags, nodes_map, users_map):
     ntags = []
 
     for t in row.get('tags').get_list('tag'):
-        ntags.append(tags[t.content()])
+        t = t.content()
+        ntags.append(tags[tag_merge and tag_merge.get(t, t) or t])
+
+    author = row.get('author').as_int()
 
     last_act = row.get('lastactivity')
+    last_act_user = last_act.get('by').as_int(None)
+
+    parent = row.get('parent').as_int(None)
+    abs_parent = row.get('absparent').as_int(None)
 
     node = orm.Node(
-            id            = row.getc('id'),
+            id            = (not is_merge) and row.getc('id') or None,
             node_type     = row.getc('type'),
-            author_id     = row.get('author').as_int(),
+            author_id     = users_map.get(author, author),
             added_at      = row.get('date').as_datetime(),
-            parent_id     = row.get('parent').as_int(None),
-            abs_parent_id = row.get('absparent').as_int(None),
+            parent_id     = nodes_map.get(parent, parent),
+            abs_parent_id = nodes_map.get(abs_parent, abs_parent),
+
             score         = row.get('score').as_int(0),
 
-            last_activity_by_id = last_act.get('by').as_int(None),
+            last_activity_by_id = last_act_user and users_map.get(last_act_user, last_act_user) or last_act_user,
             last_activity_at    = last_act.get('at').as_datetime(None),
 
             title         = row.getc('title'),
@@ -295,26 +478,45 @@ def node_import(row, tags):
     )
 
     node.save()
+
+    nodes_map[row.get('id').as_int()] = node.id
+
     node.tags = ntags
 
     revisions = row.get('revisions')
     active = revisions.get_attr('active').as_int()
 
-    for r in revisions.get_list('revision'):
-        rev = orm.NodeRevision(
-            author_id  = r.getc('author'),
-            body       = r.getc('body'),
+    if active == 0:
+        active = orm.NodeRevision(
+            author_id  = node.author_id,
+            body       = row.getc('body'),
             node       = node,
-            revised_at = r.get('date').as_datetime(),
-            revision   = r.get('number').as_int(),
-            summary    = r.getc('summary'),
-            tagnames   = " ".join(r.getc('tags').split(',')),
-            title      = r.getc('title'),
+            revised_at = row.get('date').as_datetime(),
+            revision   = 1,
+            summary    = _('Initial revision'),
+            tagnames   = " ".join([t.name for t in ntags]),
+            title      = row.getc('title'),
         )
 
-        rev.save()
-        if rev.revision == active:
-            active = rev
+        active.save()
+    else:
+        for r in revisions.get_list('revision'):
+            author = r.get('author').as_int()
+
+            rev = orm.NodeRevision(
+                author_id  = users_map.get(author, author),
+                body       = r.getc('body'),
+                node       = node,
+                revised_at = r.get('date').as_datetime(),
+                revision   = r.get('number').as_int(),
+                summary    = r.getc('summary'),
+                tagnames   = " ".join(r.getc('tags').split(',')),
+                title      = r.getc('title'),
+            )
+
+            rev.save()
+            if rev.revision == active:
+                active = rev
 
     node.active_revision = active
     node.save()
@@ -328,6 +530,9 @@ def post_action(*types):
         return fn
     return decorator
 
+def pre_action_import_callback(user, data):
+    data['actions_map'] = {}
+
 def post_action_import_callback():
     with_state = orm.Node.objects.filter(id__in=orm.NodeState.objects.values_list('node_id', flat=True).distinct())
 
@@ -335,52 +540,73 @@ def post_action_import_callback():
         n.state_string = "".join(["(%s)" % s for s in n.states.values_list('state_type')])
         n.save()
 
-@file_handler('actions.xml', 'actions', 'action', post_callback=post_action_import_callback)
-def actions_import(row):
+@file_handler('actions.xml', 'actions', 'action', _('Actions'), post_callback=post_action_import_callback,
+              pre_callback=pre_action_import_callback, args_handler=lambda u, d: [d['nodes_map'], d['users_map'], d['actions_map']])
+def actions_import(row, nodes, users, actions_map):
+    node = row.get('node').as_int(None)
+    user = row.get('user').as_int()
+    real_user = row.get('realUser').as_int(None)
+
     action = orm.Action(
-        id           = row.get('id').as_int(),
+        #id           = row.get('id').as_int(),
        action_type  = row.getc('type'),
        action_date  = row.get('date').as_datetime(),
-        node_id      = row.get('node').as_int(None),
-        user_id      = row.get('user').as_int(),
-        real_user_id = row.get('realUser').as_int(None),
+        node_id      = nodes.get(node, node),
+        user_id      = users.get(user, user),
+        real_user_id = users.get(real_user, real_user),
        ip           = row.getc('ip'),
        extra        = row.get('extraData').as_pickled(),
     )
 
     canceled = row.get('canceled')
 
     if canceled.get_attr('state').as_bool():
-        action.canceled_by_id = canceled.get('user').as_int()
-        action.canceled_at    = canceled.get('date').as_datetime(),
+        by = canceled.get('user').as_int()
+        action.canceled = True
+        action.canceled_by_id = users.get(by, by)
+        action.canceled_at    = canceled.getc('date') #.as_datetime(),
        action.canceled_ip    = canceled.getc('ip')
 
     action.save()
 
+    actions_map[row.get('id').as_int()] = action.id
+
     for r in row.get('reputes').get_list('repute'):
         by_canceled = r.get_attr('byCanceled').as_bool()
 
         orm.ActionRepute(
             action = action,
-            user_id = r.get('user').as_int(),
+            user_id = users[r.get('user').as_int()],
             value = r.get('value').as_int(),
 
            date = by_canceled and action.canceled_at or action.action_date,
            by_canceled = by_canceled
         ).save()
 
-    if (not action.canceled) and action.action_type in POST_ACTION:
-        POST_ACTION[action.action_type](row, action)
+    if (not action.canceled) and (action.action_type in POST_ACTION):
+        POST_ACTION[action.action_type](row, action, users, nodes, actions_map)
 
+# Record of all persisted votes.
+persisted_votes = []
 
 @post_action('voteup', 'votedown', 'voteupcomment')
-def vote_action(row, action):
-    orm.Vote(user_id=action.user_id, node_id=action.node_id, action=action,
-             voted_at=action.action_date, value=(action.action_type != 'votedown') and 1 or -1).save()
+def vote_action(row, action, users, nodes, actions):
+    # Check to see if the vote has already been registered.
+    if not (action.user_id, action.node_id) in persisted_votes:
+        # Persist the vote action.
+        orm.Vote(user_id=action.user_id, node_id=action.node_id, action=action,
+                 voted_at=action.action_date, value=(action.action_type != 'votedown') and 1 or -1).save()
+
+        # Record the vote action. This will help us avoid duplicates.
+        persisted_votes.append((action.user_id, action.node_id))
+
 
 def state_action(state):
-    def fn(row, action):
+    def fn(row, action, users, nodes, actions):
+        if orm.NodeState.objects.filter(state_type = state, node = action.node_id).count():
+            return
+
         orm.NodeState(
             state_type = state,
             node_id = action.node_id,
@@ -395,25 +621,43 @@ post_action('publish')(state_action('published'))
 
 
 @post_action('flag')
-def flag_action(row, action):
-    orm.Flag(user_id=action.user_id, node_id=action.node_id, action=action, reason=action.extra).save()
+def flag_action(row, action, users, nodes, actions):
+    orm.Flag(user_id=action.user_id, node_id=action.node_id, action=action, reason=action.extra or "").save()
 
-def award_import_args(user):
-    return [ dict([ (b.cls, b) for b in orm.Badge.objects.all() ]) ]
+def award_import_args(user, data):
+    return [ dict([ (b.cls, b) for b in orm.Badge.objects.all() ]), data['nodes_map'], data['users_map'], data['actions_map']]
 
-@file_handler('awards.xml', 'awards', 'award', args_handler=award_import_args)
-def awards_import(row, badges):
+@file_handler('awards.xml', 'awards', 'award', _('Awards'), args_handler=award_import_args)
+def awards_import(row, badges, nodes, users, actions):
+    badge_type = badges.get(row.getc('badge'), None)
+
+    if not badge_type:
+        return
+
+    action = row.get('action').as_int(None)
+    trigger = row.get('trigger').as_int(None)
+    node = row.get('node').as_int(None)
+    user = row.get('user').as_int()
+
+    if orm.Award.objects.filter(badge=badge_type, user=users.get(user, user), node=nodes.get(node, node)).count():
+        return
+
     award = orm.Award(
-        user_id = row.get('user').as_int(),
-        badge = badges[row.getc('badge')],
-        node_id = row.get('node').as_int(None),
-        action_id = row.get('action').as_int(None),
-        trigger_id = row.get('trigger').as_int(None)
+        user_id = users.get(user, user),
+        badge = badge_type,
+        node_id = nodes.get(node, node),
+        action_id = actions.get(action, action),
+        trigger_id = actions.get(trigger, trigger)
    ).save()
 
+#@file_handler('settings.xml', 'settings', 'setting', _('Settings'))
+def settings_import(row):
+    orm.KeyValue(key=row.getc('key'), value=row.get('value').as_pickled()).save()
+
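
Usage sketch: how a caller might drive the new entry point this patch introduces and poll its progress. start_import(), CACHE_KEY and the cached state dict come from the code above; the module paths and helper names below are assumptions for illustration, not code from the repository.

    from django.core.cache import cache

    from forum_modules.exporter.exporter import CACHE_KEY      # assumed module path
    from forum_modules.exporter.importer import start_import   # assumed module path

    def run_merge_import(backup_fname, user):
        # tag_merge maps tag names in the dump onto existing tag names;
        # pass {} to keep every tag as-is.
        tag_merge = {'c#': 'csharp'}

        # Note: the patch expects the dump to already be extracted into
        # TMP_FOLDER -- the tarfile extraction in start_import() is commented out.
        start_import(backup_fname, tag_merge, user)

    def poll_import_state():
        # start_import() keeps a dict shaped like
        # {'running': ..., 'state': {...}, 'time_started': ...} in the cache,
        # updating it after every imported row via ping_state().
        return cache.get(CACHE_KEY)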