X-Git-Url: https://git.openstreetmap.org./osqa.git/blobdiff_plain/a9b491958beed0da30e5371ac87c0c850b039315..46b868a3f5ffa8f693808cb832306286802124f4:/forum_modules/sximporter/importer.py?ds=sidebyside diff --git a/forum_modules/sximporter/importer.py b/forum_modules/sximporter/importer.py index efc2867..6669b8c 100644 --- a/forum_modules/sximporter/importer.py +++ b/forum_modules/sximporter/importer.py @@ -1,13 +1,11 @@ # -*- coding: utf-8 -*- -from xml.dom import minidom -from datetime import datetime, timedelta +from datetime import datetime import time import re +import os +import gc from django.utils.translation import ugettext as _ -from django.template.defaultfilters import slugify -from forum.models.utils import dbsafe_encode -from orm import orm from django.utils.encoding import force_unicode @@ -20,6 +18,78 @@ from copy import deepcopy from base64 import b64encode, b64decode from zlib import compress, decompress +from xml.sax import make_parser +from xml.sax.handler import ContentHandler + +def create_orm(): + from django.conf import settings + from south.orm import FakeORM + + get_migration_number_re = re.compile(r'^((\d+)_.*)\.py$') + + migrations_folder = os.path.join(settings.SITE_SRC_ROOT, 'forum/migrations') + + highest_number = 0 + highest_file = None + + for f in os.listdir(migrations_folder): + if os.path.isfile(os.path.join(migrations_folder, f)): + m = get_migration_number_re.match(f) + + if m: + found = int(m.group(2)) + + if found > highest_number: + highest_number = found + highest_file = m.group(1) + + mod = __import__('forum.migrations.%s' % highest_file, globals(), locals(), ['forum.migrations']) + return FakeORM(getattr(mod, 'Migration'), "forum") + +orm = create_orm() + +class SXTableHandler(ContentHandler): + def __init__(self, fname, callback): + self.in_row = False + self.el_data = {} + self.ch_data = '' + + self.fname = fname.lower() + self.callback = callback + + def startElement(self, name, attrs): + if name.lower() == self.fname: + pass + elif name.lower() == "row": + self.in_row = True + + def characters(self, ch): + self.ch_data += ch + + def endElement(self, name): + if name.lower() == self.fname: + pass + elif name.lower() == "row": + self.callback(self.el_data) + + self.in_row = False + del self.el_data + self.el_data = {} + elif self.in_row: + self.el_data[name.lower()] = self.ch_data.strip() + del self.ch_data + self.ch_data = '' + + +def readTable(path, name, callback): + parser = make_parser() + handler = SXTableHandler(name, callback) + parser.setContentHandler(handler) + + f = os.path.join(path, "%s.xml" % name) + parser.parse(f) + + def dbsafe_encode(value): return force_unicode(b64encode(compress(dumps(deepcopy(value))))) @@ -38,11 +108,13 @@ def readTime(ts): return datetime(*time.strptime(ts, '%Y-%m-%dT%H:%M:%S')[0:6]) -def readEl(el): - return dict([(n.tagName.lower(), getText(n)) for n in el.childNodes if n.nodeType == el.ELEMENT_NODE]) +#def readEl(el): +# return dict([(n.tagName.lower(), getText(n)) for n in el.childNodes if n.nodeType == el.ELEMENT_NODE]) -def readTable(dump, name): - return [readEl(e) for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row')] +#def readTable(dump, name): +# for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row'): +# yield readEl(e) +#return [readEl(e) for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row')] google_accounts_lookup = re.compile(r'^https?://www.google.com/accounts/') yahoo_accounts_lookup = re.compile(r'^https?://me.yahoo.com/a/') @@ -102,52 +174,60 @@ class UnknownYahooUser(UnknownUser): class IdMapper(dict): + + def __init__(self): + self.default = 1 + def __getitem__(self, key): key = int(key) - return super(IdMapper, self).get(key, 1) + return super(IdMapper, self).get(key, self.default) def __setitem__(self, key, value): super(IdMapper, self).__setitem__(int(key), int(value)) +class IdIncrementer(): + def __init__(self, initial): + self.value = initial + + def inc(self): + self.value += 1 + openidre = re.compile('^https?\:\/\/') -def userimport(dump, options): - users = readTable(dump, "Users") +def userimport(path, options): - user_by_name = {} + usernames = [] + openids = set() uidmapper = IdMapper() - merged_users = [] + authenticated_user = options.get('authenticated_user', None) owneruid = options.get('owneruid', None) #check for empty values if not owneruid: owneruid = None + else: + owneruid = int(owneruid) - for sxu in users: + def callback(sxu): create = True + set_mapper_defaults = False if sxu.get('id') == '-1': - continue + return + #print "\n".join(["%s : %s" % i for i in sxu.items()]) - if int(sxu.get('id')) == int(owneruid): - osqau = orm.User.objects.get(id=1) - uidmapper[owneruid] = 1 - uidmapper[-1] = 1 - create = False - else: - username = sxu.get('displayname', - sxu.get('displaynamecleaned', sxu.get('realname', final_username_attempt(sxu)))) + if (owneruid and (int(sxu.get('id')) == owneruid)) or ( + (not owneruid) and len(uidmapper)): - if not isinstance(username, UnknownUser) and username in user_by_name: - #if options.get('mergesimilar', False) and sxu.get('email', 'INVALID') == user_by_name[username].email: - # osqau = user_by_name[username] - # create = False - # uidmapper[sxu.get('id')] = osqau.id - #else: - inc = 1 - while ("%s %d" % (username, inc)) in user_by_name: - inc += 1 + set_mapper_defaults = True - username = "%s %d" % (username, inc) + if authenticated_user: + osqau = orm.User.objects.get(id=authenticated_user.id) + + for assoc in orm.AuthKeyUserAssociation.objects.filter(user=osqau): + openids.add(assoc.key) + + uidmapper[owneruid] = osqau.id + create = False sxbadges = sxu.get('badgesummary', None) badges = {'1':'0', '2':'0', '3':'0'} @@ -156,9 +236,28 @@ def userimport(dump, options): badges.update(dict([b.split('=') for b in sxbadges.split()])) if create: + username = unicode(sxu.get('displayname', + sxu.get('displaynamecleaned', sxu.get('realname', final_username_attempt(sxu)))))[:30] + + if username in usernames: + #if options.get('mergesimilar', False) and sxu.get('email', 'INVALID') == user_by_name[username].email: + # osqau = user_by_name[username] + # create = False + # uidmapper[sxu.get('id')] = osqau.id + #else: + inc = 0 + + while True: + inc += 1 + totest = "%s %d" % (username[:29 - len(str(inc))], inc) + + if not totest in usernames: + username = totest + break + osqau = orm.User( id = sxu.get('id'), - username = unicode(username), + username = username, password = '!', email = sxu.get('email', ''), is_superuser = sxu.get('usertypeid') == '5', @@ -174,7 +273,7 @@ def userimport(dump, options): gold = int(badges['1']), silver = int(badges['2']), bronze = int(badges['3']), - real_name = sxu.get('realname', ''), + real_name = sxu.get('realname', '')[:30], location = sxu.get('location', ''), ) @@ -224,27 +323,39 @@ def userimport(dump, options): osqau.location = sxu.get('location', '') osqau.real_name = sxu.get('realname', '') - merged_users.append(osqau.id) + #merged_users.append(osqau.id) osqau.save() - user_by_name[osqau.username] = osqau + if set_mapper_defaults: + uidmapper[-1] = osqau.id + uidmapper.default = osqau.id + + usernames.append(osqau.username) openid = sxu.get('openid', None) - if openid and openidre.match(openid): + if openid and openidre.match(openid) and (not openid in openids): assoc = orm.AuthKeyUserAssociation(user=osqau, key=openid, provider="openidurl") assoc.save() + openids.add(openid) + + openidalt = sxu.get('openidalt', None) + if openidalt and openidre.match(openidalt) and (not openidalt in openids): + assoc = orm.AuthKeyUserAssociation(user=osqau, key=openidalt, provider="openidurl") + assoc.save() + openids.add(openidalt) - if uidmapper[-1] == -1: - uidmapper[-1] = 1 + readTable(path, "Users", callback) - return (uidmapper, merged_users) + #if uidmapper[-1] == -1: + # uidmapper[-1] = 1 + + return uidmapper def tagsimport(dump, uidmap): - tags = readTable(dump, "Tags") tagmap = {} - for sxtag in tags: + def callback(sxtag): otag = orm.Tag( id = int(sxtag['id']), name = sxtag['name'], @@ -255,6 +366,8 @@ def tagsimport(dump, uidmap): tagmap[otag.name] = otag + readTable(dump, "Tags", callback) + return tagmap def add_post_state(name, post, action): @@ -280,19 +393,9 @@ def remove_post_state(name, post): post.state_string = "".join("(%s)" % s for s in re.findall('\w+', post.state_string) if s != name) def postimport(dump, uidmap, tagmap): - history = {} - accepted = {} all = {} - for h in readTable(dump, "PostHistory"): - if not history.get(h.get('postid'), None): - history[h.get('postid')] = [] - - history[h.get('postid')].append(h) - - posts = readTable(dump, "Posts") - - for sxpost in posts: + def callback(sxpost): nodetype = (sxpost.get('posttypeid') == '1') and "nodetype" or "answer" post = orm.Node( @@ -350,29 +453,40 @@ def postimport(dump, uidmap, tagmap): post.extra_count = sxpost.get('viewcount', 0) + add_tags_to_post(post, tagmap) + all[int(post.id)] = int(post.id) + else: post.parent_id = sxpost['parentid'] + post.abs_parent_id = sxpost['parentid'] + all[int(post.id)] = int(sxpost['parentid']) post.save() - all[int(post.id)] = post + create_and_activate_revision(post) + + del post + + readTable(dump, "Posts", callback) return all -def comment_import(dump, uidmap, posts): - comments = readTable(dump, "PostComments") - currid = max(posts.keys()) +def comment_import(dump, uidmap, absparent_map): + posts = absparent_map.keys() + + currid = IdIncrementer(max(posts)) mapping = {} - for sxc in comments: - currid += 1 + def callback(sxc): + currid.inc() oc = orm.Node( - id = currid, + id = currid.value, node_type = "comment", added_at = readTime(sxc['creationdate']), author_id = uidmap[sxc.get('userid', 1)], body = sxc['text'], parent_id = sxc.get('postid'), + abs_parent_id = absparent_map.get(int(sxc.get('postid')), sxc.get('postid')) ) if sxc.get('deletiondate', None): @@ -400,23 +514,22 @@ def comment_import(dump, uidmap, posts): action_date = oc.added_at ) + create_and_activate_revision(oc) + create_action.save() oc.save() - posts[oc.id] = oc + posts.append(int(oc.id)) mapping[int(sxc['id'])] = int(oc.id) + readTable(dump, "PostComments", callback) return posts, mapping -def add_tags_to_posts(posts, tagmap): - for post in posts.values(): - if post.node_type == "question": - tags = [tag for tag in [tagmap.get(name.strip()) for name in post.tagnames.split(u' ') if name] if tag] - post.tagnames = " ".join([t.name for t in tags]).strip() - post.tags = tags - - create_and_activate_revision(post) +def add_tags_to_post(post, tagmap): + tags = [tag for tag in [tagmap.get(name.strip()) for name in post.tagnames.split(u' ') if name] if tag] + post.tagnames = " ".join([t.name for t in tags]).strip() + post.tags = tags def create_and_activate_revision(post): @@ -436,24 +549,28 @@ def create_and_activate_revision(post): post.save() def post_vote_import(dump, uidmap, posts): - votes = readTable(dump, "Posts2Votes") - close_reasons = dict([(r['id'], r['name']) for r in readTable(dump, "CloseReasons")]) + close_reasons = {} + + def close_callback(r): + close_reasons[r['id']] = r['name'] + + readTable(dump, "CloseReasons", close_callback) user2vote = [] - for sxv in votes: + def callback(sxv): action = orm.Action( user_id=uidmap[sxv['userid']], action_date = readTime(sxv['creationdate']), ) - node = posts.get(int(sxv['postid']), None) - if not node: continue + if not int(sxv['postid']) in posts: return + node = orm.Node.objects.get(id=sxv['postid']) action.node = node if sxv['votetypeid'] == '1': answer = node - question = posts.get(int(answer.parent_id), None) + question = orm.Node.objects.get(id=answer.parent_id) action.action_type = "acceptanswer" action.save() @@ -557,12 +674,14 @@ def post_vote_import(dump, uidmap, posts): state = {"acceptanswer": "accepted", "delete": "deleted", "close": "closed"}[action.action_type] add_post_state(state, node, action) + readTable(dump, "Posts2Votes", callback) + -def comment_vote_import(dump, uidmap, comments, posts): - votes = readTable(dump, "Comments2Votes") +def comment_vote_import(dump, uidmap, comments): user2vote = [] + comments2score = {} - for sxv in votes: + def callback(sxv): if sxv['votetypeid'] == "2": comment_id = comments[int(sxv['postcommentid'])] user_id = uidmap[sxv['userid']] @@ -588,14 +707,27 @@ def comment_vote_import(dump, uidmap, comments, posts): ov.save() - posts[int(action.node_id)].score += 1 - posts[int(action.node_id)].save() + if not comment_id in comments2score: + comments2score[comment_id] = 1 + else: + comments2score[comment_id] += 1 + + readTable(dump, "Comments2Votes", callback) + + for cid, score in comments2score.items(): + orm.Node.objects.filter(id=cid).update(score=score) def badges_import(dump, uidmap, post_list): - node_ctype = orm['contenttypes.contenttype'].objects.get(name='node') + + sxbadges = {} + + def sxcallback(b): + sxbadges[int(b['id'])] = b + + readTable(dump, "Badges", sxcallback) + obadges = dict([(b.cls, b) for b in orm.Badge.objects.all()]) - sxbadges = dict([(int(b['id']), b) for b in readTable(dump, "Badges")]) user_badge_count = {} sx_to_osqa = {} @@ -614,10 +746,9 @@ def badges_import(dump, uidmap, post_list): osqab.save() sx_to_osqa[id] = osqab - sxawards = readTable(dump, "Users2Badges") osqaawards = [] - for sxa in sxawards: + def callback(sxa): badge = sx_to_osqa[int(sxa['badgeid'])] user_id = uidmap[sxa['userid']] @@ -635,24 +766,39 @@ def badges_import(dump, uidmap, post_list): osqaa = orm.Award( user_id = uidmap[sxa['userid']], badge = badge, - node = post_list[user_badge_count[user_id]], + node_id = post_list[user_badge_count[user_id]], awarded_at = action.action_date, action = action ) osqaa.save() badge.awarded_count += 1 + user_badge_count[user_id] += 1 + readTable(dump, "Users2Badges", callback) + for badge in obadges.values(): badge.save() -def pages_import(dump): +def save_setting(k, v): + try: + kv = orm.KeyValue.objects.get(key=k) + kv.value = v + except: + kv = orm.KeyValue(key = k, value = v) + + kv.save() + + +def pages_import(dump, currid, owner): + currid = IdIncrementer(currid) registry = {} - sx_pages = readTable(dump, "FlatPages") - for sxp in sx_pages: + def callback(sxp): + currid.inc() page = orm.Node( + id = currid.value, node_type = "page", title = sxp['name'], body = b64decode(sxp['value']), @@ -666,9 +812,11 @@ def pages_import(dump): 'sidebar_render': "html", 'comments': False }), - author_id = 1 + author_id = owner ) + create_and_activate_revision(page) + page.save() registry[sxp['url'][1:]] = page.id @@ -690,17 +838,19 @@ def pages_import(dump): pub_action.save() add_post_state("published", page, pub_action) - kv = orm.KeyValue(key='STATIC_PAGE_REGISTRY', value=dbsafe_encode(registry)) - kv.save() + readTable(dump, "FlatPages", callback) + + save_setting('STATIC_PAGE_REGISTRY', dbsafe_encode(registry)) sx2osqa_set_map = { u'theme.html.name': 'APP_TITLE', -u'theme.html.footer': 'USE_CUSTOM_FOOTER', +u'theme.html.footer': 'CUSTOM_FOOTER', u'theme.html.sidebar': 'SIDEBAR_UPPER_TEXT', u'theme.html.sidebar-low': 'SIDEBAR_LOWER_TEXT', u'theme.html.welcome': 'APP_INTRO', u'theme.html.head': 'CUSTOM_HEAD', -u'theme.html.header': 'CUSTOM_HEADER' +u'theme.html.header': 'CUSTOM_HEADER', +u'theme.css': 'CUSTOM_CSS', } html_codes = ( @@ -721,23 +871,31 @@ def html_decode(html): def static_import(dump): - sx_sets = readTable(dump, "ThemeTextResources") sx_unknown = {} - for set in sx_sets: + def callback(set): if unicode(set['name']) in sx2osqa_set_map: - kv = orm.KeyValue( - key = sx2osqa_set_map[set['name']], - value = dbsafe_encode(html_decode(set['value'])) - ) - - kv.save() + save_setting(sx2osqa_set_map[set['name']], dbsafe_encode(html_decode(set['value']))) else: sx_unknown[set['name']] = html_decode(set['value']) - unknown = orm.KeyValue(key='SXIMPORT_UNKNOWN_SETS', value=dbsafe_encode(sx_unknown)) - unknown.save() + readTable(dump, "ThemeTextResources", callback) + save_setting('SXIMPORT_UNKNOWN_SETS', dbsafe_encode(sx_unknown)) + +def disable_triggers(): + from south.db import db + if db.backend_name == "postgres": + db.execute_many(PG_DISABLE_TRIGGERS) + db.commit_transaction() + db.start_transaction() + +def enable_triggers(): + from south.db import db + if db.backend_name == "postgres": + db.start_transaction() + db.execute_many(PG_ENABLE_TRIGGERS) + db.commit_transaction() def reset_sequences(): from south.db import db @@ -746,24 +904,104 @@ def reset_sequences(): db.execute_many(PG_SEQUENCE_RESETS) db.commit_transaction() +def reindex_fts(): + from south.db import db + if db.backend_name == "postgres": + db.start_transaction() + db.execute_many("UPDATE forum_noderevision set id = id WHERE TRUE;") + db.commit_transaction() + + def sximport(dump, options): -#uidmap, merged_users = userimport(dump, options) -#tagmap = tagsimport(dump, uidmap) -#posts = postimport(dump, uidmap, tagmap) -#posts, comments = comment_import(dump, uidmap, posts) -#add_tags_to_posts(posts, tagmap) -#post_vote_import(dump, uidmap, posts) -#comment_vote_import(dump, uidmap, comments, posts) -#badges_import(dump, uidmap, posts.values()) - - pages_import(dump) - #static_import(dump) + try: + disable_triggers() + triggers_disabled = True + except: + triggers_disabled = False + + uidmap = userimport(dump, options) + tagmap = tagsimport(dump, uidmap) + gc.collect() + + posts = postimport(dump, uidmap, tagmap) + gc.collect() + + posts, comments = comment_import(dump, uidmap, posts) + gc.collect() + + post_vote_import(dump, uidmap, posts) + gc.collect() + + comment_vote_import(dump, uidmap, comments) + gc.collect() + + badges_import(dump, uidmap, posts) + + pages_import(dump, max(posts), uidmap.default) + static_import(dump) + gc.collect() from south.db import db db.commit_transaction() reset_sequences() + if triggers_disabled: + enable_triggers() + reindex_fts() + + +PG_DISABLE_TRIGGERS = """ +ALTER table auth_user DISABLE TRIGGER ALL; +ALTER table auth_user_groups DISABLE TRIGGER ALL; +ALTER table auth_user_user_permissions DISABLE TRIGGER ALL; +ALTER table forum_keyvalue DISABLE TRIGGER ALL; +ALTER table forum_action DISABLE TRIGGER ALL; +ALTER table forum_actionrepute DISABLE TRIGGER ALL; +ALTER table forum_subscriptionsettings DISABLE TRIGGER ALL; +ALTER table forum_validationhash DISABLE TRIGGER ALL; +ALTER table forum_authkeyuserassociation DISABLE TRIGGER ALL; +ALTER table forum_tag DISABLE TRIGGER ALL; +ALTER table forum_markedtag DISABLE TRIGGER ALL; +ALTER table forum_node DISABLE TRIGGER ALL; +ALTER table forum_nodestate DISABLE TRIGGER ALL; +ALTER table forum_node_tags DISABLE TRIGGER ALL; +ALTER table forum_noderevision DISABLE TRIGGER ALL; +ALTER table forum_node_tags DISABLE TRIGGER ALL; +ALTER table forum_questionsubscription DISABLE TRIGGER ALL; +ALTER table forum_vote DISABLE TRIGGER ALL; +ALTER table forum_flag DISABLE TRIGGER ALL; +ALTER table forum_badge DISABLE TRIGGER ALL; +ALTER table forum_award DISABLE TRIGGER ALL; +ALTER table forum_openidnonce DISABLE TRIGGER ALL; +ALTER table forum_openidassociation DISABLE TRIGGER ALL; +""" + +PG_ENABLE_TRIGGERS = """ +ALTER table auth_user ENABLE TRIGGER ALL; +ALTER table auth_user_groups ENABLE TRIGGER ALL; +ALTER table auth_user_user_permissions ENABLE TRIGGER ALL; +ALTER table forum_keyvalue ENABLE TRIGGER ALL; +ALTER table forum_action ENABLE TRIGGER ALL; +ALTER table forum_actionrepute ENABLE TRIGGER ALL; +ALTER table forum_subscriptionsettings ENABLE TRIGGER ALL; +ALTER table forum_validationhash ENABLE TRIGGER ALL; +ALTER table forum_authkeyuserassociation ENABLE TRIGGER ALL; +ALTER table forum_tag ENABLE TRIGGER ALL; +ALTER table forum_markedtag ENABLE TRIGGER ALL; +ALTER table forum_node ENABLE TRIGGER ALL; +ALTER table forum_nodestate ENABLE TRIGGER ALL; +ALTER table forum_node_tags ENABLE TRIGGER ALL; +ALTER table forum_noderevision ENABLE TRIGGER ALL; +ALTER table forum_node_tags ENABLE TRIGGER ALL; +ALTER table forum_questionsubscription ENABLE TRIGGER ALL; +ALTER table forum_vote ENABLE TRIGGER ALL; +ALTER table forum_flag ENABLE TRIGGER ALL; +ALTER table forum_badge ENABLE TRIGGER ALL; +ALTER table forum_award ENABLE TRIGGER ALL; +ALTER table forum_openidnonce ENABLE TRIGGER ALL; +ALTER table forum_openidassociation ENABLE TRIGGER ALL; +""" PG_SEQUENCE_RESETS = """ SELECT setval('"auth_user_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "auth_user"; @@ -783,8 +1021,6 @@ SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") SELECT setval('"forum_noderevision_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_noderevision"; SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; SELECT setval('"forum_questionsubscription_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_questionsubscription"; -SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; -SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; SELECT setval('"forum_vote_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_vote"; SELECT setval('"forum_flag_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_flag"; SELECT setval('"forum_badge_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_badge"; @@ -795,4 +1031,4 @@ SELECT setval('"forum_openidassociation_id_seq"', coalesce(max("id"), 1) + 2, ma - \ No newline at end of file +