X-Git-Url: https://git.openstreetmap.org./osqa.git/blobdiff_plain/ae1bc67551ec58d2e419cffc3aaaf41f17dcf4cd..13bee2f2e6aea2d936909861f3ab60bb09821a60:/forum_modules/sximporter/importer.py diff --git a/forum_modules/sximporter/importer.py b/forum_modules/sximporter/importer.py index bc1cf9f..0f7f812 100644 --- a/forum_modules/sximporter/importer.py +++ b/forum_modules/sximporter/importer.py @@ -4,6 +4,8 @@ from xml.dom import minidom from datetime import datetime, timedelta import time import re +import os +import gc from django.utils.translation import ugettext as _ from django.template.defaultfilters import slugify from forum.models.utils import dbsafe_encode @@ -20,6 +22,51 @@ from copy import deepcopy from base64 import b64encode, b64decode from zlib import compress, decompress +from xml.sax import make_parser +from xml.sax.handler import ContentHandler + +class SXTableHandler(ContentHandler): + def __init__(self, fname, callback): + self.in_row = False + self.el_data = {} + self.ch_data = '' + + self.fname = fname.lower() + self.callback = callback + + def startElement(self, name, attrs): + if name.lower() == self.fname: + pass + elif name.lower() == "row": + self.in_row = True + + def characters(self, ch): + self.ch_data += ch + + def endElement(self, name): + if name.lower() == self.fname: + pass + elif name.lower() == "row": + self.callback(self.el_data) + + self.in_row = False + del self.el_data + self.el_data = {} + elif self.in_row: + self.el_data[name.lower()] = self.ch_data.strip() + del self.ch_data + self.ch_data = '' + + +def readTable(path, name, callback): + parser = make_parser() + handler = SXTableHandler(name, callback) + parser.setContentHandler(handler) + + f = os.path.join(path, "%s.xml" % name) + parser.parse(f) + + def dbsafe_encode(value): return force_unicode(b64encode(compress(dumps(deepcopy(value))))) @@ -38,11 +85,13 @@ def readTime(ts): return datetime(*time.strptime(ts, '%Y-%m-%dT%H:%M:%S')[0:6]) -def readEl(el): - return dict([(n.tagName.lower(), getText(n)) for n in el.childNodes if n.nodeType == el.ELEMENT_NODE]) +#def readEl(el): +# return dict([(n.tagName.lower(), getText(n)) for n in el.childNodes if n.nodeType == el.ELEMENT_NODE]) -def readTable(dump, name): - return [readEl(e) for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row')] +#def readTable(dump, name): +# for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row'): +# yield readEl(e) +#return [readEl(e) for e in minidom.parseString(dump.read("%s.xml" % name)).getElementsByTagName('row')] google_accounts_lookup = re.compile(r'^https?://www.google.com/accounts/') yahoo_accounts_lookup = re.compile(r'^https?://me.yahoo.com/a/') @@ -109,45 +158,59 @@ class IdMapper(dict): def __setitem__(self, key, value): super(IdMapper, self).__setitem__(int(key), int(value)) +class IdIncrementer(): + def __init__(self, initial): + self.value = initial + + def inc(self): + self.value += 1 + openidre = re.compile('^https?\:\/\/') -def userimport(dump, options): - users = readTable(dump, "Users") +def userimport(path, options): +#users = readTable(dump, "Users") - user_by_name = {} + usernames = [] + openids = set() uidmapper = IdMapper() - merged_users = [] + #merged_users = [] owneruid = options.get('owneruid', None) #check for empty values if not owneruid: owneruid = None - for sxu in users: + def callback(sxu): create = True if sxu.get('id') == '-1': - continue - + return + #print "\n".join(["%s : %s" % i for i in sxu.items()]) if int(sxu.get('id')) == int(owneruid): osqau = orm.User.objects.get(id=1) + for assoc in orm.AuthKeyUserAssociation.objects.filter(user=osqau): + openids.add(assoc.key) uidmapper[owneruid] = 1 uidmapper[-1] = 1 create = False else: - username = sxu.get('displayname', - sxu.get('displaynamecleaned', sxu.get('realname', final_username_attempt(sxu)))) + username = unicode(sxu.get('displayname', + sxu.get('displaynamecleaned', sxu.get('realname', final_username_attempt(sxu)))))[:30] - if not isinstance(username, UnknownUser) and username in user_by_name: + if username in usernames: #if options.get('mergesimilar', False) and sxu.get('email', 'INVALID') == user_by_name[username].email: # osqau = user_by_name[username] # create = False # uidmapper[sxu.get('id')] = osqau.id #else: - inc = 1 - while ("%s %d" % (username, inc)) in user_by_name: + inc = 0 + + while True: inc += 1 + totest = "%s %d" % (username[:29 - len(str(inc))], inc) - username = "%s %d" % (username, inc) + if not totest in usernames: + username = totest + break sxbadges = sxu.get('badgesummary', None) badges = {'1':'0', '2':'0', '3':'0'} @@ -158,7 +221,7 @@ def userimport(dump, options): if create: osqau = orm.User( id = sxu.get('id'), - username = unicode(username), + username = username, password = '!', email = sxu.get('email', ''), is_superuser = sxu.get('usertypeid') == '5', @@ -174,7 +237,7 @@ def userimport(dump, options): gold = int(badges['1']), silver = int(badges['2']), bronze = int(badges['3']), - real_name = sxu.get('realname', ''), + real_name = sxu.get('realname', '')[:30], location = sxu.get('location', ''), ) @@ -224,27 +287,36 @@ def userimport(dump, options): osqau.location = sxu.get('location', '') osqau.real_name = sxu.get('realname', '') - merged_users.append(osqau.id) + #merged_users.append(osqau.id) osqau.save() - user_by_name[osqau.username] = osqau + usernames.append(osqau.username) openid = sxu.get('openid', None) - if openid and openidre.match(openid): + if openid and openidre.match(openid) and (not openid in openids): assoc = orm.AuthKeyUserAssociation(user=osqau, key=openid, provider="openidurl") assoc.save() + openids.add(openid) + + openidalt = sxu.get('openidalt', None) + if openidalt and openidre.match(openidalt) and (not openidalt in openids): + assoc = orm.AuthKeyUserAssociation(user=osqau, key=openidalt, provider="openidurl") + assoc.save() + openids.add(openidalt) + + readTable(path, "Users", callback) if uidmapper[-1] == -1: uidmapper[-1] = 1 - return (uidmapper, merged_users) + return uidmapper def tagsimport(dump, uidmap): - tags = readTable(dump, "Tags") +#tags = readTable(dump, "Tags") tagmap = {} - for sxtag in tags: + def callback(sxtag): otag = orm.Tag( id = int(sxtag['id']), name = sxtag['name'], @@ -255,6 +327,8 @@ def tagsimport(dump, uidmap): tagmap[otag.name] = otag + readTable(dump, "Tags", callback) + return tagmap def add_post_state(name, post, action): @@ -280,19 +354,19 @@ def remove_post_state(name, post): post.state_string = "".join("(%s)" % s for s in re.findall('\w+', post.state_string) if s != name) def postimport(dump, uidmap, tagmap): - history = {} - accepted = {} - all = {} - - for h in readTable(dump, "PostHistory"): - if not history.get(h.get('postid'), None): - history[h.get('postid')] = [] +#history = {} +#accepted = {} + all = [] - history[h.get('postid')].append(h) + #for h in readTable(dump, "PostHistory"): + # if not history.get(h.get('postid'), None): + # history[h.get('postid')] = [] + # + # history[h.get('postid')].append(h) - posts = readTable(dump, "Posts") + #posts = readTable(dump, "Posts") - for sxpost in posts: + def callback(sxpost): nodetype = (sxpost.get('posttypeid') == '1') and "nodetype" or "answer" post = orm.Node( @@ -350,24 +424,31 @@ def postimport(dump, uidmap, tagmap): post.extra_count = sxpost.get('viewcount', 0) + add_tags_to_post(post, tagmap) + else: post.parent_id = sxpost['parentid'] post.save() - all[int(post.id)] = post + all.append(int(post.id)) + create_and_activate_revision(post) + + del post + + readTable(dump, "Posts", callback) return all def comment_import(dump, uidmap, posts): - comments = readTable(dump, "PostComments") - currid = max(posts.keys()) +#comments = readTable(dump, "PostComments") + currid = IdIncrementer(max(posts)) mapping = {} - for sxc in comments: - currid += 1 + def callback(sxc): + currid.inc() oc = orm.Node( - id = currid, + id = currid.value, node_type = "comment", added_at = readTime(sxc['creationdate']), author_id = uidmap[sxc.get('userid', 1)], @@ -400,23 +481,22 @@ def comment_import(dump, uidmap, posts): action_date = oc.added_at ) + create_and_activate_revision(oc) + create_action.save() oc.save() - posts[oc.id] = oc + posts.append(int(oc.id)) mapping[int(sxc['id'])] = int(oc.id) + readTable(dump, "PostComments", callback) return posts, mapping -def add_tags_to_posts(posts, tagmap): - for post in posts.values(): - if post.node_type == "question": - tags = [tag for tag in [tagmap.get(name.strip()) for name in post.tagnames.split(u' ') if name] if tag] - post.tagnames = " ".join([t.name for t in tags]).strip() - post.tags = tags - - create_and_activate_revision(post) +def add_tags_to_post(post, tagmap): + tags = [tag for tag in [tagmap.get(name.strip()) for name in post.tagnames.split(u' ') if name] if tag] + post.tagnames = " ".join([t.name for t in tags]).strip() + post.tags = tags def create_and_activate_revision(post): @@ -436,24 +516,29 @@ def create_and_activate_revision(post): post.save() def post_vote_import(dump, uidmap, posts): - votes = readTable(dump, "Posts2Votes") - close_reasons = dict([(r['id'], r['name']) for r in readTable(dump, "CloseReasons")]) +#votes = readTable(dump, "Posts2Votes") + close_reasons = {} + + def close_callback(r): + close_reasons[r['id']] = r['name'] + + readTable(dump, "CloseReasons", close_callback) user2vote = [] - for sxv in votes: + def callback(sxv): action = orm.Action( user_id=uidmap[sxv['userid']], action_date = readTime(sxv['creationdate']), ) - node = posts.get(int(sxv['postid']), None) - if not node: continue + if not int(sxv['postid']) in posts: return + node = orm.Node.objects.get(id=sxv['postid']) action.node = node if sxv['votetypeid'] == '1': answer = node - question = posts.get(int(answer.parent_id), None) + question = orm.Node.objects.get(id=answer.parent_id) action.action_type = "acceptanswer" action.save() @@ -557,12 +642,15 @@ def post_vote_import(dump, uidmap, posts): state = {"acceptanswer": "accepted", "delete": "deleted", "close": "closed"}[action.action_type] add_post_state(state, node, action) + readTable(dump, "Posts2Votes", callback) + -def comment_vote_import(dump, uidmap, comments, posts): - votes = readTable(dump, "Comments2Votes") +def comment_vote_import(dump, uidmap, comments): +#votes = readTable(dump, "Comments2Votes") user2vote = [] + comments2score = {} - for sxv in votes: + def callback(sxv): if sxv['votetypeid'] == "2": comment_id = comments[int(sxv['postcommentid'])] user_id = uidmap[sxv['userid']] @@ -588,14 +676,28 @@ def comment_vote_import(dump, uidmap, comments, posts): ov.save() - posts[int(action.node_id)].score += 1 - posts[int(action.node_id)].save() + if not comment_id in comments2score: + comments2score[comment_id] = 1 + else: + comments2score[comment_id] += 1 + + readTable(dump, "Comments2Votes", callback) + + for cid, score in comments2score.items(): + orm.Node.objects.filter(id=cid).update(score=score) def badges_import(dump, uidmap, post_list): - node_ctype = orm['contenttypes.contenttype'].objects.get(name='node') +#node_ctype = orm['contenttypes.contenttype'].objects.get(name='node') + + sxbadges = {} + + def sxcallback(b): + sxbadges[int(b['id'])] = b + + readTable(dump, "Badges", sxcallback) + obadges = dict([(b.cls, b) for b in orm.Badge.objects.all()]) - sxbadges = dict([(int(b['id']), b) for b in readTable(dump, "Badges")]) user_badge_count = {} sx_to_osqa = {} @@ -614,10 +716,9 @@ def badges_import(dump, uidmap, post_list): osqab.save() sx_to_osqa[id] = osqab - sxawards = readTable(dump, "Users2Badges") osqaawards = [] - for sxa in sxawards: + def callback(sxa): badge = sx_to_osqa[int(sxa['badgeid'])] user_id = uidmap[sxa['userid']] @@ -635,7 +736,7 @@ def badges_import(dump, uidmap, post_list): osqaa = orm.Award( user_id = uidmap[sxa['userid']], badge = badge, - node = post_list[user_badge_count[user_id]], + node_id = post_list[user_badge_count[user_id]], awarded_at = action.action_date, action = action ) @@ -644,15 +745,20 @@ def badges_import(dump, uidmap, post_list): badge.awarded_count += 1 user_badge_count[user_id] += 1 + readTable(dump, "Users2Badges", callback) + for badge in obadges.values(): badge.save() -def pages_import(dump): +def pages_import(dump, currid): + currid = IdIncrementer(currid) registry = {} - sx_pages = readTable(dump, "FlatPages") + #sx_pages = readTable(dump, "FlatPages") - for sxp in sx_pages: + def callback(sxp): + currid.inc() page = orm.Node( + id = currid.value, node_type = "page", title = sxp['name'], body = b64decode(sxp['value']), @@ -669,6 +775,8 @@ def pages_import(dump): author_id = 1 ) + create_and_activate_revision(page) + page.save() registry[sxp['url'][1:]] = page.id @@ -690,17 +798,20 @@ def pages_import(dump): pub_action.save() add_post_state("published", page, pub_action) + readTable(dump, "FlatPages", callback) + kv = orm.KeyValue(key='STATIC_PAGE_REGISTRY', value=dbsafe_encode(registry)) kv.save() sx2osqa_set_map = { u'theme.html.name': 'APP_TITLE', -u'theme.html.footer': 'USE_CUSTOM_FOOTER', +u'theme.html.footer': 'CUSTOM_FOOTER', u'theme.html.sidebar': 'SIDEBAR_UPPER_TEXT', u'theme.html.sidebar-low': 'SIDEBAR_LOWER_TEXT', u'theme.html.welcome': 'APP_INTRO', u'theme.html.head': 'CUSTOM_HEAD', -u'theme.html.header': 'CUSTOM_HEADER' +u'theme.html.header': 'CUSTOM_HEADER', +u'theme.css': 'CUSTOM_CSS', } html_codes = ( @@ -721,23 +832,42 @@ def html_decode(html): def static_import(dump): - sx_sets = readTable(dump, "ThemeTextResources") +#sx_sets = readTable(dump, "ThemeTextResources") sx_unknown = {} - for set in sx_sets: + def callback(set): if unicode(set['name']) in sx2osqa_set_map: - kv = orm.KeyValue( - key = sx2osqa_set_map[set['name']], - value = dbsafe_encode(html_decode(set['value'])) - ) + try: + kv = orm.KeyValue.objects.get(key=sx2osqa_set_map[set['name']]) + kv.value = dbsafe_encode(html_decode(set['value'])) + except: + kv = orm.KeyValue( + key = sx2osqa_set_map[set['name']], + value = dbsafe_encode(html_decode(set['value'])) + ) kv.save() else: sx_unknown[set['name']] = html_decode(set['value']) + readTable(dump, "ThemeTextResources", callback) + unknown = orm.KeyValue(key='SXIMPORT_UNKNOWN_SETS', value=dbsafe_encode(sx_unknown)) unknown.save() +def disable_triggers(): + from south.db import db + if db.backend_name == "postgres": + db.execute_many(PG_DISABLE_TRIGGERS) + db.commit_transaction() + db.start_transaction() + +def enable_triggers(): + from south.db import db + if db.backend_name == "postgres": + db.start_transaction() + db.execute_many(PG_ENABLE_TRIGGERS) + db.commit_transaction() def reset_sequences(): from south.db import db @@ -746,24 +876,104 @@ def reset_sequences(): db.execute_many(PG_SEQUENCE_RESETS) db.commit_transaction() +def reindex_fts(): + from south.db import db + if db.backend_name == "postgres": + db.start_transaction() + db.execute_many("UPDATE forum_noderevision set id = id WHERE TRUE;") + db.commit_transaction() + + def sximport(dump, options): - uidmap, merged_users = userimport(dump, options) + try: + disable_triggers() + triggers_disabled = True + except: + triggers_disabled = False + + uidmap = userimport(dump, options) tagmap = tagsimport(dump, uidmap) + gc.collect() + posts = postimport(dump, uidmap, tagmap) + gc.collect() + posts, comments = comment_import(dump, uidmap, posts) - add_tags_to_posts(posts, tagmap) + gc.collect() + post_vote_import(dump, uidmap, posts) - comment_vote_import(dump, uidmap, comments, posts) - badges_import(dump, uidmap, posts.values()) + gc.collect() + + comment_vote_import(dump, uidmap, comments) + gc.collect() + + badges_import(dump, uidmap, posts) - pages_import(dump) + pages_import(dump, max(posts)) static_import(dump) + gc.collect() from south.db import db db.commit_transaction() reset_sequences() + if triggers_disabled: + enable_triggers() + reindex_fts() + + +PG_DISABLE_TRIGGERS = """ +ALTER table auth_user DISABLE TRIGGER ALL; +ALTER table auth_user_groups DISABLE TRIGGER ALL; +ALTER table auth_user_user_permissions DISABLE TRIGGER ALL; +ALTER table forum_keyvalue DISABLE TRIGGER ALL; +ALTER table forum_action DISABLE TRIGGER ALL; +ALTER table forum_actionrepute DISABLE TRIGGER ALL; +ALTER table forum_subscriptionsettings DISABLE TRIGGER ALL; +ALTER table forum_validationhash DISABLE TRIGGER ALL; +ALTER table forum_authkeyuserassociation DISABLE TRIGGER ALL; +ALTER table forum_tag DISABLE TRIGGER ALL; +ALTER table forum_markedtag DISABLE TRIGGER ALL; +ALTER table forum_node DISABLE TRIGGER ALL; +ALTER table forum_nodestate DISABLE TRIGGER ALL; +ALTER table forum_node_tags DISABLE TRIGGER ALL; +ALTER table forum_noderevision DISABLE TRIGGER ALL; +ALTER table forum_node_tags DISABLE TRIGGER ALL; +ALTER table forum_questionsubscription DISABLE TRIGGER ALL; +ALTER table forum_vote DISABLE TRIGGER ALL; +ALTER table forum_flag DISABLE TRIGGER ALL; +ALTER table forum_badge DISABLE TRIGGER ALL; +ALTER table forum_award DISABLE TRIGGER ALL; +ALTER table forum_openidnonce DISABLE TRIGGER ALL; +ALTER table forum_openidassociation DISABLE TRIGGER ALL; +""" + +PG_ENABLE_TRIGGERS = """ +ALTER table auth_user ENABLE TRIGGER ALL; +ALTER table auth_user_groups ENABLE TRIGGER ALL; +ALTER table auth_user_user_permissions ENABLE TRIGGER ALL; +ALTER table forum_keyvalue ENABLE TRIGGER ALL; +ALTER table forum_action ENABLE TRIGGER ALL; +ALTER table forum_actionrepute ENABLE TRIGGER ALL; +ALTER table forum_subscriptionsettings ENABLE TRIGGER ALL; +ALTER table forum_validationhash ENABLE TRIGGER ALL; +ALTER table forum_authkeyuserassociation ENABLE TRIGGER ALL; +ALTER table forum_tag ENABLE TRIGGER ALL; +ALTER table forum_markedtag ENABLE TRIGGER ALL; +ALTER table forum_node ENABLE TRIGGER ALL; +ALTER table forum_nodestate ENABLE TRIGGER ALL; +ALTER table forum_node_tags ENABLE TRIGGER ALL; +ALTER table forum_noderevision ENABLE TRIGGER ALL; +ALTER table forum_node_tags ENABLE TRIGGER ALL; +ALTER table forum_questionsubscription ENABLE TRIGGER ALL; +ALTER table forum_vote ENABLE TRIGGER ALL; +ALTER table forum_flag ENABLE TRIGGER ALL; +ALTER table forum_badge ENABLE TRIGGER ALL; +ALTER table forum_award ENABLE TRIGGER ALL; +ALTER table forum_openidnonce ENABLE TRIGGER ALL; +ALTER table forum_openidassociation ENABLE TRIGGER ALL; +""" PG_SEQUENCE_RESETS = """ SELECT setval('"auth_user_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "auth_user"; @@ -783,8 +993,6 @@ SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") SELECT setval('"forum_noderevision_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_noderevision"; SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; SELECT setval('"forum_questionsubscription_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_questionsubscription"; -SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; -SELECT setval('"forum_node_tags_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_node_tags"; SELECT setval('"forum_vote_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_vote"; SELECT setval('"forum_flag_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_flag"; SELECT setval('"forum_badge_id_seq"', coalesce(max("id"), 1) + 2, max("id") IS NOT null) FROM "forum_badge"; @@ -795,4 +1003,4 @@ SELECT setval('"forum_openidassociation_id_seq"', coalesce(max("id"), 1) + 2, ma - \ No newline at end of file +