add custom importer for sfn.org

This commit is contained in:
Régis Hanol 2015-04-14 18:16:42 +02:00
parent 869d8e25ad
commit 32e02411bd
1 changed file with 269 additions and 0 deletions

View File

@@ -0,0 +1,269 @@
# custom importer for www.sfn.org, feel free to borrow ideas
require 'mysql2'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::Sfn < ImportScripts::Base
# Page size for the batched imports: passed to `batches` and interpolated as
# the SQL LIMIT of every paged query in this file.
BATCH_SIZE = 1000
# No importer-specific state is set up here; construction is delegated
# entirely to the parent class.
def initialize
  super()
end
# Runs the import phases in dependency order: users first, then categories,
# then topics, then replies. Returns whatever the last phase returns.
def execute
  phases = [:import_users, :import_categories, :import_topics, :import_posts]
  phases.map { |phase| send(phase) }.last
end
# Imports every community member that has a usable e-mail address (more than
# 5 characters), one BATCH_SIZE page at a time, and attaches the stored
# profile image (when present) as the new user's avatar.
def import_users
  puts "", "importing users..."

  user_count = mysql_query <<-SQL
    SELECT COUNT(DISTINCT cm.ContactKey) AS "count"
    FROM CommunityMember cm
    LEFT JOIN EgroupSubscription es ON es.ContactKey = cm.ContactKey
    WHERE LENGTH(COALESCE(es.EmailAddr_, "")) > 5
  SQL
  user_count = user_count.first["count"]

  batches(BATCH_SIZE) do |offset|
    # Order by real columns (with the key as a tie-breaker): in MySQL's default
    # SQL mode a double-quoted name in ORDER BY is a string literal, i.e. no
    # ordering at all, which makes LIMIT/OFFSET paging skip or repeat rows.
    users = mysql_query <<-SQL
      SELECT cm.ContactKey AS "id",
      cm.InvitedOn AS "created_at",
      es.EmailAddr_ AS "email",
      es.FullName_ AS "name",
      c.Bio AS "bio",
      c.ProfileImage AS "avatar"
      FROM CommunityMember cm
      LEFT JOIN EgroupSubscription es ON es.ContactKey = cm.ContactKey
      LEFT JOIN Contact c ON c.ContactKey = cm.ContactKey
      WHERE LENGTH(COALESCE(es.EmailAddr_, "")) > 5
      GROUP BY cm.ContactKey
      ORDER BY cm.InvitedOn, cm.ContactKey
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    # an empty page means every row has been consumed
    break if users.size < 1

    create_users(users, total: user_count, offset: offset) do |user|
      {
        id: user["id"],
        name: user["name"],
        email: user["email"],
        bio_raw: user["bio"],
        created_at: user["created_at"],
        # invoked with the freshly created user; uploads the avatar, if any
        post_create_action: proc do |newuser|
          next if user["avatar"].blank?

          avatar = Tempfile.new("sfn-avatar")
          begin
            avatar.write(user["avatar"].encode("ASCII-8BIT").force_encoding("UTF-8"))
            avatar.rewind
            upload = Upload.create_for(newuser.id, avatar, "avatar.jpg", avatar.size)
            if upload.persisted?
              newuser.create_user_avatar
              newuser.user_avatar.update(custom_upload_id: upload.id)
              newuser.update(uploaded_avatar_id: upload.id)
            end
          ensure
            # always remove the temp file, even when the upload raised;
            # cleanup itself stays best-effort
            begin
              avatar.close!
            rescue StandardError
              nil
            end
          end
        end
      }
    end
  end
end
# Names of the categories created by import_categories. Each name is used
# both as the imported id and as the category name. Frozen so the shared
# constant cannot be modified by accident.
NEW_CATEGORIES = [
  "Abstract Topic Matching Forum",
  "Animals in Research",
  "Brain Awareness and Teaching",
  "Career Advice",
  "Career Paths",
  "Diversity",
  "Early Career Policy Advocates",
  "LATP Associates",
  "LATP Fellows",
  "Mid and Advanced Career",
  "Neurobiology of Disease Workshop",
  "Neuroscience 2015",
  "Neuroscience Scholars Program",
  "NSP Associates",
  "NSP Fellows",
  "Outreach",
  "Postdocs and Early Career",
  "Program Committee",
  "Program Development",
  "Roommate Matching Forum",
  "Scientific Research",
  "Students",
].freeze
# EgroupKey => New Category Name
# Several source eGroups collapse into one new category. Messages whose
# EgroupKey is missing from this table are skipped by import_topics.
# Frozen so the lookup table cannot be modified by accident.
CATEGORY_MAPPING = {
  "{DE10E4F4-621A-48BF-9B45-05D9F774A590}" => "Abstract Topic Matching Forum",
  "{3FFC1217-1576-4D38-BB81-D6CADC7FB793}" => "Animals in Research",
  "{9362BB21-BF6C-4E55-A3E0-18CD5D9F3323}" => "Brain Awareness and Teaching",
  "{3AC01B09-A21F-4166-95DA-0E585E271075}" => "Brain Awareness and Teaching",
  "{C249728D-8C9E-4138-AA49-D02467C28EAD}" => "Career Advice",
  "{01570B85-0124-478F-A8B9-B028BD1B1F2F}" => "Career Paths",
  "{2A430528-278A-46CD-BE1A-07CFA1122919}" => "Diversity",
  "{2F211345-3C19-43C9-90B5-27BA9FCD4DB0}" => "Diversity",
  "{8092297D-8DF4-404A-8BEB-4D5D0DC6A191}" => "Early Career Policy Advocates",
  "{8CB58762-D562-448C-9AF1-8DAE6C482C9B}" => "LATP Associates",
  "{CDF80A92-925A-46DD-A867-8558FA72D016}" => "LATP Fellows",
  "{E71E237B-7C23-4596-AECA-655BD8ED50DB}" => "Mid and Advanced Career",
  "{1D674C38-17CB-4C48-826A-D465AC3F8948}" => "Neurobiology of Disease Workshop",
  "{3D4F885B-0037-403B-83DD-62FAA8E81DF1}" => "Neuroscience 2015",
  "{9ACC3B40-E4A3-4FFD-AADC-C8403EB6231D}" => "Neuroscience 2015",
  "{9FC30FFB-E450-4361-8844-0266C3D96868}" => "Neuroscience Scholars Program",
  "{3E78123E-87CE-435E-B4B7-7DAB1A21C541}" => "NSP Associates",
  "{12D889D3-5CFD-49D5-93E4-32AAB2CFFCDA}" => "NSP Fellows",
  "{FA86D79E-170E-4F53-8F1C-942CB3FFB19E}" => "Outreach",
  "{D7041C64-3D32-4010-B3D8-71858323CB4A}" => "Outreach",
  "{69B76913-4E23-4C80-A11E-9CDB4130722E}" => "Outreach",
  "{774878EA-96AD-49F5-9D29-105AEA488007}" => "Outreach",
  "{E6349704-FD01-41B1-9C59-68E928DD4318}" => "Postdocs and Early Career",
  "{31CF5944-2567-4E79-9730-18EEC23E5B52}" => "Postdocs and Early Career",
  "{5625C403-AFAE-4323-A470-33FC32B12B53}" => "Program Committee",
  "{8415D871-54F5-4128-B099-E5A376A6B41B}" => "Program Development",
  "{B4DF2044-47AB-4329-8BF7-0D832CAB402C}" => "Roommate Matching Forum",
  "{6A3A12B9-5C72-472F-97AC-F34983674960}" => "Scientific Research",
  "{2CF635E9-4866-451C-A4F2-E2A8A80FED54}" => "Scientific Research",
  "{CF2DDCCE-737F-499D-AFE4-E5C36F195C8B}" => "Scientific Research",
  "{282B48D7-AC1D-453E-9806-3C6CE6830EF9}" => "Scientific Research",
  "{6D750CAF-E96F-4AD1-A45B-7B74FDFF0B40}" => "Scientific Research",
  "{10AF5D45-BEB3-4F07-BE77-0BAB6910DE10}" => "Scientific Research",
  "{18D7F624-26D1-44B9-BF33-AB5C5A2AB2BF}" => "Scientific Research",
  "{6016FF4F-D834-4888-BA03-F9FE8CB1D4CC}" => "Scientific Research",
  "{B0290A37-EA39-4CB8-B6CB-3E0B7EF6D036}" => "Scientific Research",
  "{97CC60D0-B93A-43FF-BB48-366FAAEE2BAC}" => "Scientific Research",
  "{8FC9B57B-2755-4FC5-90E8-CCDB56CF2F66}" => "Scientific Research",
  "{57C8BF37-357E-4FE6-952D-906248642792}" => "Scientific Research",
  "{7B2A3B63-BC2C-4219-830C-BA1DECB33337}" => "Scientific Research",
  "{0ED1D205-0E48-48D2-B82B-3CE80C6C553F}" => "Scientific Research",
  "{10355962-D172-4294-AA8E-1BC381B67971}" => "Scientific Research",
  "{C84B0222-5232-4B94-9FB8-DDF802241171}" => "Scientific Research",
  "{9143F984-0D67-46CB-AAAF-7FE3B6335E07}" => "Scientific Research",
  "{1392DC10-37A0-46A6-9979-4568D0224C5F}" => "Scientific Research",
  "{E4891409-0F4F-4151-B550-ECE53655E231}" => "Scientific Research",
  "{9613BAC2-229B-4563-9E1C-35C31CDDCE2F}" => "Students",
}.freeze
# Creates one category per entry of NEW_CATEGORIES; the category name doubles
# as its imported id.
def import_categories
  puts ""
  puts "importing categories..."

  create_categories(NEW_CATEGORIES) { |name| { id: name, name: name } }
end
# Imports every approved top-level message (ParentId_ = 0) as a topic, one
# BATCH_SIZE page at a time. Messages whose eGroup has no entry in
# CATEGORY_MAPPING are skipped; messages whose author is unknown are
# attributed to the system user.
def import_topics
  puts "", "importing topics..."

  topic_count = mysql_query <<-SQL
    SELECT COUNT(MessageID_) AS "count"
    FROM EgroupMessages
    WHERE ParentId_ = 0
    AND ApprovedRejectedPendingInd = "Approved"
  SQL
  topic_count = topic_count.first["count"]

  batches(BATCH_SIZE) do |offset|
    # Order by real columns (with the key as a tie-breaker): in MySQL's default
    # SQL mode a double-quoted name in ORDER BY is a string literal, i.e. no
    # ordering at all, which makes LIMIT/OFFSET paging skip or repeat rows.
    topics = mysql_query <<-SQL
      SELECT MessageID_ AS "id",
      EgroupKey AS "category_id",
      ContactKey AS "user_id",
      HdrSubject_ AS "title",
      Body_ AS "raw",
      CreatStamp_ AS "created_at"
      FROM EgroupMessages
      WHERE ParentId_ = 0
      AND ApprovedRejectedPendingInd = "Approved"
      ORDER BY CreatStamp_, MessageID_
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    # an empty page means every row has been consumed
    break if topics.size < 1

    create_posts(topics, total: topic_count, offset: offset) do |topic|
      # returning nil from the block skips messages of unmapped eGroups
      category_name = CATEGORY_MAPPING[topic["category_id"]]
      next unless category_name

      {
        id: topic["id"],
        category: category_id_from_imported_category_id(category_name),
        user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse::SYSTEM_USER_ID,
        # to_s: a NULL subject must not abort the whole import with NoMethodError
        title: topic["title"].to_s[0..250],
        raw: cleanup_raw(topic["raw"]),
        created_at: topic["created_at"],
      }
    end
  end
end
# Imports every approved reply (ParentId_ > 0) as a post in the topic created
# for its parent message, one BATCH_SIZE page at a time. Replies whose parent
# was not imported are skipped; replies whose author is unknown are
# attributed to the system user.
def import_posts
  puts "", "importing posts..."

  posts_count = mysql_query <<-SQL
    SELECT COUNT(MessageID_) AS "count"
    FROM EgroupMessages
    WHERE ParentId_ > 0
    AND ApprovedRejectedPendingInd = "Approved"
  SQL
  posts_count = posts_count.first["count"]

  batches(BATCH_SIZE) do |offset|
    # Order by real columns (with the key as a tie-breaker): in MySQL's default
    # SQL mode a double-quoted name in ORDER BY is a string literal, i.e. no
    # ordering at all, which makes LIMIT/OFFSET paging skip or repeat rows.
    posts = mysql_query <<-SQL
      SELECT MessageID_ AS "id",
      ContactKey AS "user_id",
      ParentID_ AS "topic_id",
      Body_ AS "raw",
      CreatStamp_ AS "created_at"
      FROM EgroupMessages
      WHERE ParentId_ > 0
      AND ApprovedRejectedPendingInd = "Approved"
      ORDER BY CreatStamp_, MessageID_
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    # an empty page means every row has been consumed
    break if posts.size < 1

    create_posts(posts, total: posts_count, offset: offset) do |post|
      # returning nil from the block skips replies whose parent is unknown
      parent = topic_lookup_from_imported_post_id(post["topic_id"])
      next unless parent

      {
        id: post["id"],
        topic_id: parent[:topic_id],
        user_id: user_id_from_imported_user_id(post["user_id"]) || Discourse::SYSTEM_USER_ID,
        raw: cleanup_raw(post["raw"]),
        created_at: post["created_at"],
      }
    end
  end
end
# Turns the body of an imported eGroup message into plain post text.
#
# raw - the message body (String); nil is treated as an empty body.
#
# Returns a new String. The argument is never modified, so frozen strings
# and row values that are read again later are safe to pass in.
def cleanup_raw(raw)
  raw.to_s
    # fix some html
    .gsub(/<br\s*\/?>/i, "\n")
    # remove "This message has been cross posted to the following eGroups: ..."
    .gsub(/^This message has been cross posted to the following eGroups: .+\n-{3,}/i, "")
    # remove signatures: everything from the first run of 3+ dashes onwards
    .gsub(/-{3,}.+/m, "")
    # strip leading/trailing whitespaces
    .strip
end
# Runs +sql+ against the source database and returns the client's result.
# The connection (user "root", database "sfn") is opened on first use and
# reused for every later query.
def mysql_query(sql)
  unless @client
    @client = Mysql2::Client.new(username: "root", database: "sfn")
  end

  @client.query(sql)
end
end
# Instantiate the importer and start it as soon as this script is loaded.
# NOTE(review): `perform` is not defined in this file — presumably inherited
# from ImportScripts::Base; confirm it is what ends up calling `execute`.
ImportScripts::Sfn.new.perform