Importer for nabble

This commit is contained in:
Robin Ward 2015-05-22 15:40:26 -04:00
parent 3677c56267
commit ab50d039bb
2 changed files with 150 additions and 1 deletions

View File

@ -24,8 +24,9 @@ module Email
attr_reader :body, :email_log attr_reader :body, :email_log
def initialize(raw) def initialize(raw, opts=nil)
@raw = raw @raw = raw
@opts = opts || {}
end end
def process def process
@ -135,6 +136,8 @@ module Email
body = fix_charset message body = fix_charset message
end end
return body if @opts[:skip_sanity_check]
# Certain trigger phrases that means we didn't parse correctly # Certain trigger phrases that means we didn't parse correctly
if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/ if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
raise EmptyEmailError raise EmptyEmailError

View File

@ -0,0 +1,146 @@
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'pg'
class ImportScripts::MyAskBot < ImportScripts::Base
# CHANGE THESE BEFORE RUNNING THE IMPORTER
BATCH_SIZE = 1000
DB_NAME = "nabble"
CATEGORY_ID = 6
def initialize
super
@tagmap = []
@td = PG::TextDecoder::TimestampWithTimeZone.new
@client = PG.connect(dbname: DB_NAME)
end
def execute
import_users
create_forum_topics
import_replies
end
def import_users
puts "", "importing users"
total_count = @client.exec("SELECT COUNT(user_id) FROM user_")[0]["count"]
batches(BATCH_SIZE) do |offset|
users = @client.query(<<-SQL
SELECT user_id, name, email, joined
FROM user_
ORDER BY joined
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
)
break if users.ntuples() < 1
create_users(users, total: total_count, offset: offset) do |user|
{
id: user["user_id"],
email: user["email"] || (SecureRandom.hex << "@domain.com"),
created_at: Time.zone.at(@td.decode(user["joined"])),
name: user["name"]
}
end
end
end
def parse_email(msg)
receiver = Email::Receiver.new(msg, skip_sanity_check: true)
mail = Mail.read_from_string(msg)
receiver.parse_body(mail)
end
def create_forum_topics
puts "", "creating forum topics"
app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]['node_id']
topic_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id = #{app_node_id}")[0]["count"]
batches(BATCH_SIZE) do |offset|
topics = @client.exec <<-SQL
SELECT n.node_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
FROM node AS n
INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
WHERE n.parent_id = #{app_node_id}
ORDER BY n.when_created
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if topics.ntuples() < 1
create_posts(topics, total: topic_count, offset: offset) do |t|
raw = body_from(t)
next unless raw
{ id: t['node_id'],
title: t['subject'],
user_id: user_id_from_imported_user_id(t["owner_id"]) || Discourse::SYSTEM_USER_ID,
created_at: Time.zone.at(@td.decode(t["when_created"])),
category: CATEGORY_ID,
raw: raw }
end
end
end
def body_from(p)
%w(m s).include?(p['msg_fmt']) ? parse_email(p['message']) : p['message']
rescue Email::Receiver::EmptyEmailError
puts "Skipped #{p['node_id']}"
end
def import_replies
puts "", "creating topic replies"
app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]['node_id']
post_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id != #{app_node_id}")[0]["count"]
topic_ids = {}
batches(BATCH_SIZE) do |offset|
posts = @client.exec <<-SQL
SELECT n.node_id, n.parent_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
FROM node AS n
INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
WHERE n.parent_id != #{app_node_id}
ORDER BY n.when_created
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if posts.ntuples() < 1
create_posts(posts, total: post_count, offset: offset) do |p|
parent_id = p['parent_id']
id = p['node_id']
topic_id = topic_ids[parent_id]
unless topic_id
topic = topic_lookup_from_imported_post_id(parent_id)
topic_id = topic[:topic_id] if topic
end
next unless topic_id
topic_ids[id] = topic_id
raw = body_from(p)
next unless raw
{ id: id,
topic_id: topic_id,
user_id: user_id_from_imported_user_id(p['owner_id']) || Discourse::SYSTEM_USER_ID,
created_at: Time.zone.at(@td.decode(p["when_created"])),
raw: raw }
end
end
end
end
ImportScripts::MyAskBot.new.perform