nodebb importer (#5842)

This commit is contained in:
Orlando Del Aguila 2018-05-18 03:12:40 -05:00 committed by Régis Hanol
parent 0800098f1a
commit 3287f47f07
2 changed files with 636 additions and 0 deletions

View File

@ -0,0 +1,527 @@
require_relative '../base.rb'
require_relative './redis'
class ImportScripts::NodeBB < ImportScripts::Base
# CHANGE THESE BEFORE RUNNING THE IMPORTER
# ATTACHMENT_DIR needs to be absolute, not relative path
# It stands in for the "/assets/uploads" prefix of the avatar, profile
# background and attachment paths handled by this importer.
ATTACHMENT_DIR = '/Users/orlando/www/orlando/NodeBB/public/uploads'
# Number of topics / posts requested from the NodeBB client per batch.
BATCH_SIZE = 2000
# Sets the importer up: runs the base-class initializer, connects the
# NodeBB Redis client and preloads the map of already merged main posts.
def initialize
  super

  # connection settings are handed to NodeBB::Redis unchanged
  connection = { host: "localhost", port: "6379", db: 0 }
  @client = NodeBB::Redis.new(connection)

  load_merged_posts
end
# Preloads the NodeBB posts that were already imported as the first post of
# a topic. import_topics records them in the 'import_merged_post_id' post
# custom field, and import_posts skips every NodeBB post id found here.
#
# Fills @merged_posts_map as { nodebb_post_id => discourse_post_id }, the
# same shape import_topics writes into the map.
def load_merged_posts
  puts 'loading merged posts with topics...'

  # Each custom field row already carries both values we need, so there is
  # no reason to load the Post behind every row just to read them back.
  @merged_posts_map = PostCustomField
    .where(name: 'import_merged_post_id')
    .pluck(:post_id, :value)
    .each_with_object({}) do |(post_id, nodebb_post_id), map|
      map[nodebb_post_id] = post_id
    end
end
# Runs every import phase in order. Groups, categories and users come
# first because the later phases look them up by their import ids; the
# attachment and post-processing passes run over the created posts last.
def execute
  phases = %i[
    import_groups
    import_categories
    import_users
    add_users_to_groups
    import_topics
    import_posts
    import_attachments
    post_process_posts
  ]

  # the value of the final phase is the method's result
  phases.map { |phase| send(phase) }.last
end
# Creates one Discourse group for every group returned by the NodeBB client.
#
# The NodeBB group name is used as the import id (add_users_to_groups looks
# groups up by it) and the NodeBB slug is used as the group name.
def import_groups
  puts '', 'importing groups'

  create_groups(@client.groups) do |group|
    {
      id: group["name"],
      name: group["slug"]
    }
  end
end
# Imports NodeBB categories as a two-level tree.
#
# Categories whose "parentCid" is "0" are created first as top level
# categories. Every other category is re-parented to its top level
# ancestor and then created as a child of it.
def import_categories
  puts "", "importing top level categories..."

  # { cid => category hash }
  category_map = @client.categories
  top_level_categories, children_categories =
    category_map.values.partition { |c| c["parentCid"] == "0" }

  create_categories(top_level_categories) do |category|
    {
      id: category["cid"],
      name: category["name"],
      position: category["order"],
      description: category["description"],
    }
  end

  puts "", "importing child categories..."

  top_level_category_ids = Set.new(top_level_categories.map { |c| c["cid"] })

  # cut down the tree to only 2 levels of categories
  children_categories.each do |cc|
    until top_level_category_ids.include?(cc["parentCid"])
      parent = category_map[cc["parentCid"]]

      # a dangling parent reference must not abort the whole import
      unless parent
        puts "Parent category #{cc["parentCid"]} of category #{cc["cid"]} not found, importing it without a known parent"
        break
      end

      cc["parentCid"] = parent["parentCid"]
    end
  end

  create_categories(children_categories) do |category|
    {
      id: category["cid"],
      name: category["name"],
      position: category["order"],
      description: category["description"],
      parent_category_id: category_id_from_imported_category_id(category["parentCid"])
    }
  end
end
# Imports every NodeBB user through create_users.
#
# Users without a username are skipped, users without an email receive a
# generated one, banned users are imported as suspended and members of the
# NodeBB "administrators" group are flagged as admins. Avatar and profile
# background are imported from the post_create_action callback.
def import_users
  puts "", "importing users"

  nodebb_users = @client.users
  total = nodebb_users.count

  # membership of this group decides who becomes admin
  administrators = @client.group("administrators")

  create_users(nodebb_users, total: total) do |user|
    # skip users without username
    next unless user["username"]

    # fake email for users without email
    email = user["email"].blank? ? fake_email : user["email"]

    # banned users are imported as suspended
    banned = user["banned"] == "1"
    suspended_at = Time.now if banned
    suspended_till = Time.now + 100.years if banned

    attach_profile_images = proc do |u|
      import_profile_picture(user, u)
      import_profile_background(user, u)
    end

    {
      id: user["uid"],
      name: user["fullname"],
      username: user["username"],
      email: email,
      admin: administrators["member_ids"].include?(user["uid"]),
      website: user["website"],
      location: user["location"],
      suspended_at: suspended_at,
      suspended_till: suspended_till,
      primary_group_id: group_id_from_imported_group_id(user["groupTitle"]),
      created_at: user["joindate"],
      custom_fields: {
        import_pass: user["password"]
      },
      post_create_action: attach_profile_images
    }
  end
end
# Imports the avatar of a NodeBB user and makes it the custom avatar of the
# already created Discourse user.
#
# old_user      - NodeBB user hash; its "picture" entry is either an
#                 absolute URL or a path below "/assets/uploads".
# imported_user - the Discourse user created for old_user.
#
# Returns nil without touching the user when there is no picture, the
# download times out, the local file is missing or the upload was not
# persisted.
def import_profile_picture(old_user, imported_user)
  picture = old_user["picture"]
  return if picture.blank?

  # URI#scheme is nil for site-relative paths, so a scheme means the picture
  # lives on another host and has to be downloaded
  if URI.parse(picture).scheme
    begin
      string_io = open(picture, read_timeout: 5)
    rescue Net::ReadTimeout
      puts "timeout downloading avatar for user #{imported_user.id}"
      return nil
    end

    # continue if download failed
    return unless string_io

    # try to get filename from headers; String#[] with a capture index gives
    # nil instead of raising when the header carries no filename
    disposition = string_io.meta["content-disposition"]
    filename = disposition[/filename=(\"?)(.+)\1/, 2] if disposition

    # try to get it from path
    filename ||= File.basename(picture)

    # can't determine filename, skip upload
    if filename.blank?
      puts "Can't determine filename, skipping avatar upload for user #{imported_user.id}"
      return
    end

    # write tmp file
    file = Tempfile.new(filename, encoding: 'ascii-8bit')
    file.write string_io.read
    file.rewind

    upload = UploadCreator.new(file, filename).create_for(imported_user.id)
  else
    # remove "/assets/uploads/" from attachment
    picture = picture.gsub("/assets/uploads", "")
    filepath = File.join(ATTACHMENT_DIR, picture)
    filename = File.basename(picture)

    unless File.exist?(filepath)
      puts "Avatar file doesn't exist: #{filename}"
      return nil
    end

    upload = create_upload(imported_user.id, filepath, filename)
  end

  # NOTE(review): assumes a failed upload shows up as nil or as an
  # unpersisted record; confirm against create_upload / UploadCreator
  return unless upload && upload.persisted?

  imported_user.create_user_avatar
  imported_user.user_avatar.update(custom_upload_id: upload.id)
  imported_user.update(uploaded_avatar_id: upload.id)
ensure
  # release the download stream and delete the temporary copy
  string_io.close if string_io
  if file
    file.close
    file.unlink
  end
end
# Imports the profile cover of a NodeBB user as the profile background of
# the already created Discourse user.
#
# old_user      - NodeBB user hash; its "cover:url" entry is either an
#                 absolute URL or a path below "/assets/uploads".
# imported_user - the Discourse user created for old_user.
#
# Returns nil without touching the profile when there is no cover, the
# download times out, the local file is missing or the upload was not
# persisted.
def import_profile_background(old_user, imported_user)
  picture = old_user["cover:url"]
  return if picture.blank?

  # URI#scheme is nil for site-relative paths, so a scheme means the cover
  # lives on another host and has to be downloaded
  if URI.parse(picture).scheme
    begin
      string_io = open(picture, read_timeout: 5)
    rescue Net::ReadTimeout
      puts "timeout downloading background for user #{imported_user.id}"
      return nil
    end

    # continue if download failed
    return unless string_io

    # try to get filename from headers; String#[] with a capture index gives
    # nil instead of raising when the header carries no filename
    disposition = string_io.meta["content-disposition"]
    filename = disposition[/filename=(\"?)(.+)\1/, 2] if disposition

    # try to get it from path
    filename ||= File.basename(picture)

    # can't determine filename, skip upload
    if filename.blank?
      puts "Can't determine filename, skipping background upload for user #{imported_user.id}"
      return
    end

    # write tmp file
    file = Tempfile.new(filename, encoding: 'ascii-8bit')
    file.write string_io.read
    file.rewind

    upload = UploadCreator.new(file, filename).create_for(imported_user.id)
  else
    # remove "/assets/uploads/" from attachment
    picture = picture.gsub("/assets/uploads", "")
    filepath = File.join(ATTACHMENT_DIR, picture)
    filename = File.basename(picture)

    unless File.exist?(filepath)
      puts "Background file doesn't exist: #{filename}"
      return nil
    end

    upload = create_upload(imported_user.id, filepath, filename)
  end

  # NOTE(review): assumes a failed upload shows up as nil or as an
  # unpersisted record; confirm against create_upload / UploadCreator
  return unless upload && upload.persisted?

  imported_user.user_profile.update(profile_background: upload.url)
ensure
  # release the download stream and delete the temporary copy
  string_io.close if string_io
  if file
    file.close
    file.unlink
  end
end
# Adds the imported users to the imported groups as members and owners.
#
# A group is handled once: after its users were added the group gets the
# 'import_users_added' custom field and is left alone on later runs.
def add_users_to_groups
  puts "", "adding users to groups..."

  # fetch the groups once and reuse the list for counting and iterating
  groups = @client.groups
  total_count = groups.count
  progress_count = 0
  start_time = Time.now

  groups.each do |group|
    dgroup = find_group_by_import_id(group["name"])

    if dgroup.nil?
      puts "Group #{group["name"]} was not imported, skipping"
    elsif !dgroup.custom_fields['import_users_added']
      # users that were not imported have no Discourse id, leave them out
      group_member_ids = group["member_ids"].map { |uid| user_id_from_imported_user_id(uid) }.compact
      group_owner_ids = group["owner_ids"].map { |uid| user_id_from_imported_user_id(uid) }.compact

      # add members
      dgroup.bulk_add(group_member_ids)

      # reload group
      dgroup.reload

      # add owners
      User.find(group_owner_ids).each { |owner| dgroup.add_owner(owner) }

      dgroup.custom_fields['import_users_added'] = true
      dgroup.save
    end

    # groups that were skipped count as handled too
    progress_count += 1
    print_status(progress_count, total_count, start_time)
  end
end
# Imports NodeBB topics in batches of BATCH_SIZE.
#
# Each topic is created from its main post under the import id "t<tid>".
# Deleted topics are reported and skipped. After a topic is created, the
# NodeBB id of its main post is stored on the new post and in
# @merged_posts_map so import_posts can leave that post out.
def import_topics
  puts "", "importing topics..."

  total = @client.topic_count

  batches(BATCH_SIZE) do |offset|
    batch = @client.topics(offset, BATCH_SIZE)
    break if batch.size < 1

    create_posts(batch, total: total, offset: offset) do |topic|
      # skip if is deleted
      if topic["deleted"] == "1"
        puts "Topic with id #{topic["tid"]} was deleted, skipping"
        next
      end

      import_id = "t#{topic["tid"]}"
      body = topic["mainpost"]["content"]
      author_id = user_id_from_imported_user_id(topic["uid"]) || Discourse::SYSTEM_USER_ID
      created_at = topic["timestamp"]

      # keep track of the main post to use in import_posts
      remember_main_post = proc do |p|
        p.custom_fields["import_merged_post_id"] = topic["mainPid"]
        p.save
        @merged_posts_map[topic["mainPid"]] = p.id
      end

      attributes = {
        id: import_id,
        user_id: author_id,
        title: topic["title"],
        category: category_id_from_imported_category_id(topic["cid"]),
        raw: body,
        created_at: created_at,
        views: topic["viewcount"],
        closed: topic["locked"] == "1",
        post_create_action: remember_main_post
      }

      # pinned topics are pinned as of their creation time
      topic["pinned"] == "1" ? attributes.merge(pinned_at: created_at) : attributes
    end
  end
end
# Imports NodeBB posts in batches of BATCH_SIZE under the import id
# "p<pid>".
#
# Posts already imported as the main post of a topic, deleted posts, empty
# posts and posts whose topic was not imported are skipped. Upvotes are
# turned into likes once the post exists, and "toPid" is translated into
# reply_to_post_number when the parent post can be resolved.
def import_posts
  puts "", "importing posts..."

  post_count = @client.post_count

  batches(BATCH_SIZE) do |offset|
    posts = @client.posts(offset, BATCH_SIZE)
    break if posts.size < 1

    create_posts(posts, total: post_count, offset: offset) do |post|
      # skip if it's merged_post
      next if @merged_posts_map[post["pid"]]

      # skip if it's deleted
      next if post["deleted"] == "1"

      raw = post["content"]
      next if raw.blank?

      topic = topic_lookup_from_imported_post_id("t#{post["tid"]}")
      unless topic
        puts "Topic with id #{post["tid"]} not found, skipping"
        next
      end

      data = {
        id: "p#{post["pid"]}",
        user_id: user_id_from_imported_user_id(post["uid"]) || Discourse::SYSTEM_USER_ID,
        topic_id: topic[:topic_id],
        raw: raw,
        created_at: post["timestamp"],
        post_create_action: proc do |p|
          post["upvoted_by"].each do |upvoter_id|
            user = User.new
            user.id = user_id_from_imported_user_id(upvoter_id) || Discourse::SYSTEM_USER_ID

            begin
              PostAction.act(user, p, PostActionType.types[:like])
            rescue PostAction::AlreadyActed
            end
          end
        end
      }

      to_pid = post["toPid"]
      if to_pid
        # toPid is a NodeBB *post* id, so it must never be looked up in the
        # "t<tid>" namespace: that would match an unrelated topic that
        # happens to share the number.
        parent = topic_lookup_from_imported_post_id("p#{to_pid}")

        # Main posts are imported as part of their topic, not as "p<pid>".
        # NOTE(review): assumes a reply to a main post targets the main
        # post of its own topic; confirm for moved or forked posts.
        parent ||= topic if @merged_posts_map[to_pid]

        # a post number only identifies a post within its own topic
        if parent && parent[:topic_id] == topic[:topic_id]
          data[:reply_to_post_number] = parent[:post_number]
        else
          puts "Post with id #{to_pid} not found for reply"
        end
      end

      data
    end
  end
end
# Runs postprocess_post over every post and saves the posts whose raw text
# changed, flagging them with the 'import_post_processing' custom field.
# Posts that already carry the flag are left alone. Progress is reported
# for every post, processed or not.
def post_process_posts
  puts "", "Postprocessing posts..."

  processed = 0
  total = Post.count
  started_at = Time.now

  Post.find_each do |post|
    begin
      unless post.custom_fields['import_post_processing']
        rewritten = postprocess_post(post)

        unless rewritten == post.raw
          post.raw = rewritten
          post.custom_fields['import_post_processing'] = true
          post.save
        end
      end
    ensure
      # report even when the post was skipped or processing raised
      processed += 1
      print_status(processed, total, started_at)
    end
  end
end
# Replaces markdown links to NodeBB uploads ("/assets/uploads/files/...")
# in every post with the HTML of a freshly created Discourse upload.
#
# The file is read from ATTACHMENT_DIR. Links whose file is missing, or
# whose upload could not be created, are left as they are. Posts whose raw
# text changed are revised without bumping the topic.
def import_attachments
  puts '', 'importing attachments...'

  current = 0
  max = Post.count
  start_time = Time.now

  Post.find_each do |post|
    current += 1
    print_status(current, max, start_time)

    # [^\]]* and [^)]* keep each match inside a single link, so several
    # attachments on one line are handled one by one
    new_raw = post.raw.gsub(/\[[^\]]*\]\((\/assets\/uploads\/files\/[^)]*)\)/) do
      image_md = Regexp.last_match(0)
      upload_path = Regexp.last_match(1)
      filepath = upload_path.gsub("/assets/uploads", ATTACHMENT_DIR)

      # if file exists
      # upload attachment and return html for it
      if File.exist?(filepath)
        filename = File.basename(filepath)
        upload = create_upload(post.user_id, filepath, filename)

        # NOTE(review): assumes a failed upload shows up as nil or as an
        # unpersisted record; confirm against create_upload
        if upload && upload.persisted?
          html_for_upload(upload, filename)
        else
          puts "Upload of #{filepath} failed for post #{post.id}, upload will be broken"
          image_md
        end
      else
        puts "File with path #{filepath} not found for post #{post.id}, upload will be broken"
        image_md
      end
    end

    if new_raw != post.raw
      PostRevisor.new(post).revise!(post.user, { raw: new_raw }, bypass_bump: true, edit_reason: 'Import attachments from NodeBB')
    end
  end
end
# Rewrites NodeBB specific markup in a post and returns the new raw text.
# The post itself is not modified.
#
# * "[text](/post/:id)" and "[text](/topic/:id...)" links are pointed at
#   the URL of the imported post / topic; links that cannot be resolved
#   keep their text and point to "/404".
# * dashes in "@user-name" mentions become underscores.
def postprocess_post(post)
  raw = post.raw

  # NodeBB path segment => prefix of the import id used by this importer
  { "post" => "p", "topic" => "t" }.each do |path, prefix|
    # [^\]]* and [^)]* keep each match inside a single markdown link
    raw = raw.gsub(/\[([^\]]*)\]\(\/#{path}\/(\d+)[^)]*\)/) do
      text = Regexp.last_match(1)
      imported_id = Regexp.last_match(2)

      topic_lookup = topic_lookup_from_imported_post_id("#{prefix}#{imported_id}")
      url = topic_lookup ? topic_lookup[:url] : "/404"
      "[#{text}](#{url})"
    end
  end

  # @username with dash to underscore; the "@" is part of the match, so it
  # has to be written back to keep the mention
  raw.gsub(/@([a-zA-Z0-9-]+)/) { "@#{Regexp.last_match(1).tr('-', '_')}" }
end
# Builds a random placeholder address for users imported without an email.
def fake_email
  "#{SecureRandom.hex}@domain.com"
end
end
# Script entry point: build the importer and start it. #perform is not
# defined in this file; presumably it is inherited from ImportScripts::Base
# and ends up calling #execute (verify in ../base.rb).
ImportScripts::NodeBB.new.perform

View File

@ -0,0 +1,109 @@
require 'redis'
module NodeBB
  # Read-only view of the data a NodeBB forum keeps in Redis.
  #
  # Records are returned as the plain string-keyed hashes stored in Redis,
  # with the timestamp fields listed on each method converted to UTC Time
  # objects.
  class Redis
    attr_reader :redis

    # params - connection options, handed to ::Redis.new unchanged.
    def initialize(params)
      @redis = ::Redis.new(params)
    end

    # Returns every group listed in 'groups:visible:createtime'.
    def groups
      redis.zrange('groups:visible:createtime', 0, -1).map { |name| group(name) }
    end

    # Returns the group hash for id, with "createtime" converted and the
    # "member_ids" / "owner_ids" lists added.
    def group(id)
      group = redis.hgetall("group:#{id}")
      group["createtime"] = timestamp_to_date(group["createtime"])
      group["member_ids"] = redis.zrange("group:#{id}:members", 0, -1)
      group["owner_ids"] = redis.smembers("group:#{id}:owners")
      group
    end

    # Returns every user listed in 'users:joindate'.
    def users
      redis.zrange('users:joindate', 0, -1).map { |uid| user(uid) }
    end

    # Returns the user hash for id with "joindate" and "lastonline"
    # converted.
    def user(id)
      user = redis.hgetall("user:#{id}")
      user["joindate"] = timestamp_to_date(user["joindate"])
      user["lastonline"] = timestamp_to_date(user["lastonline"])
      user
    end

    # Returns all categories as { cid => category hash }.
    def categories
      redis.zrange('categories:cid', 0, -1).each_with_object({}) do |category_key, categories|
        category = redis.hgetall("category:#{category_key}")
        categories[category['cid']] = category
      end
    end

    # Returns up to page_size topics, starting at the zero-based position
    # offset of 'topics:tid'.
    def topics(offset = 0, page_size = 2000)
      page('topics:tid', offset, page_size).map { |tid| topic(tid) }
    end

    # Returns the topic hash for id with "lastposttime" and "timestamp"
    # converted and its main post embedded under "mainpost".
    def topic(id)
      topic = redis.hgetall("topic:#{id}")
      topic["lastposttime"] = timestamp_to_date(topic["lastposttime"])
      topic["timestamp"] = timestamp_to_date(topic["timestamp"])
      topic["mainpost"] = post(topic["mainPid"])
      topic
    end

    # Number of entries in 'topics:tid'.
    def topic_count
      redis.zcard('topics:tid')
    end

    # Returns up to page_size posts, starting at the zero-based position
    # offset of 'posts:pid'.
    def posts(offset = 0, page_size = 2000)
      page('posts:pid', offset, page_size).map { |pid| post(pid) }
    end

    # Returns the post hash for id with "timestamp" converted and the ids
    # of its upvoters under "upvoted_by".
    def post(id)
      post = redis.hgetall("post:#{id}")
      post["timestamp"] = timestamp_to_date(post["timestamp"])
      post["upvoted_by"] = redis.smembers("pid:#{id}:upvote")
      post
    end

    # Number of entries in 'posts:pid'.
    def post_count
      redis.zcard('posts:pid')
    end

    private

    # Returns the members of the sorted set key at positions
    # offset ... offset + page_size.
    #
    # ZRANGE treats both bounds as inclusive, so the last wanted position is
    # offset + page_size - 1. Consecutive pages requested with offsets
    # 0, page_size, 2 * page_size, ... then neither overlap nor leave gaps.
    def page(key, offset, page_size)
      # a stop of -1 would mean "up to the last element"
      return [] if page_size < 1

      redis.zrange(key, offset, offset + page_size - 1)
    end

    # Converts a timestamp string to a UTC Time by dropping its last three
    # characters and reading the rest as epoch seconds; nil stays nil.
    # NOTE(review): assumes the stored values are epoch milliseconds.
    def timestamp_to_date(createtime)
      Time.at(createtime[0..-4].to_i).utc if createtime
    end
  end
end