VBulletin5 importer improvements (#9477)

- no more hard coded contenttypes
- permalinks for topics, categories, subcategories
- better uploads handling
- tag support
This commit is contained in:
discoursehosting 2020-04-22 22:04:59 +02:00 committed by GitHub
parent 9cbbaf4237
commit 094ddb1c1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 198 additions and 104 deletions

View File

@ -6,14 +6,19 @@ require 'htmlentities'
class ImportScripts::VBulletin < ImportScripts::Base class ImportScripts::VBulletin < ImportScripts::Base
BATCH_SIZE = 1000 BATCH_SIZE = 1000
DBPREFIX = "vb_"
ROOT_NODE = 2 ROOT_NODE = 2
# CHANGE THESE BEFORE RUNNING THE IMPORTER
DATABASE = "yourforum"
TIMEZONE = "America/Los_Angeles" TIMEZONE = "America/Los_Angeles"
ATTACHMENT_DIR = '/home/discourse/yourforum/customattachments/'
AVATAR_DIR = '/home/discourse/yourforum/avatars/' # override these using environment vars
URL_PREFIX ||= ENV['URL_PREFIX'] || "forum/"
DB_PREFIX ||= ENV['DB_PREFIX'] || "vb_"
DB_HOST ||= ENV['DB_HOST'] || "localhost"
DB_NAME ||= ENV['DB_NAME'] || "vbulletin"
DB_PASS ||= ENV['DB_PASS'] || "password"
DB_USER ||= ENV['DB_USER'] || "username"
ATTACH_DIR ||= ENV['ATTACH_DIR'] || "/home/discourse/vbulletin/attach"
AVATAR_DIR ||= ENV['AVATAR_DIR'] || "/home/discourse/vbulletin/avatars"
def initialize def initialize
super super
@ -25,12 +30,15 @@ class ImportScripts::VBulletin < ImportScripts::Base
@htmlentities = HTMLEntities.new @htmlentities = HTMLEntities.new
@client = Mysql2::Client.new( @client = Mysql2::Client.new(
host: "localhost", host: DB_HOST,
username: "root", username: DB_USER,
database: DATABASE, database: DB_NAME,
password: "password" password: DB_PASS
) )
@forum_typeid = mysql_query("SELECT contenttypeid FROM #{DB_PREFIX}contenttype WHERE class='Forum'").first['contenttypeid']
@channel_typeid = mysql_query("SELECT contenttypeid FROM #{DB_PREFIX}contenttype WHERE class='Channel'").first['contenttypeid']
@text_typeid = mysql_query("SELECT contenttypeid FROM #{DB_PREFIX}contenttype WHERE class='Text'").first['contenttypeid']
end end
def execute def execute
@ -40,8 +48,10 @@ class ImportScripts::VBulletin < ImportScripts::Base
import_topics import_topics
import_posts import_posts
import_attachments import_attachments
import_tags
close_topics close_topics
post_process_posts post_process_posts
create_permalinks
end end
def import_groups def import_groups
@ -49,7 +59,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
groups = mysql_query <<-SQL groups = mysql_query <<-SQL
SELECT usergroupid, title SELECT usergroupid, title
FROM #{DBPREFIX}usergroup FROM #{DB_PREFIX}usergroup
ORDER BY usergroupid ORDER BY usergroupid
SQL SQL
@ -64,7 +74,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_users def import_users
puts "", "importing users" puts "", "importing users"
user_count = mysql_query("SELECT COUNT(userid) count FROM #{DBPREFIX}user").first["count"] user_count = mysql_query("SELECT COUNT(userid) count FROM #{DB_PREFIX}user").first["count"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
users = mysql_query <<-SQL users = mysql_query <<-SQL
@ -73,8 +83,8 @@ class ImportScripts::VBulletin < ImportScripts::Base
WHEN u.scheme='legacy' THEN REPLACE(token, ' ', ':') WHEN u.scheme='legacy' THEN REPLACE(token, ' ', ':')
END AS password, END AS password,
IF(ug.title = 'Administrators', 1, 0) AS admin IF(ug.title = 'Administrators', 1, 0) AS admin
FROM #{DBPREFIX}user u FROM #{DB_PREFIX}user u
LEFT JOIN #{DBPREFIX}usergroup ug ON ug.usergroupid = u.usergroupid LEFT JOIN #{DB_PREFIX}usergroup ug ON ug.usergroupid = u.usergroupid
ORDER BY userid ORDER BY userid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -101,7 +111,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
post_create_action: proc do |u| post_create_action: proc do |u|
@old_username_to_new_usernames[user["username"]] = u.username @old_username_to_new_usernames[user["username"]] = u.username
import_profile_picture(user, u) import_profile_picture(user, u)
import_profile_background(user, u) # import_profile_background(user, u)
end end
} }
end end
@ -111,7 +121,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_profile_picture(old_user, imported_user) def import_profile_picture(old_user, imported_user)
query = mysql_query <<-SQL query = mysql_query <<-SQL
SELECT filedata, filename SELECT filedata, filename
FROM #{DBPREFIX}customavatar FROM #{DB_PREFIX}customavatar
WHERE userid = #{old_user["userid"]} WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC ORDER BY dateline DESC
LIMIT 1 LIMIT 1
@ -148,7 +158,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_profile_background(old_user, imported_user) def import_profile_background(old_user, imported_user)
query = mysql_query <<-SQL query = mysql_query <<-SQL
SELECT filedata, filename SELECT filedata, filename
FROM #{DBPREFIX}customprofilepic FROM #{DB_PREFIX}customprofilepic
WHERE userid = #{old_user["userid"]} WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC ORDER BY dateline DESC
LIMIT 1 LIMIT 1
@ -176,13 +186,13 @@ class ImportScripts::VBulletin < ImportScripts::Base
puts "", "importing top level categories..." puts "", "importing top level categories..."
categories = mysql_query("SELECT nodeid AS forumid, title, description, displayorder, parentid categories = mysql_query("SELECT nodeid AS forumid, title, description, displayorder, parentid
FROM #{DBPREFIX}node FROM #{DB_PREFIX}node
WHERE parentid=#{ROOT_NODE} WHERE parentid=#{ROOT_NODE}
UNION UNION
SELECT nodeid, title, description, displayorder, parentid SELECT nodeid, title, description, displayorder, parentid
FROM #{DBPREFIX}node FROM #{DB_PREFIX}node
WHERE contenttypeid = 23 WHERE contenttypeid = #{@channel_typeid}
AND parentid IN (SELECT nodeid FROM #{DBPREFIX}node WHERE parentid=#{ROOT_NODE})").to_a AND parentid IN (SELECT nodeid FROM #{DB_PREFIX}node WHERE parentid=#{ROOT_NODE})").to_a
top_level_categories = categories.select { |c| c["parentid"] == ROOT_NODE } top_level_categories = categories.select { |c| c["parentid"] == ROOT_NODE }
@ -224,19 +234,26 @@ class ImportScripts::VBulletin < ImportScripts::Base
# keep track of closed topics # keep track of closed topics
@closed_topic_ids = [] @closed_topic_ids = []
topic_count = mysql_query("select count(nodeid) cnt from #{DBPREFIX}node where parentid in ( topic_count = mysql_query("SELECT COUNT(nodeid) cnt
select nodeid from #{DBPREFIX}node where contenttypeid=23 ) and contenttypeid=22;").first["cnt"] FROM #{DB_PREFIX}node
WHERE (unpublishdate = 0 OR unpublishdate IS NULL)
AND (approved = 1 AND showapproved = 1)
AND parentid IN (
SELECT nodeid FROM #{DB_PREFIX}node WHERE contenttypeid=#{@channel_typeid} ) AND contenttypeid=#{@text_typeid};"
).first["cnt"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL topics = mysql_query <<-SQL
SELECT t.nodeid AS threadid, t.title, t.parentid AS forumid,t.open,t.userid AS postuserid,t.publishdate AS dateline, SELECT t.nodeid AS threadid, t.title, t.parentid AS forumid,t.open,t.userid AS postuserid,t.publishdate AS dateline,
nv.count views, 1 AS visible, t.sticky, nv.count views, 1 AS visible, t.sticky,
CONVERT(CAST(rawtext AS BINARY)USING utf8) AS raw CONVERT(CAST(rawtext AS BINARY)USING utf8) AS raw
FROM #{DBPREFIX}node t FROM #{DB_PREFIX}node t
LEFT JOIN #{DBPREFIX}nodeview nv ON nv.nodeid=t.nodeid LEFT JOIN #{DB_PREFIX}nodeview nv ON nv.nodeid=t.nodeid
LEFT JOIN #{DBPREFIX}text txt ON txt.nodeid=t.nodeid LEFT JOIN #{DB_PREFIX}text txt ON txt.nodeid=t.nodeid
WHERE t.parentid in ( select nodeid from #{DBPREFIX}node where contenttypeid=23 ) WHERE t.parentid in ( select nodeid from #{DB_PREFIX}node where contenttypeid=#{@channel_typeid} )
AND t.contenttypeid = 22 AND t.contenttypeid = #{@text_typeid}
AND (t.unpublishdate = 0 OR t.unpublishdate IS NULL)
AND t.approved = 1 AND t.showapproved = 1
ORDER BY t.nodeid ORDER BY t.nodeid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -277,19 +294,19 @@ class ImportScripts::VBulletin < ImportScripts::Base
rescue rescue
end end
post_count = mysql_query("SELECT COUNT(nodeid) cnt FROM #{DBPREFIX}node WHERE parentid NOT IN ( post_count = mysql_query("SELECT COUNT(nodeid) cnt FROM #{DB_PREFIX}node WHERE parentid NOT IN (
SELECT nodeid FROM #{DBPREFIX}node WHERE contenttypeid=23 ) AND contenttypeid=22;").first["cnt"] SELECT nodeid FROM #{DB_PREFIX}node WHERE contenttypeid=#{@channel_typeid} ) AND contenttypeid=#{@text_typeid};").first["cnt"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
posts = mysql_query <<-SQL posts = mysql_query <<-SQL
SELECT p.nodeid AS postid, p.userid AS userid, p.parentid AS threadid, SELECT p.nodeid AS postid, p.userid AS userid, p.parentid AS threadid,
CONVERT(CAST(rawtext AS BINARY)USING utf8) AS raw, p.publishdate AS dateline, CONVERT(CAST(rawtext AS BINARY)USING utf8) AS raw, p.publishdate AS dateline,
1 AS visible, p.parentid AS parentid 1 AS visible, p.parentid AS parentid
FROM #{DBPREFIX}node p FROM #{DB_PREFIX}node p
LEFT JOIN #{DBPREFIX}nodeview nv ON nv.nodeid=p.nodeid LEFT JOIN #{DB_PREFIX}nodeview nv ON nv.nodeid=p.nodeid
LEFT JOIN #{DBPREFIX}text txt ON txt.nodeid=p.nodeid LEFT JOIN #{DB_PREFIX}text txt ON txt.nodeid=p.nodeid
WHERE p.parentid NOT IN ( select nodeid from #{DBPREFIX}node where contenttypeid=23 ) WHERE p.parentid NOT IN ( select nodeid from #{DB_PREFIX}node where contenttypeid=#{@channel_typeid} )
AND p.contenttypeid = 22 AND p.contenttypeid = #{@text_typeid}
ORDER BY postid ORDER BY postid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -320,86 +337,65 @@ class ImportScripts::VBulletin < ImportScripts::Base
end end
end end
# find the uploaded file information from the db
def find_upload(post, attachment_id)
sql = "SELECT a.filedataid, a.filename, fd.userid, LENGTH(fd.filedata) AS dbsize, filedata
FROM #{DBPREFIX}attach a
LEFT JOIN #{DBPREFIX}filedata fd ON fd.filedataid = a.filedataid
WHERE a.nodeid = #{attachment_id}"
results = mysql_query(sql)
unless (row = results.first)
puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
return nil
end
filename = File.join(ATTACHMENT_DIR, row['userid'].to_s.split('').join('/'), "#{row['filedataid']}.attach")
real_filename = row['filename']
real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
unless File.exists?(filename)
if row['dbsize'].to_i == 0
puts "Attachment file #{row['filedataid']} doesn't exist"
return nil
end
tmpfile = 'attach_' + row['filedataid'].to_s
filename = File.join('/tmp/', tmpfile)
File.open(filename, 'wb') { |f|
#f.write(PG::Connection.unescape_bytea(row['filedata']))
f.write(row['filedata'])
}
end
upload = create_upload(post.user.id, filename, real_filename)
if upload.nil? || !upload.valid?
puts "Upload not valid :("
puts upload.errors.inspect if upload
return nil
end
[upload, real_filename]
rescue Mysql2::Error => e
puts "SQL Error"
puts e.message
puts sql
nil
end
def import_attachments def import_attachments
puts '', 'importing attachments...' puts '', 'importing attachments...'
ext = mysql_query("SELECT GROUP_CONCAT(DISTINCT(extension)) exts FROM #{DB_PREFIX}filedata").first['exts'].split(',')
SiteSetting.authorized_extensions = (SiteSetting.authorized_extensions.split("|") + ext).uniq.join("|")
uploads = mysql_query <<-SQL
SELECT n.parentid nodeid, a.filename, fd.userid, LENGTH(fd.filedata) AS dbsize, filedata, fd.filedataid
FROM #{DB_PREFIX}attach a
LEFT JOIN #{DB_PREFIX}filedata fd ON fd.filedataid = a.filedataid
LEFT JOIN #{DB_PREFIX}node n on n.nodeid = a.nodeid
SQL
current_count = 0 current_count = 0
total_count = mysql_query("SELECT COUNT(nodeid) cnt FROM #{DBPREFIX}node WHERE contenttypeid=22 ").first["cnt"] total_count = uploads.count
success_count = 0 uploads.each do |upload|
fail_count = 0 post_id = PostCustomField.where(name: 'import_id').where(value: upload['nodeid']).first&.post_id
post_id = PostCustomField.where(name: 'import_id').where(value: "thread-#{upload['nodeid']}").first&.post_id unless post_id
if post_id.nil?
puts "Post for #{upload['nodeid']} not found"
next
end
post = Post.find(post_id)
attachment_regex = /\[attach[^\]]*\]n(\d+)\[\/attach\]/i filename = File.join(ATTACH_DIR, upload['userid'].to_s.split('').join('/'), "#{upload['filedataid']}.attach")
real_filename = upload['filename']
real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
Post.find_each do |post| unless File.exists?(filename)
current_count += 1 # attachments can be on filesystem or in database
print_status current_count, total_count # try to retrieve from database if the file did not exist on filesystem
if upload['dbsize'].to_i == 0
new_raw = post.raw.dup puts "Attachment file #{upload['filedataid']} doesn't exist"
new_raw.gsub!(attachment_regex) do |s|
matches = attachment_regex.match(s)
attachment_id = matches[1]
upload, filename = find_upload(post, attachment_id)
unless upload
fail_count += 1
next next
end end
html_for_upload(upload, filename)
tmpfile = 'attach_' + upload['filedataid'].to_s
filename = File.join('/tmp/', tmpfile)
File.open(filename, 'wb') { |f|
#f.write(PG::Connection.unescape_bytea(row['filedata']))
f.write(upload['filedata'])
}
end end
if new_raw != post.raw upl_obj = create_upload(post.user.id, filename, real_filename)
PostRevisor.new(post).revise!(post.user, { raw: new_raw }, bypass_bump: true, edit_reason: 'Import attachments from vBulletin') if upl_obj&.persisted?
html = html_for_upload(upl_obj, real_filename)
if !post.raw[html]
post.raw += "\n\n#{html}\n\n"
post.save!
PostUpload.create!(post: post, upload: upl_obj) unless PostUpload.where(post: post, upload: upl_obj).exists?
end
else
puts "Fail"
exit
end end
current_count += 1
success_count += 1 print_status(current_count, total_count)
end end
end end
@ -619,6 +615,105 @@ class ImportScripts::VBulletin < ImportScripts::Base
raw raw
end end
def create_permalinks
puts "", "creating permalinks..."
current_count = 0
total_count = mysql_query("SELECT COUNT(nodeid) cnt
FROM #{DB_PREFIX}node
WHERE (unpublishdate = 0 OR unpublishdate IS NULL)
AND (approved = 1 AND showapproved = 1)
AND parentid IN (
SELECT nodeid FROM #{DB_PREFIX}node WHERE contenttypeid=#{@channel_typeid} ) AND contenttypeid=#{@text_typeid};"
).first["cnt"]
batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL
SELECT p.urlident p1, f.urlident p2, t.nodeid, t.urlident p3
FROM #{DB_PREFIX}node f
LEFT JOIN #{DB_PREFIX}node t ON t.parentid = f.nodeid
LEFT JOIN #{DB_PREFIX}node p ON p.nodeid = f.parentid
WHERE f.contenttypeid = #{@channel_typeid}
AND t.contenttypeid = #{@text_typeid}
AND t.approved = 1 AND t.showapproved = 1
AND (t.unpublishdate = 0 OR t.unpublishdate IS NULL)
ORDER BY t.nodeid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if topics.size < 1
topics.each do |topic|
current_count += 1
print_status current_count, total_count
disc_topic = topic_lookup_from_imported_post_id("thread-#{topic['nodeid']}")
Permalink.create(
url: "#{URL_PREFIX}#{topic['p1']}/#{topic['p2']}/#{topic['nodeid']}-#{topic['p3']}",
topic_id: disc_topic[:topic_id]
) rescue nil
end
end
# cats
cats = mysql_query <<-SQL
SELECT nodeid, urlident
FROM #{DB_PREFIX}node
WHERE contenttypeid=#{@channel_typeid}
AND parentid=#{ROOT_NODE};
SQL
cats.each do |c|
category_id = CategoryCustomField.where(name: 'import_id').where(value: c['nodeid']).first.category_id
Permalink.create(url: "#{URL_PREFIX}#{c['urlident']}", category_id: category_id) rescue nil
end
# subcats
subcats = mysql_query <<-SQL
SELECT n1.nodeid,n2.urlident p1,n1.urlident p2
FROM #{DB_PREFIX}node n1
LEFT JOIN #{DB_PREFIX}node n2 ON n2.nodeid=n1.parentid
WHERE n2.parentid = #{ROOT_NODE}
AND n1.contenttypeid=#{@channel_typeid};
SQL
subcats.each do |sc|
category_id = CategoryCustomField.where(name: 'import_id').where(value: sc['nodeid']).first.category_id
Permalink.create(url: "#{URL_PREFIX}#{sc['p1']}/#{sc['p2']}", category_id: category_id) rescue nil
end
end
def import_tags
puts "", "importing tags..."
SiteSetting.tagging_enabled = true
SiteSetting.max_tags_per_topic = 100
staff_guardian = Guardian.new(Discourse.system_user)
records = mysql_query(<<~SQL
SELECT nodeid, GROUP_CONCAT(tagtext) tags
FROM #{DB_PREFIX}tag t
LEFT JOIN #{DB_PREFIX}tagnode tn ON tn.tagid = t.tagid
WHERE t.tagid IS NOT NULL
AND tn.nodeid IS NOT NULL
GROUP BY nodeid
SQL
).to_a
current_count = 0
total_count = records.count
records.each do |rec|
current_count += 1
print_status current_count, total_count
tl = topic_lookup_from_imported_post_id("thread-#{rec['nodeid']}")
next if tl.nil? # topic might have been deleted
topic = Topic.find(tl[:topic_id])
tag_names = rec['tags'].force_encoding("UTF-8").split(',')
DiscourseTagging.tag_topic_by_names(topic, staff_guardian, tag_names)
end
end
def parse_timestamp(timestamp) def parse_timestamp(timestamp)
Time.zone.at(@tz.utc_to_local(timestamp)) Time.zone.at(@tz.utc_to_local(timestamp))
end end
@ -626,7 +721,6 @@ class ImportScripts::VBulletin < ImportScripts::Base
def mysql_query(sql) def mysql_query(sql)
@client.query(sql, cache_rows: false) @client.query(sql, cache_rows: false)
end end
end end
ImportScripts::VBulletin.new.perform ImportScripts::VBulletin.new.perform