50% faster vBulletin 4 importer

This commit is contained in:
Régis Hanol 2017-02-01 14:33:09 +01:00
parent 18007ed34b
commit 0d250c3935
3 changed files with 99 additions and 58 deletions

View File

@@ -203,24 +203,23 @@ class ImportScripts::Base
def all_records_exist?(type, import_ids) def all_records_exist?(type, import_ids)
return false if import_ids.empty? return false if import_ids.empty?
orig_conn = ActiveRecord::Base.connection connection = ActiveRecord::Base.connection.raw_connection
conn = orig_conn.raw_connection connection.exec('CREATE TEMP TABLE import_ids(val text PRIMARY KEY)')
conn.exec('CREATE TEMP TABLE import_ids(val varchar(200) PRIMARY KEY)')
import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",") import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
conn.exec("INSERT INTO import_ids VALUES #{import_id_clause}") connection.exec("INSERT INTO import_ids VALUES #{import_id_clause}")
existing = "#{type.to_s.classify}CustomField".constantize.where(name: 'import_id') existing = "#{type.to_s.classify}CustomField".constantize
existing = existing.joins('JOIN import_ids ON val = value') existing = existing.where(name: 'import_id')
.joins('JOIN import_ids ON val = value')
if existing.count == import_ids.length .count
if existing == import_ids.length
puts "Skipping #{import_ids.length} already imported #{type}" puts "Skipping #{import_ids.length} already imported #{type}"
return true return true
end end
ensure ensure
conn.exec('DROP TABLE import_ids') connection.exec('DROP TABLE import_ids')
end end
def created_user(user) def created_user(user)

View File

@@ -95,5 +95,14 @@ module ImportScripts
url: post.url, url: post.url,
} }
end end
def user_already_imported?(import_id)
@users.has_key?(import_id) || @users.has_key?(import_id.to_s)
end
def post_already_imported?(import_id)
@posts.has_key?(import_id) || @posts.has_key?(import_id.to_s)
end
end end
end end

View File

@@ -73,6 +73,8 @@ EOM
def execute def execute
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)") rescue nil
import_groups import_groups
import_users import_users
create_groups_membership create_groups_membership
@@ -111,27 +113,35 @@ EOM
user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"] user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
last_user_id = -1
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
users = mysql_query <<-SQL users = mysql_query(<<-SQL
SELECT userid, username, homepage, usertitle, usergroupid, joindate, email SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
FROM #{TABLE_PREFIX}user FROM #{TABLE_PREFIX}user
WHERE userid > #{last_user_id}
ORDER BY userid ORDER BY userid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL SQL
).to_a
break if users.size < 1 break if users.empty?
next if all_records_exist? :users, users.map {|u| u["userid"].to_i} last_user_id = users[-1]["userid"]
before = users.size
users.reject! { |u| @lookup.user_already_imported?(u["userid"].to_i) }
create_users(users, total: user_count, offset: offset) do |user| create_users(users, total: user_count, offset: offset) do |user|
email = user["email"].presence || fake_email
email = fake_email unless email[EmailValidator.email_regex]
username = @htmlentities.decode(user["username"]).strip username = @htmlentities.decode(user["username"]).strip
{ {
id: user["userid"], id: user["userid"],
name: username, name: username,
username: username, username: username,
email: user["email"].presence || fake_email, email: email,
website: user["homepage"].strip, website: user["homepage"].strip,
title: @htmlentities.decode(user["usertitle"]).strip, title: @htmlentities.decode(user["usertitle"]).strip,
primary_group_id: group_id_from_imported_group_id(user["usergroupid"].to_i), primary_group_id: group_id_from_imported_group_id(user["usergroupid"].to_i),
@@ -275,19 +285,24 @@ EOM
topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"] topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"]
last_topic_id = -1
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL topics = mysql_query(<<-SQL
SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky, SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
p.pagetext raw p.pagetext raw
FROM #{TABLE_PREFIX}thread t FROM #{TABLE_PREFIX}thread t
JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
WHERE t.threadid > #{last_topic_id}
ORDER BY t.threadid ORDER BY t.threadid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL SQL
).to_a
break if topics.size < 1 break if topics.empty?
next if all_records_exist? :posts, topics.map {|t| "thread-#{t["threadid"]}" }
last_topic_id = topics[-1]["threadid"]
topics.reject! { |t| @lookup.post_already_imported?("thread-#{t["threadid"]}") }
create_posts(topics, total: topic_count, offset: offset) do |topic| create_posts(topics, total: topic_count, offset: offset) do |topic|
raw = preprocess_post_raw(topic["raw"]) rescue nil raw = preprocess_post_raw(topic["raw"]) rescue nil
@@ -324,27 +339,32 @@ EOM
def import_posts def import_posts
puts "", "importing posts..." puts "", "importing posts..."
# make sure `firstpostid` is indexed post_count = mysql_query(<<-SQL
begin SELECT COUNT(postid) count
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)") FROM #{TABLE_PREFIX}post p
rescue Mysql2::Error JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
puts 'Index already exists' WHERE t.firstpostid <> p.postid
end SQL
).first["count"]
post_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"] last_post_id = -1
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
posts = mysql_query <<-SQL posts = mysql_query(<<-SQL
SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid SELECT p.postid, p.userid, p.threadid, p.pagetext raw, p.dateline, p.visible, p.parentid
FROM #{TABLE_PREFIX}post FROM #{TABLE_PREFIX}post p
WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread) JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
ORDER BY postid WHERE t.firstpostid <> p.postid
AND p.postid > #{last_post_id}
ORDER BY p.postid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL SQL
).to_a
break if posts.size < 1 break if posts.empty?
next if all_records_exist? :posts, posts.map {|p| p["postid"] }
last_post_id = posts[-1]["postid"]
posts.reject! { |p| @lookup.post_already_imported?(p["postid"].to_i) }
create_posts(posts, total: post_count, offset: offset) do |post| create_posts(posts, total: post_count, offset: offset) do |post|
raw = preprocess_post_raw(post["raw"]) rescue nil raw = preprocess_post_raw(post["raw"]) rescue nil
@@ -374,16 +394,17 @@ EOM
WHERE a.attachmentid = #{attachment_id}" WHERE a.attachmentid = #{attachment_id}"
results = mysql_query(sql) results = mysql_query(sql)
unless (row = results.first) unless row = results.first
puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}" puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
return nil return
end end
filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach") filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach")
unless File.exists?(filename) unless File.exists?(filename)
puts "Attachment file doesn't exist: #{filename}" puts "Attachment file doesn't exist: #{filename}"
return nil return
end end
real_filename = row['filename'] real_filename = row['filename']
real_filename.prepend SecureRandom.hex if real_filename[0] == '.' real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
upload = create_upload(post.user.id, filename, real_filename) upload = create_upload(post.user.id, filename, real_filename)
@@ -391,15 +412,14 @@ EOM
if upload.nil? || !upload.valid? if upload.nil? || !upload.valid?
puts "Upload not valid :(" puts "Upload not valid :("
puts upload.errors.inspect if upload puts upload.errors.inspect if upload
return nil return
end end
return upload, real_filename [upload, real_filename]
rescue Mysql2::Error => e rescue Mysql2::Error => e
puts "SQL Error" puts "SQL Error"
puts e.message puts e.message
puts sql puts sql
return nil
end end
@@ -408,17 +428,22 @@ EOM
topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"] topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]
batches(BATCH_SIZE) do |offset| last_private_message_id = -1
private_messages = mysql_query <<-SQL
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
FROM #{TABLE_PREFIX}pmtext
ORDER BY pmtextid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if private_messages.size < 1 batches(BATCH_SIZE) do |offset|
next if all_records_exist? :posts, private_messages.map {|pm| "pm-#{pm['pmtextid']}" } private_messages = mysql_query(<<-SQL
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
FROM #{TABLE_PREFIX}pmtext
WHERE pmtextid > #{last_private_message_id}
ORDER BY pmtextid
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if private_messages.empty?
last_private_message_id = private_messages[-1]["pmtextid"]
private_messages.reject! { |pm| @lookup.post_already_imported?("pm-#{pm['pmtextid']}") }
title_username_of_pm_first_post = {} title_username_of_pm_first_post = {}
@@ -476,12 +501,13 @@ EOM
if title =~ /^Re:/ if title =~ /^Re:/
parent_id = title_username_of_pm_first_post[[title[3..-1], participants]] parent_id = title_username_of_pm_first_post[[title[3..-1], participants]] ||
parent_id = title_username_of_pm_first_post[[title[4..-1], participants]] unless parent_id title_username_of_pm_first_post[[title[4..-1], participants]] ||
parent_id = title_username_of_pm_first_post[[title[5..-1], participants]] unless parent_id title_username_of_pm_first_post[[title[5..-1], participants]] ||
parent_id = title_username_of_pm_first_post[[title[6..-1], participants]] unless parent_id title_username_of_pm_first_post[[title[6..-1], participants]] ||
parent_id = title_username_of_pm_first_post[[title[7..-1], participants]] unless parent_id title_username_of_pm_first_post[[title[7..-1], participants]] ||
parent_id = title_username_of_pm_first_post[[title[8..-1], participants]] unless parent_id title_username_of_pm_first_post[[title[8..-1], participants]]
if parent_id if parent_id
if t = topic_lookup_from_imported_post_id("pm-#{parent_id}") if t = topic_lookup_from_imported_post_id("pm-#{parent_id}")
topic_id = t[:topic_id] topic_id = t[:topic_id]
@@ -496,7 +522,7 @@ EOM
mapped[:archetype] = Archetype.private_message mapped[:archetype] = Archetype.private_message
mapped[:target_usernames] = target_usernames.join(',') mapped[:target_usernames] = target_usernames.join(',')
if mapped[:target_usernames].empty? # pm with yourself? if mapped[:target_usernames].size < 1 # pm with yourself?
# skip = true # skip = true
mapped[:target_usernames] = "system" mapped[:target_usernames] = "system"
puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})" puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
@@ -515,7 +541,14 @@ EOM
puts '', 'importing attachments...' puts '', 'importing attachments...'
current_count = 0 current_count = 0
total_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
total_count = mysql_query(<<-SQL
SELECT COUNT(postid) count
FROM #{TABLE_PREFIX}post p
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
WHERE t.firstpostid <> p.postid
SQL
).first["count"]
success_count = 0 success_count = 0
fail_count = 0 fail_count = 0