50% faster vBulletin 4 importer

This commit is contained in:
Régis Hanol 2017-02-01 14:33:09 +01:00
parent 18007ed34b
commit 0d250c3935
3 changed files with 99 additions and 58 deletions

View File

@ -203,24 +203,23 @@ class ImportScripts::Base
# Checks whether every id in +import_ids+ already has an `import_id` custom
# field row for the given record type, so a caller can skip the whole batch.
#
# The ids are loaded into a temporary `import_ids` table and joined against
# the model named "#{type.to_s.classify}CustomField"; the number of joined
# rows is then compared with the number of ids supplied.
#
# @param type [Symbol, String] record type used to build the custom field
#   model name (callers in this importer pass :users and :posts)
# @param import_ids [Array] ids from the source forum; each is converted with
#   `to_s` and escaped before being inserted into the temp table
# @return [Boolean] true when the joined row count equals import_ids.length,
#   false otherwise (including when import_ids is empty)
def all_records_exist?(type, import_ids)
  # Nothing to look up: return before touching the database. This guard must
  # stay outside the begin/ensure below, otherwise the cleanup would run with
  # no connection and no temp table.
  return false if import_ids.empty?

  connection = ActiveRecord::Base.connection.raw_connection
  connection.exec('CREATE TEMP TABLE import_ids(val text PRIMARY KEY)')

  begin
    import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
    connection.exec("INSERT INTO import_ids VALUES #{import_id_clause}")

    existing = "#{type.to_s.classify}CustomField".constantize
      .where(name: 'import_id')
      .joins('JOIN import_ids ON val = value')
      .count

    all_exist = existing == import_ids.length
    puts "Skipping #{import_ids.length} already imported #{type}" if all_exist
    all_exist
  ensure
    # Only reached once the temp table has been created, so the drop is valid
    # on both the normal and the exceptional path.
    connection.exec('DROP TABLE import_ids')
  end
end
def created_user(user)

View File

@ -95,5 +95,14 @@ module ImportScripts
url: post.url,
}
end
# Tells whether the @users lookup already holds an entry for +import_id+.
# The id is tried as given and in its String form, so a lookup keyed by
# strings still matches an id passed in as something else (e.g. an Integer).
def user_already_imported?(import_id)
  [import_id, import_id.to_s].any? { |candidate| @users.has_key?(candidate) }
end
# Tells whether the @posts lookup already holds an entry for +import_id+.
# The id is tried as given and in its String form, so a lookup keyed by
# strings still matches an id passed in as something else (e.g. an Integer).
def post_already_imported?(import_id)
  [import_id, import_id.to_s].any? { |candidate| @posts.has_key?(candidate) }
end
end
end

View File

@ -73,6 +73,8 @@ EOM
def execute
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)") rescue nil
import_groups
import_users
create_groups_membership
@ -111,27 +113,35 @@ EOM
user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
last_user_id = -1
batches(BATCH_SIZE) do |offset|
users = mysql_query <<-SQL
users = mysql_query(<<-SQL
SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
FROM #{TABLE_PREFIX}user
WHERE userid > #{last_user_id}
ORDER BY userid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
).to_a
break if users.size < 1
break if users.empty?
next if all_records_exist? :users, users.map {|u| u["userid"].to_i}
last_user_id = users[-1]["userid"]
before = users.size
users.reject! { |u| @lookup.user_already_imported?(u["userid"].to_i) }
create_users(users, total: user_count, offset: offset) do |user|
email = user["email"].presence || fake_email
email = fake_email unless email[EmailValidator.email_regex]
username = @htmlentities.decode(user["username"]).strip
{
id: user["userid"],
name: username,
username: username,
email: user["email"].presence || fake_email,
email: email,
website: user["homepage"].strip,
title: @htmlentities.decode(user["usertitle"]).strip,
primary_group_id: group_id_from_imported_group_id(user["usergroupid"].to_i),
@ -275,19 +285,24 @@ EOM
topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"]
last_topic_id = -1
batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL
topics = mysql_query(<<-SQL
SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
p.pagetext raw
FROM #{TABLE_PREFIX}thread t
JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
WHERE t.threadid > #{last_topic_id}
ORDER BY t.threadid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
).to_a
break if topics.size < 1
next if all_records_exist? :posts, topics.map {|t| "thread-#{t["threadid"]}" }
break if topics.empty?
last_topic_id = topics[-1]["threadid"]
topics.reject! { |t| @lookup.post_already_imported?("thread-#{t["threadid"]}") }
create_posts(topics, total: topic_count, offset: offset) do |topic|
raw = preprocess_post_raw(topic["raw"]) rescue nil
@ -324,27 +339,32 @@ EOM
def import_posts
puts "", "importing posts..."
# make sure `firstpostid` is indexed
begin
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)")
rescue Mysql2::Error
puts 'Index already exists'
end
post_count = mysql_query(<<-SQL
SELECT COUNT(postid) count
FROM #{TABLE_PREFIX}post p
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
WHERE t.firstpostid <> p.postid
SQL
).first["count"]
post_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
last_post_id = -1
batches(BATCH_SIZE) do |offset|
posts = mysql_query <<-SQL
SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid
FROM #{TABLE_PREFIX}post
WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)
ORDER BY postid
posts = mysql_query(<<-SQL
SELECT p.postid, p.userid, p.threadid, p.pagetext raw, p.dateline, p.visible, p.parentid
FROM #{TABLE_PREFIX}post p
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
WHERE t.firstpostid <> p.postid
AND p.postid > #{last_post_id}
ORDER BY p.postid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
).to_a
break if posts.size < 1
next if all_records_exist? :posts, posts.map {|p| p["postid"] }
break if posts.empty?
last_post_id = posts[-1]["postid"]
posts.reject! { |p| @lookup.post_already_imported?(p["postid"].to_i) }
create_posts(posts, total: post_count, offset: offset) do |post|
raw = preprocess_post_raw(post["raw"]) rescue nil
@ -374,16 +394,17 @@ EOM
WHERE a.attachmentid = #{attachment_id}"
results = mysql_query(sql)
unless (row = results.first)
unless row = results.first
puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
return nil
return
end
filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach")
unless File.exists?(filename)
puts "Attachment file doesn't exist: #{filename}"
return nil
return
end
real_filename = row['filename']
real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
upload = create_upload(post.user.id, filename, real_filename)
@ -391,15 +412,14 @@ EOM
if upload.nil? || !upload.valid?
puts "Upload not valid :("
puts upload.errors.inspect if upload
return nil
return
end
return upload, real_filename
[upload, real_filename]
rescue Mysql2::Error => e
puts "SQL Error"
puts e.message
puts sql
return nil
end
@ -408,17 +428,22 @@ EOM
topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]
batches(BATCH_SIZE) do |offset|
private_messages = mysql_query <<-SQL
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
FROM #{TABLE_PREFIX}pmtext
ORDER BY pmtextid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
last_private_message_id = -1
break if private_messages.size < 1
next if all_records_exist? :posts, private_messages.map {|pm| "pm-#{pm['pmtextid']}" }
batches(BATCH_SIZE) do |offset|
private_messages = mysql_query(<<-SQL
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
FROM #{TABLE_PREFIX}pmtext
WHERE pmtextid > #{last_private_message_id}
ORDER BY pmtextid
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if private_messages.empty?
last_private_message_id = private_messages[-1]["pmtextid"]
private_messages.reject! { |pm| @lookup.post_already_imported?("pm-#{pm['pmtextid']}") }
title_username_of_pm_first_post = {}
@ -476,12 +501,13 @@ EOM
if title =~ /^Re:/
parent_id = title_username_of_pm_first_post[[title[3..-1], participants]]
parent_id = title_username_of_pm_first_post[[title[4..-1], participants]] unless parent_id
parent_id = title_username_of_pm_first_post[[title[5..-1], participants]] unless parent_id
parent_id = title_username_of_pm_first_post[[title[6..-1], participants]] unless parent_id
parent_id = title_username_of_pm_first_post[[title[7..-1], participants]] unless parent_id
parent_id = title_username_of_pm_first_post[[title[8..-1], participants]] unless parent_id
parent_id = title_username_of_pm_first_post[[title[3..-1], participants]] ||
title_username_of_pm_first_post[[title[4..-1], participants]] ||
title_username_of_pm_first_post[[title[5..-1], participants]] ||
title_username_of_pm_first_post[[title[6..-1], participants]] ||
title_username_of_pm_first_post[[title[7..-1], participants]] ||
title_username_of_pm_first_post[[title[8..-1], participants]]
if parent_id
if t = topic_lookup_from_imported_post_id("pm-#{parent_id}")
topic_id = t[:topic_id]
@ -496,7 +522,7 @@ EOM
mapped[:archetype] = Archetype.private_message
mapped[:target_usernames] = target_usernames.join(',')
if mapped[:target_usernames].empty? # pm with yourself?
if mapped[:target_usernames].size < 1 # pm with yourself?
# skip = true
mapped[:target_usernames] = "system"
puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
@ -515,7 +541,14 @@ EOM
puts '', 'importing attachments...'
current_count = 0
total_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
total_count = mysql_query(<<-SQL
SELECT COUNT(postid) count
FROM #{TABLE_PREFIX}post p
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
WHERE t.firstpostid <> p.postid
SQL
).first["count"]
success_count = 0
fail_count = 0