50% faster vBulletin 4 importer
This commit is contained in:
parent
18007ed34b
commit
0d250c3935
|
@ -203,24 +203,23 @@ class ImportScripts::Base
|
||||||
def all_records_exist?(type, import_ids)
|
def all_records_exist?(type, import_ids)
|
||||||
return false if import_ids.empty?
|
return false if import_ids.empty?
|
||||||
|
|
||||||
orig_conn = ActiveRecord::Base.connection
|
connection = ActiveRecord::Base.connection.raw_connection
|
||||||
conn = orig_conn.raw_connection
|
connection.exec('CREATE TEMP TABLE import_ids(val text PRIMARY KEY)')
|
||||||
|
|
||||||
conn.exec('CREATE TEMP TABLE import_ids(val varchar(200) PRIMARY KEY)')
|
|
||||||
|
|
||||||
import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
|
import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
|
||||||
|
|
||||||
conn.exec("INSERT INTO import_ids VALUES #{import_id_clause}")
|
connection.exec("INSERT INTO import_ids VALUES #{import_id_clause}")
|
||||||
|
|
||||||
existing = "#{type.to_s.classify}CustomField".constantize.where(name: 'import_id')
|
existing = "#{type.to_s.classify}CustomField".constantize
|
||||||
existing = existing.joins('JOIN import_ids ON val = value')
|
existing = existing.where(name: 'import_id')
|
||||||
|
.joins('JOIN import_ids ON val = value')
|
||||||
if existing.count == import_ids.length
|
.count
|
||||||
|
if existing == import_ids.length
|
||||||
puts "Skipping #{import_ids.length} already imported #{type}"
|
puts "Skipping #{import_ids.length} already imported #{type}"
|
||||||
return true
|
return true
|
||||||
end
|
end
|
||||||
ensure
|
ensure
|
||||||
conn.exec('DROP TABLE import_ids')
|
connection.exec('DROP TABLE import_ids')
|
||||||
end
|
end
|
||||||
|
|
||||||
def created_user(user)
|
def created_user(user)
|
||||||
|
|
|
@ -95,5 +95,14 @@ module ImportScripts
|
||||||
url: post.url,
|
url: post.url,
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def user_already_imported?(import_id)
|
||||||
|
@users.has_key?(import_id) || @users.has_key?(import_id.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
def post_already_imported?(import_id)
|
||||||
|
@posts.has_key?(import_id) || @posts.has_key?(import_id.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -73,6 +73,8 @@ EOM
|
||||||
|
|
||||||
|
|
||||||
def execute
|
def execute
|
||||||
|
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)") rescue nil
|
||||||
|
|
||||||
import_groups
|
import_groups
|
||||||
import_users
|
import_users
|
||||||
create_groups_membership
|
create_groups_membership
|
||||||
|
@ -111,27 +113,35 @@ EOM
|
||||||
|
|
||||||
user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
|
user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
|
||||||
|
|
||||||
|
last_user_id = -1
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
users = mysql_query <<-SQL
|
users = mysql_query(<<-SQL
|
||||||
SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
|
SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
|
||||||
FROM #{TABLE_PREFIX}user
|
FROM #{TABLE_PREFIX}user
|
||||||
|
WHERE userid > #{last_user_id}
|
||||||
ORDER BY userid
|
ORDER BY userid
|
||||||
LIMIT #{BATCH_SIZE}
|
LIMIT #{BATCH_SIZE}
|
||||||
OFFSET #{offset}
|
|
||||||
SQL
|
SQL
|
||||||
|
).to_a
|
||||||
|
|
||||||
break if users.size < 1
|
break if users.empty?
|
||||||
|
|
||||||
next if all_records_exist? :users, users.map {|u| u["userid"].to_i}
|
last_user_id = users[-1]["userid"]
|
||||||
|
before = users.size
|
||||||
|
users.reject! { |u| @lookup.user_already_imported?(u["userid"].to_i) }
|
||||||
|
|
||||||
create_users(users, total: user_count, offset: offset) do |user|
|
create_users(users, total: user_count, offset: offset) do |user|
|
||||||
|
email = user["email"].presence || fake_email
|
||||||
|
email = fake_email unless email[EmailValidator.email_regex]
|
||||||
|
|
||||||
username = @htmlentities.decode(user["username"]).strip
|
username = @htmlentities.decode(user["username"]).strip
|
||||||
|
|
||||||
{
|
{
|
||||||
id: user["userid"],
|
id: user["userid"],
|
||||||
name: username,
|
name: username,
|
||||||
username: username,
|
username: username,
|
||||||
email: user["email"].presence || fake_email,
|
email: email,
|
||||||
website: user["homepage"].strip,
|
website: user["homepage"].strip,
|
||||||
title: @htmlentities.decode(user["usertitle"]).strip,
|
title: @htmlentities.decode(user["usertitle"]).strip,
|
||||||
primary_group_id: group_id_from_imported_group_id(user["usergroupid"].to_i),
|
primary_group_id: group_id_from_imported_group_id(user["usergroupid"].to_i),
|
||||||
|
@ -275,19 +285,24 @@ EOM
|
||||||
|
|
||||||
topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"]
|
topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"]
|
||||||
|
|
||||||
|
last_topic_id = -1
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
topics = mysql_query <<-SQL
|
topics = mysql_query(<<-SQL
|
||||||
SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
|
SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
|
||||||
p.pagetext raw
|
p.pagetext raw
|
||||||
FROM #{TABLE_PREFIX}thread t
|
FROM #{TABLE_PREFIX}thread t
|
||||||
JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
|
JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
|
||||||
|
WHERE t.threadid > #{last_topic_id}
|
||||||
ORDER BY t.threadid
|
ORDER BY t.threadid
|
||||||
LIMIT #{BATCH_SIZE}
|
LIMIT #{BATCH_SIZE}
|
||||||
OFFSET #{offset}
|
|
||||||
SQL
|
SQL
|
||||||
|
).to_a
|
||||||
|
|
||||||
break if topics.size < 1
|
break if topics.empty?
|
||||||
next if all_records_exist? :posts, topics.map {|t| "thread-#{t["threadid"]}" }
|
|
||||||
|
last_topic_id = topics[-1]["threadid"]
|
||||||
|
topics.reject! { |t| @lookup.post_already_imported?("thread-#{t["threadid"]}") }
|
||||||
|
|
||||||
create_posts(topics, total: topic_count, offset: offset) do |topic|
|
create_posts(topics, total: topic_count, offset: offset) do |topic|
|
||||||
raw = preprocess_post_raw(topic["raw"]) rescue nil
|
raw = preprocess_post_raw(topic["raw"]) rescue nil
|
||||||
|
@ -324,27 +339,32 @@ EOM
|
||||||
def import_posts
|
def import_posts
|
||||||
puts "", "importing posts..."
|
puts "", "importing posts..."
|
||||||
|
|
||||||
# make sure `firstpostid` is indexed
|
post_count = mysql_query(<<-SQL
|
||||||
begin
|
SELECT COUNT(postid) count
|
||||||
mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)")
|
FROM #{TABLE_PREFIX}post p
|
||||||
rescue Mysql2::Error
|
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
|
||||||
puts 'Index already exists'
|
WHERE t.firstpostid <> p.postid
|
||||||
end
|
SQL
|
||||||
|
).first["count"]
|
||||||
|
|
||||||
post_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
|
last_post_id = -1
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
posts = mysql_query <<-SQL
|
posts = mysql_query(<<-SQL
|
||||||
SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid
|
SELECT p.postid, p.userid, p.threadid, p.pagetext raw, p.dateline, p.visible, p.parentid
|
||||||
FROM #{TABLE_PREFIX}post
|
FROM #{TABLE_PREFIX}post p
|
||||||
WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)
|
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
|
||||||
ORDER BY postid
|
WHERE t.firstpostid <> p.postid
|
||||||
|
AND p.postid > #{last_post_id}
|
||||||
|
ORDER BY p.postid
|
||||||
LIMIT #{BATCH_SIZE}
|
LIMIT #{BATCH_SIZE}
|
||||||
OFFSET #{offset}
|
|
||||||
SQL
|
SQL
|
||||||
|
).to_a
|
||||||
|
|
||||||
break if posts.size < 1
|
break if posts.empty?
|
||||||
next if all_records_exist? :posts, posts.map {|p| p["postid"] }
|
|
||||||
|
last_post_id = posts[-1]["postid"]
|
||||||
|
posts.reject! { |p| @lookup.post_already_imported?(p["postid"].to_i) }
|
||||||
|
|
||||||
create_posts(posts, total: post_count, offset: offset) do |post|
|
create_posts(posts, total: post_count, offset: offset) do |post|
|
||||||
raw = preprocess_post_raw(post["raw"]) rescue nil
|
raw = preprocess_post_raw(post["raw"]) rescue nil
|
||||||
|
@ -374,16 +394,17 @@ EOM
|
||||||
WHERE a.attachmentid = #{attachment_id}"
|
WHERE a.attachmentid = #{attachment_id}"
|
||||||
results = mysql_query(sql)
|
results = mysql_query(sql)
|
||||||
|
|
||||||
unless (row = results.first)
|
unless row = results.first
|
||||||
puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
|
puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
|
||||||
return nil
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach")
|
filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach")
|
||||||
unless File.exists?(filename)
|
unless File.exists?(filename)
|
||||||
puts "Attachment file doesn't exist: #{filename}"
|
puts "Attachment file doesn't exist: #{filename}"
|
||||||
return nil
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
real_filename = row['filename']
|
real_filename = row['filename']
|
||||||
real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
|
real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
|
||||||
upload = create_upload(post.user.id, filename, real_filename)
|
upload = create_upload(post.user.id, filename, real_filename)
|
||||||
|
@ -391,15 +412,14 @@ EOM
|
||||||
if upload.nil? || !upload.valid?
|
if upload.nil? || !upload.valid?
|
||||||
puts "Upload not valid :("
|
puts "Upload not valid :("
|
||||||
puts upload.errors.inspect if upload
|
puts upload.errors.inspect if upload
|
||||||
return nil
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
return upload, real_filename
|
[upload, real_filename]
|
||||||
rescue Mysql2::Error => e
|
rescue Mysql2::Error => e
|
||||||
puts "SQL Error"
|
puts "SQL Error"
|
||||||
puts e.message
|
puts e.message
|
||||||
puts sql
|
puts sql
|
||||||
return nil
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -408,17 +428,22 @@ EOM
|
||||||
|
|
||||||
topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]
|
topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
last_private_message_id = -1
|
||||||
private_messages = mysql_query <<-SQL
|
|
||||||
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
|
|
||||||
FROM #{TABLE_PREFIX}pmtext
|
|
||||||
ORDER BY pmtextid
|
|
||||||
LIMIT #{BATCH_SIZE}
|
|
||||||
OFFSET #{offset}
|
|
||||||
SQL
|
|
||||||
|
|
||||||
break if private_messages.size < 1
|
batches(BATCH_SIZE) do |offset|
|
||||||
next if all_records_exist? :posts, private_messages.map {|pm| "pm-#{pm['pmtextid']}" }
|
private_messages = mysql_query(<<-SQL
|
||||||
|
SELECT pmtextid, fromuserid, title, message, touserarray, dateline
|
||||||
|
FROM #{TABLE_PREFIX}pmtext
|
||||||
|
WHERE pmtextid > #{last_private_message_id}
|
||||||
|
ORDER BY pmtextid
|
||||||
|
LIMIT #{BATCH_SIZE}
|
||||||
|
SQL
|
||||||
|
).to_a
|
||||||
|
|
||||||
|
break if private_messages.empty?
|
||||||
|
|
||||||
|
last_private_message_id = private_messages[-1]["pmtextid"]
|
||||||
|
private_messages.reject! { |pm| @lookup.post_already_imported?("pm-#{pm['pmtextid']}") }
|
||||||
|
|
||||||
title_username_of_pm_first_post = {}
|
title_username_of_pm_first_post = {}
|
||||||
|
|
||||||
|
@ -476,12 +501,13 @@ EOM
|
||||||
|
|
||||||
if title =~ /^Re:/
|
if title =~ /^Re:/
|
||||||
|
|
||||||
parent_id = title_username_of_pm_first_post[[title[3..-1], participants]]
|
parent_id = title_username_of_pm_first_post[[title[3..-1], participants]] ||
|
||||||
parent_id = title_username_of_pm_first_post[[title[4..-1], participants]] unless parent_id
|
title_username_of_pm_first_post[[title[4..-1], participants]] ||
|
||||||
parent_id = title_username_of_pm_first_post[[title[5..-1], participants]] unless parent_id
|
title_username_of_pm_first_post[[title[5..-1], participants]] ||
|
||||||
parent_id = title_username_of_pm_first_post[[title[6..-1], participants]] unless parent_id
|
title_username_of_pm_first_post[[title[6..-1], participants]] ||
|
||||||
parent_id = title_username_of_pm_first_post[[title[7..-1], participants]] unless parent_id
|
title_username_of_pm_first_post[[title[7..-1], participants]] ||
|
||||||
parent_id = title_username_of_pm_first_post[[title[8..-1], participants]] unless parent_id
|
title_username_of_pm_first_post[[title[8..-1], participants]]
|
||||||
|
|
||||||
if parent_id
|
if parent_id
|
||||||
if t = topic_lookup_from_imported_post_id("pm-#{parent_id}")
|
if t = topic_lookup_from_imported_post_id("pm-#{parent_id}")
|
||||||
topic_id = t[:topic_id]
|
topic_id = t[:topic_id]
|
||||||
|
@ -496,7 +522,7 @@ EOM
|
||||||
mapped[:archetype] = Archetype.private_message
|
mapped[:archetype] = Archetype.private_message
|
||||||
mapped[:target_usernames] = target_usernames.join(',')
|
mapped[:target_usernames] = target_usernames.join(',')
|
||||||
|
|
||||||
if mapped[:target_usernames].empty? # pm with yourself?
|
if mapped[:target_usernames].size < 1 # pm with yourself?
|
||||||
# skip = true
|
# skip = true
|
||||||
mapped[:target_usernames] = "system"
|
mapped[:target_usernames] = "system"
|
||||||
puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
|
puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
|
||||||
|
@ -515,7 +541,14 @@ EOM
|
||||||
puts '', 'importing attachments...'
|
puts '', 'importing attachments...'
|
||||||
|
|
||||||
current_count = 0
|
current_count = 0
|
||||||
total_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
|
|
||||||
|
total_count = mysql_query(<<-SQL
|
||||||
|
SELECT COUNT(postid) count
|
||||||
|
FROM #{TABLE_PREFIX}post p
|
||||||
|
JOIN #{TABLE_PREFIX}thread t ON t.threadid = p.threadid
|
||||||
|
WHERE t.firstpostid <> p.postid
|
||||||
|
SQL
|
||||||
|
).first["count"]
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
fail_count = 0
|
fail_count = 0
|
||||||
|
|
Loading…
Reference in New Issue