Improves the base importer

- Move some methods into their own classes in order to make it easier
  to reuse them outside of classes extending the base importer. For
  compatibility reasons the old methods are still in the base importer
  and delegate to the new objects. The following methods and hashes were
  extracted:
    - all the lookup maps for existing and imported data
    - all the methods used for uploads and attachments
- No need to store failed users and groups. This information wasn't
  used anyway.
- Print progress instead of category names when importing categories.
- Allow importers to override whether bbcode_to_md should be used (until
  now this was always determined by ARGV)
- Allow importers to add additional site settings that automatically get
  restored after the importer finishes.
- Show how many posts and messages are imported per minute. This should
  help detect when the import is slowing down and needs to be
  restarted.
- Use max_image_width and max_image_height from settings instead of
  hard-coded values for uploaded images.
This commit is contained in:
Gerhard Schlager 2015-05-04 23:09:58 +02:00
parent 7f43653cad
commit 2dd01c61b0
3 changed files with 261 additions and 137 deletions

View File

@ -7,13 +7,13 @@ if ARGV.include?('bbcode-to-md')
# git clone https://github.com/nlalonde/ruby-bbcode-to-md.git # git clone https://github.com/nlalonde/ruby-bbcode-to-md.git
# cd ruby-bbcode-to-md # cd ruby-bbcode-to-md
# gem build ruby-bbcode-to-md.gemspec # gem build ruby-bbcode-to-md.gemspec
# gem install ruby-bbcode-to-md-0.0.13.gem # gem install ruby-bbcode-to-md-*.gem
require 'ruby-bbcode-to-md' require 'ruby-bbcode-to-md'
end end
require_relative '../../config/environment' require_relative '../../config/environment'
require_dependency 'url_helper' require_relative 'base/lookup_container'
require_dependency 'file_helper' require_relative 'base/uploader'
module ImportScripts; end module ImportScripts; end
@ -24,46 +24,13 @@ class ImportScripts::Base
def initialize def initialize
preload_i18n preload_i18n
@bbcode_to_md = true if ARGV.include?('bbcode-to-md') @lookup = ImportScripts::LookupContainer.new
@existing_groups = {} @uploader = ImportScripts::Uploader.new
@failed_groups = []
@existing_users = {} @bbcode_to_md = true if use_bbcode_to_md?
@failed_users = [] @site_settings_during_import = {}
@categories_lookup = {}
@existing_posts = {}
@topic_lookup = {}
@site_settings_during_import = nil
@old_site_settings = {} @old_site_settings = {}
@start_time = Time.now @start_times = {import: Time.now}
puts "loading existing groups..."
GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id|
@existing_groups[import_id] = group_id
end
puts "loading existing users..."
UserCustomField.where(name: 'import_id').pluck(:user_id, :value).each do |user_id, import_id|
@existing_users[import_id] = user_id
end
puts "loading existing categories..."
CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id|
@categories_lookup[import_id] = category_id
end
puts "loading existing posts..."
PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id|
@existing_posts[import_id] = post_id
end
puts "loading existing topics..."
Post.joins(:topic).pluck("posts.id, posts.topic_id, posts.post_number, topics.slug").each do |p|
@topic_lookup[p[0]] = {
topic_id: p[1],
post_number: p[2],
url: Post.url(p[3], p[1], p[2]),
}
end
end end
def preload_i18n def preload_i18n
@ -87,15 +54,15 @@ class ImportScripts::Base
update_topic_count_replies update_topic_count_replies
reset_topic_counters reset_topic_counters
elapsed = Time.now - @start_time elapsed = Time.now - @start_times[:import]
puts '', "Done (#{elapsed.to_s} seconds)" puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed/3600, elapsed/60%60, elapsed%60]
ensure ensure
reset_site_settings reset_site_settings
end end
def change_site_settings def get_site_settings_for_import
@site_settings_during_import = { {
email_domains_blacklist: '', email_domains_blacklist: '',
min_topic_title_length: 1, min_topic_title_length: 1,
min_post_length: 1, min_post_length: 1,
@ -106,6 +73,10 @@ class ImportScripts::Base
disable_emails: true, disable_emails: true,
authorized_extensions: '*' authorized_extensions: '*'
} }
end
def change_site_settings
@site_settings_during_import = get_site_settings_for_import
@site_settings_during_import.each do |key, value| @site_settings_during_import.each do |key, value|
@old_site_settings[key] = SiteSetting.send(key) @old_site_settings[key] = SiteSetting.send(key)
@ -124,44 +95,42 @@ class ImportScripts::Base
RateLimiter.enable RateLimiter.enable
end end
def use_bbcode_to_md?
ARGV.include?("bbcode-to-md")
end
# Implementation will do most of its work in its execute method. # Implementation will do most of its work in its execute method.
# It will need to call create_users, create_categories, and create_posts. # It will need to call create_users, create_categories, and create_posts.
def execute def execute
raise NotImplementedError raise NotImplementedError
end end
# Get the Discourse Post id based on the id of the source record
def post_id_from_imported_post_id(import_id) def post_id_from_imported_post_id(import_id)
@existing_posts[import_id] || @existing_posts[import_id.to_s] @lookup.post_id_from_imported_post_id(import_id)
end end
# Get the Discourse topic info (a hash) based on the id of the source record
def topic_lookup_from_imported_post_id(import_id) def topic_lookup_from_imported_post_id(import_id)
post_id = post_id_from_imported_post_id(import_id) @lookup.topic_lookup_from_imported_post_id(import_id)
post_id ? @topic_lookup[post_id] : nil
end end
# Get the Discourse Group id based on the id of the source group
def group_id_from_imported_group_id(import_id) def group_id_from_imported_group_id(import_id)
@existing_groups[import_id] || @existing_groups[import_id.to_s] || find_group_by_import_id(import_id).try(:id) @lookup.group_id_from_imported_group_id(import_id)
end end
def find_group_by_import_id(import_id) def find_group_by_import_id(import_id)
GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group) @lookup.find_group_by_import_id(import_id)
end end
# Get the Discourse User id based on the id of the source user
def user_id_from_imported_user_id(import_id) def user_id_from_imported_user_id(import_id)
@existing_users[import_id] || @existing_users[import_id.to_s] || find_user_by_import_id(import_id).try(:id) @lookup.user_id_from_imported_user_id(import_id)
end end
def find_user_by_import_id(import_id) def find_user_by_import_id(import_id)
UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user) @lookup.find_user_by_import_id(import_id)
end end
# Get the Discourse Category id based on the id of the source category
def category_id_from_imported_category_id(import_id) def category_id_from_imported_category_id(import_id)
@categories_lookup[import_id] || @categories_lookup[import_id.to_s] @lookup.category_id_from_imported_category_id(import_id)
end end
def create_admin(opts={}) def create_admin(opts={})
@ -183,31 +152,32 @@ class ImportScripts::Base
# group in the original datasource. The given id will not be used # group in the original datasource. The given id will not be used
# to create the Discourse group record. # to create the Discourse group record.
def create_groups(results, opts={}) def create_groups(results, opts={})
groups_created = 0 created = 0
groups_skipped = 0 skipped = 0
failed = 0
total = opts[:total] || results.size total = opts[:total] || results.size
results.each do |result| results.each do |result|
g = yield(result) g = yield(result)
if group_id_from_imported_group_id(g[:id]) if @lookup.group_id_from_imported_group_id(g[:id])
groups_skipped += 1 skipped += 1
else else
new_group = create_group(g, g[:id]) new_group = create_group(g, g[:id])
if new_group.valid? if new_group.valid?
@existing_groups[g[:id].to_s] = new_group.id @lookup.add_group(g[:id].to_s, new_group)
groups_created += 1 created += 1
else else
@failed_groups << g failed += 1
puts "Failed to create group id #{g[:id]} #{new_group.name}: #{new_group.errors.full_messages}" puts "Failed to create group id #{g[:id]} #{new_group.name}: #{new_group.errors.full_messages}"
end end
end end
print_status groups_created + groups_skipped + @failed_groups.length + (opts[:offset] || 0), total print_status created + skipped + failed + (opts[:offset] || 0), total
end end
return [groups_created, groups_skipped] [created, skipped]
end end
def create_group(opts, import_id) def create_group(opts, import_id)
@ -231,8 +201,9 @@ class ImportScripts::Base
# user in the original datasource. The given id will not be used to # user in the original datasource. The given id will not be used to
# create the Discourse user record. # create the Discourse user record.
def create_users(results, opts={}) def create_users(results, opts={})
users_created = 0 created = 0
users_skipped = 0 skipped = 0
failed = 0
total = opts[:total] || results.size total = opts[:total] || results.size
results.each do |result| results.each do |result|
@ -240,34 +211,34 @@ class ImportScripts::Base
# block returns nil to skip a user # block returns nil to skip a user
if u.nil? if u.nil?
users_skipped += 1 skipped += 1
else else
import_id = u[:id] import_id = u[:id]
if user_id_from_imported_user_id(import_id) if @lookup.user_id_from_imported_user_id(import_id)
users_skipped += 1 skipped += 1
elsif u[:email].present? elsif u[:email].present?
new_user = create_user(u, import_id) new_user = create_user(u, import_id)
if new_user.valid? && new_user.user_profile.valid? if new_user.valid? && new_user.user_profile.valid?
@existing_users[import_id.to_s] = new_user.id @lookup.add_user(import_id.to_s, new_user)
users_created += 1 created += 1
else else
@failed_users << u failed += 1
puts "Failed to create user id: #{import_id}, username: #{new_user.username}, email: #{new_user.email}" puts "Failed to create user id: #{import_id}, username: #{new_user.username}, email: #{new_user.email}"
puts "user errors: #{new_user.errors.full_messages}" puts "user errors: #{new_user.errors.full_messages}"
puts "user_profile errors: #{new_user.user_profiler.errors.full_messages}" puts "user_profile errors: #{new_user.user_profiler.errors.full_messages}"
end end
else else
@failed_users << u failed += 1
puts "Skipping user id #{import_id} because email is blank" puts "Skipping user id #{import_id} because email is blank"
end end
end end
print_status users_created + users_skipped + @failed_users.length + (opts[:offset] || 0), total print_status created + skipped + failed + (opts[:offset] || 0), total
end end
return [users_created, users_skipped] [created, skipped]
end end
def create_user(opts, import_id) def create_user(opts, import_id)
@ -334,19 +305,22 @@ class ImportScripts::Base
# create the Discourse category record. # create the Discourse category record.
# Optional attributes are position, description, and parent_category_id. # Optional attributes are position, description, and parent_category_id.
def create_categories(results) def create_categories(results)
created = 0
skipped = 0
total = results.size
results.each do |c| results.each do |c|
params = yield(c) params = yield(c)
# block returns nil to skip # block returns nil to skip
next if params.nil? || category_id_from_imported_category_id(params[:id]) if params.nil? || @lookup.category_id_from_imported_category_id(params[:id])
skipped += 1
else
# Basic massaging on the category name # Basic massaging on the category name
params[:name] = "Blank" if params[:name].blank? params[:name] = "Blank" if params[:name].blank?
params[:name].strip! params[:name].strip!
params[:name] = params[:name][0..49] params[:name] = params[:name][0..49]
puts "\t#{params[:name]}"
# make sure categories don't go more than 2 levels deep # make sure categories don't go more than 2 levels deep
if params[:parent_category_id] if params[:parent_category_id]
top = Category.find_by_id(params[:parent_category_id]) top = Category.find_by_id(params[:parent_category_id])
@ -355,8 +329,15 @@ class ImportScripts::Base
end end
new_category = create_category(params, params[:id]) new_category = create_category(params, params[:id])
@categories_lookup[params[:id]] = new_category.id @lookup.add_category(params[:id], new_category)
created += 1
end end
print_status created + skipped, total
end
[created, skipped]
end end
def create_category(opts, import_id) def create_category(opts, import_id)
@ -396,6 +377,7 @@ class ImportScripts::Base
skipped = 0 skipped = 0
created = 0 created = 0
total = opts[:total] || results.size total = opts[:total] || results.size
start_time = get_start_time("posts-#{total}") # the post count should be unique enough to differentiate between posts and PMs
results.each do |r| results.each do |r|
params = yield(r) params = yield(r)
@ -406,18 +388,14 @@ class ImportScripts::Base
else else
import_id = params.delete(:id).to_s import_id = params.delete(:id).to_s
if post_id_from_imported_post_id(import_id) if @lookup.post_id_from_imported_post_id(import_id)
skipped += 1 # already imported this post skipped += 1 # already imported this post
else else
begin begin
new_post = create_post(params, import_id) new_post = create_post(params, import_id)
if new_post.is_a?(Post) if new_post.is_a?(Post)
@existing_posts[import_id] = new_post.id @lookup.add_post(import_id, new_post)
@topic_lookup[new_post.id] = { @lookup.add_topic(new_post)
post_number: new_post.post_number,
topic_id: new_post.topic_id,
url: new_post.url,
}
created_post(new_post) created_post(new_post)
@ -439,10 +417,10 @@ class ImportScripts::Base
end end
end end
print_status skipped + created + (opts[:offset] || 0), total print_status(created + skipped + (opts[:offset] || 0), total, start_time)
end end
return [created, skipped] [created, skipped]
end end
def create_post(opts, import_id) def create_post(opts, import_id)
@ -463,19 +441,8 @@ class ImportScripts::Base
post ? post : post_creator.errors.full_messages post ? post : post_creator.errors.full_messages
end end
# Creates an upload.
# Expects path to be the full path and filename of the source file.
def create_upload(user_id, path, source_filename) def create_upload(user_id, path, source_filename)
tmp = Tempfile.new('discourse-upload') @uploader.create_upload(user_id, path, source_filename)
src = File.open(path)
FileUtils.copy_stream(src, tmp)
src.close
tmp.rewind
Upload.create_for(user_id, tmp, source_filename, tmp.size)
ensure
tmp.close rescue nil
tmp.unlink rescue nil
end end
# Iterate through a list of bookmark records to be imported. # Iterate through a list of bookmark records to be imported.
@ -484,8 +451,8 @@ class ImportScripts::Base
# Required fields are :user_id and :post_id, where both ids are # Required fields are :user_id and :post_id, where both ids are
# the values in the original datasource. # the values in the original datasource.
def create_bookmarks(results, opts={}) def create_bookmarks(results, opts={})
bookmarks_created = 0 created = 0
bookmarks_skipped = 0 skipped = 0
total = opts[:total] || results.size total = opts[:total] || results.size
user = User.new user = User.new
@ -495,23 +462,29 @@ class ImportScripts::Base
params = yield(result) params = yield(result)
# only the IDs are needed, so this should be enough # only the IDs are needed, so this should be enough
user.id = user_id_from_imported_user_id(params[:user_id]) if params.nil?
post.id = post_id_from_imported_post_id(params[:post_id]) skipped += 1
else
user.id = @lookup.user_id_from_imported_user_id(params[:user_id])
post.id = @lookup.post_id_from_imported_post_id(params[:post_id])
if user.id.nil? || post.id.nil? if user.id.nil? || post.id.nil?
bookmarks_skipped += 1 skipped += 1
puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}" puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}"
else else
begin begin
PostAction.act(user, post, PostActionType.types[:bookmark]) PostAction.act(user, post, PostActionType.types[:bookmark])
bookmarks_created += 1 created += 1
rescue PostAction::AlreadyActed rescue PostAction::AlreadyActed
bookmarks_skipped += 1 skipped += 1
end
end
end end
print_status bookmarks_created + bookmarks_skipped + (opts[:offset] || 0), total print_status created + skipped + (opts[:offset] || 0), total
end
end end
[created, skipped]
end end
def close_inactive_topics(opts={}) def close_inactive_topics(opts={})
@ -633,23 +606,26 @@ class ImportScripts::Base
end end
def html_for_upload(upload, display_filename) def html_for_upload(upload, display_filename)
if FileHelper.is_image?(upload.url) @uploader.html_for_upload(upload, display_filename)
embedded_image_html(upload)
else
attachment_html(upload, display_filename)
end
end end
def embedded_image_html(upload) def embedded_image_html(upload)
%Q[<img src="#{upload.url}" width="#{[upload.width, 640].compact.min}" height="#{[upload.height,480].compact.min}"><br/>] @uploader.embedded_image_html(upload)
end end
def attachment_html(upload, display_filename) def attachment_html(upload, display_filename)
"<a class='attachment' href='#{upload.url}'>#{display_filename}</a> (#{number_to_human_size(upload.filesize)})" @uploader.attachment_html(upload, display_filename)
end end
def print_status(current, max) def print_status(current, max, start_time = nil)
print "\r%9d / %d (%5.1f%%) " % [current, max, ((current.to_f / max.to_f) * 100).round(1)] if start_time.present?
elapsed_seconds = Time.now - start_time
elements_per_minute = '[%.0f items/min] ' % [current / elapsed_seconds.to_f * 60]
else
elements_per_minute = ''
end
print "\r%9d / %d (%5.1f%%) %s" % [current, max, current / max.to_f * 100, elements_per_minute]
end end
def print_spinner def print_spinner
@ -658,6 +634,10 @@ class ImportScripts::Base
print "\b#{@spinner_chars[0]}" print "\b#{@spinner_chars[0]}"
end end
def get_start_time(key)
@start_times.fetch(key) {|k| @start_times[k] = Time.now}
end
def batches(batch_size) def batches(batch_size)
offset = 0 offset = 0
loop do loop do

View File

@ -0,0 +1,99 @@
module ImportScripts
  # Holds the in-memory maps from ids of the source system ("import ids")
  # to ids of the corresponding Discourse records.
  #
  # The maps are filled from the `import_id` custom fields when the
  # container is created and are extended through the add_* methods while
  # an import is running.
  class LookupContainer
    def initialize
      puts 'loading existing groups...'
      @groups = load_import_ids(GroupCustomField, :group_id)

      puts 'loading existing users...'
      @users = load_import_ids(UserCustomField, :user_id)

      puts 'loading existing categories...'
      @categories = load_import_ids(CategoryCustomField, :category_id)

      puts 'loading existing posts...'
      @posts = load_import_ids(PostCustomField, :post_id)

      puts 'loading existing topics...'
      @topics = {}
      Post.joins(:topic).pluck('posts.id, posts.topic_id, posts.post_number, topics.slug').each do |p|
        @topics[p[0]] = {
          topic_id: p[1],
          post_number: p[2],
          url: Post.url(p[3], p[1], p[2])
        }
      end
    end

    # Get the Discourse Post id based on the id of the source record
    def post_id_from_imported_post_id(import_id)
      lookup(@posts, import_id)
    end

    # Get the Discourse topic info (a hash with :topic_id, :post_number and
    # :url) based on the id of the source record. Returns nil when the post
    # is unknown.
    def topic_lookup_from_imported_post_id(import_id)
      post_id = post_id_from_imported_post_id(import_id)
      post_id ? @topics[post_id] : nil
    end

    # Get the Discourse Group id based on the id of the source group.
    # Falls back to a database query when the id is not in the map.
    def group_id_from_imported_group_id(import_id)
      lookup(@groups, import_id) || find_group_by_import_id(import_id).try(:id)
    end

    # Get the Discourse Group based on the id of the source group
    def find_group_by_import_id(import_id)
      GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group)
    end

    # Get the Discourse User id based on the id of the source user.
    # Falls back to a database query when the id is not in the map.
    def user_id_from_imported_user_id(import_id)
      lookup(@users, import_id) || find_user_by_import_id(import_id).try(:id)
    end

    # Get the Discourse User based on the id of the source user
    def find_user_by_import_id(import_id)
      UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user)
    end

    # Get the Discourse Category id based on the id of the source category
    def category_id_from_imported_category_id(import_id)
      lookup(@categories, import_id)
    end

    # The add_* methods store the import id in its string form. Lookups try
    # the raw id first and then `import_id.to_s`, so a string key is found
    # no matter whether the caller later asks with 5 or with "5".

    def add_group(import_id, group)
      @groups[import_id.to_s] = group.id
    end

    def add_user(import_id, user)
      @users[import_id.to_s] = user.id
    end

    def add_category(import_id, category)
      @categories[import_id.to_s] = category.id
    end

    def add_post(import_id, post)
      @posts[import_id.to_s] = post.id
    end

    # Remembers topic id, post number and url of the given post, keyed by
    # the Discourse post id.
    def add_topic(post)
      @topics[post.id] = {
        post_number: post.post_number,
        topic_id: post.topic_id,
        url: post.url,
      }
    end

    private

    # Builds a map of import id => record id from the `import_id` rows of
    # the given custom field class. id_column names the column holding the
    # id of the owning record.
    def load_import_ids(custom_field_class, id_column)
      map = {}
      custom_field_class.where(name: 'import_id').pluck(id_column, :value).each do |record_id, import_id|
        map[import_id] = record_id
      end
      map
    end

    # Looks up import_id as given and, if that misses, in its string form.
    def lookup(map, import_id)
      map[import_id] || map[import_id.to_s]
    end
  end
end

View File

@ -0,0 +1,45 @@
require_dependency 'url_helper'
require_dependency 'file_helper'
module ImportScripts
  # Creates uploads from files of the source system and builds the HTML
  # snippets that reference those uploads.
  class Uploader
    include ActionView::Helpers::NumberHelper

    # Creates an upload.
    # Expects path to be the full path and filename of the source file.
    #
    # The source file is copied into a temporary file, which is handed to
    # Upload.create_for and removed again before the method returns.
    #
    # @return [Upload, nil] the result of Upload.create_for, or nil when an
    #   error was raised (the error is written to the Rails log)
    def create_upload(user_id, path, source_filename)
      tmp = Tempfile.new('discourse-upload')
      # The block form closes the source file even when copying raises.
      File.open(path) { |src| FileUtils.copy_stream(src, tmp) }
      tmp.rewind

      Upload.create_for(user_id, tmp, source_filename, tmp.size)
    rescue => e
      Rails.logger.error("Failed to create upload: #{e}")
      nil
    ensure
      # Best-effort cleanup; tmp is nil when Tempfile.new itself failed.
      tmp.close rescue nil
      tmp.unlink rescue nil
    end

    # Returns an <img> tag when the url of the upload is recognized as an
    # image by FileHelper, otherwise a link to the attachment.
    def html_for_upload(upload, display_filename)
      if FileHelper.is_image?(upload.url)
        embedded_image_html(upload)
      else
        attachment_html(upload, display_filename)
      end
    end

    # Builds an <img> tag for the upload. Width and height are each capped
    # independently at the max_image_width / max_image_height site
    # settings; a missing (nil) dimension falls back to the setting.
    def embedded_image_html(upload)
      image_width = [upload.width, SiteSetting.max_image_width].compact.min
      image_height = [upload.height, SiteSetting.max_image_height].compact.min
      %Q[<img src="#{upload.url}" width="#{image_width}" height="#{image_height}"><br/>]
    end

    # Builds a link to the upload followed by its human readable file size.
    # NOTE(review): display_filename is interpolated without HTML escaping;
    # confirm that callers pass safe values.
    def attachment_html(upload, display_filename)
      "<a class='attachment' href='#{upload.url}'>#{display_filename}</a> (#{number_to_human_size(upload.filesize)})"
    end
  end
end