Improves the base importer

- Move some methods into their own classes in order to make it easier
  to reuse them outside of classes extending the base importer. For
  compatibility reasons the old methods are still in the base importer
  and delegate to the new objects. The following methods and hashes were
  extracted:
    - all the lookup maps for existing and imported data
    - all the methods used for uploads and attachments
- No need to store failed users and groups. This information wasn't
  used anyway.
- Print progress instead of category names when importing categories.
- Allow importers to override if bbcode_to_md should be used (until now
  it always used ARGV)
- Allow importers to add additional site settings that automatically get
  restored after the importer finishes.
- Show how many posts and messages are imported per minute. This should
  help detecting when the import is slowing down and needs to be
  restarted.
- Use max_image_width and max_image_height from settings instead of
  hard-coded values for uploaded images.
This commit is contained in:
Gerhard Schlager 2015-05-04 23:09:58 +02:00
parent 7f43653cad
commit 2dd01c61b0
3 changed files with 261 additions and 137 deletions

View File

@ -7,13 +7,13 @@ if ARGV.include?('bbcode-to-md')
# git clone https://github.com/nlalonde/ruby-bbcode-to-md.git
# cd ruby-bbcode-to-md
# gem build ruby-bbcode-to-md.gemspec
# gem install ruby-bbcode-to-md-0.0.13.gem
# gem install ruby-bbcode-to-md-*.gem
require 'ruby-bbcode-to-md'
end
require_relative '../../config/environment'
require_dependency 'url_helper'
require_dependency 'file_helper'
require_relative 'base/lookup_container'
require_relative 'base/uploader'
module ImportScripts; end
@ -24,46 +24,13 @@ class ImportScripts::Base
def initialize
preload_i18n
@bbcode_to_md = true if ARGV.include?('bbcode-to-md')
@existing_groups = {}
@failed_groups = []
@existing_users = {}
@failed_users = []
@categories_lookup = {}
@existing_posts = {}
@topic_lookup = {}
@site_settings_during_import = nil
@lookup = ImportScripts::LookupContainer.new
@uploader = ImportScripts::Uploader.new
@bbcode_to_md = true if use_bbcode_to_md?
@site_settings_during_import = {}
@old_site_settings = {}
@start_time = Time.now
puts "loading existing groups..."
GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id|
@existing_groups[import_id] = group_id
end
puts "loading existing users..."
UserCustomField.where(name: 'import_id').pluck(:user_id, :value).each do |user_id, import_id|
@existing_users[import_id] = user_id
end
puts "loading existing categories..."
CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id|
@categories_lookup[import_id] = category_id
end
puts "loading existing posts..."
PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id|
@existing_posts[import_id] = post_id
end
puts "loading existing topics..."
Post.joins(:topic).pluck("posts.id, posts.topic_id, posts.post_number, topics.slug").each do |p|
@topic_lookup[p[0]] = {
topic_id: p[1],
post_number: p[2],
url: Post.url(p[3], p[1], p[2]),
}
end
@start_times = {import: Time.now}
end
def preload_i18n
@ -87,15 +54,15 @@ class ImportScripts::Base
update_topic_count_replies
reset_topic_counters
elapsed = Time.now - @start_time
puts '', "Done (#{elapsed.to_s} seconds)"
elapsed = Time.now - @start_times[:import]
puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed/3600, elapsed/60%60, elapsed%60]
ensure
reset_site_settings
end
def change_site_settings
@site_settings_during_import = {
def get_site_settings_for_import
{
email_domains_blacklist: '',
min_topic_title_length: 1,
min_post_length: 1,
@ -106,6 +73,10 @@ class ImportScripts::Base
disable_emails: true,
authorized_extensions: '*'
}
end
def change_site_settings
@site_settings_during_import = get_site_settings_for_import
@site_settings_during_import.each do |key, value|
@old_site_settings[key] = SiteSetting.send(key)
@ -124,44 +95,42 @@ class ImportScripts::Base
RateLimiter.enable
end
def use_bbcode_to_md?
ARGV.include?("bbcode-to-md")
end
# Implementation will do most of its work in its execute method.
# It will need to call create_users, create_categories, and create_posts.
def execute
raise NotImplementedError
end
# Get the Discourse Post id based on the id of the source record
def post_id_from_imported_post_id(import_id)
@existing_posts[import_id] || @existing_posts[import_id.to_s]
@lookup.post_id_from_imported_post_id(import_id)
end
# Get the Discourse topic info (a hash) based on the id of the source record
def topic_lookup_from_imported_post_id(import_id)
post_id = post_id_from_imported_post_id(import_id)
post_id ? @topic_lookup[post_id] : nil
@lookup.topic_lookup_from_imported_post_id(import_id)
end
# Get the Discourse Group id based on the id of the source group
def group_id_from_imported_group_id(import_id)
@existing_groups[import_id] || @existing_groups[import_id.to_s] || find_group_by_import_id(import_id).try(:id)
@lookup.group_id_from_imported_group_id(import_id)
end
def find_group_by_import_id(import_id)
GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group)
@lookup.find_group_by_import_id(import_id)
end
# Get the Discourse User id based on the id of the source user
def user_id_from_imported_user_id(import_id)
@existing_users[import_id] || @existing_users[import_id.to_s] || find_user_by_import_id(import_id).try(:id)
@lookup.user_id_from_imported_user_id(import_id)
end
def find_user_by_import_id(import_id)
UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user)
@lookup.find_user_by_import_id(import_id)
end
# Get the Discourse Category id based on the id of the source category
def category_id_from_imported_category_id(import_id)
@categories_lookup[import_id] || @categories_lookup[import_id.to_s]
@lookup.category_id_from_imported_category_id(import_id)
end
def create_admin(opts={})
@ -183,31 +152,32 @@ class ImportScripts::Base
# group in the original datasource. The given id will not be used
# to create the Discourse group record.
def create_groups(results, opts={})
groups_created = 0
groups_skipped = 0
created = 0
skipped = 0
failed = 0
total = opts[:total] || results.size
results.each do |result|
g = yield(result)
if group_id_from_imported_group_id(g[:id])
groups_skipped += 1
if @lookup.group_id_from_imported_group_id(g[:id])
skipped += 1
else
new_group = create_group(g, g[:id])
if new_group.valid?
@existing_groups[g[:id].to_s] = new_group.id
groups_created += 1
@lookup.add_group(g[:id].to_s, new_group)
created += 1
else
@failed_groups << g
failed += 1
puts "Failed to create group id #{g[:id]} #{new_group.name}: #{new_group.errors.full_messages}"
end
end
print_status groups_created + groups_skipped + @failed_groups.length + (opts[:offset] || 0), total
print_status created + skipped + failed + (opts[:offset] || 0), total
end
return [groups_created, groups_skipped]
[created, skipped]
end
def create_group(opts, import_id)
@ -231,8 +201,9 @@ class ImportScripts::Base
# user in the original datasource. The given id will not be used to
# create the Discourse user record.
def create_users(results, opts={})
users_created = 0
users_skipped = 0
created = 0
skipped = 0
failed = 0
total = opts[:total] || results.size
results.each do |result|
@ -240,34 +211,34 @@ class ImportScripts::Base
# block returns nil to skip a user
if u.nil?
users_skipped += 1
skipped += 1
else
import_id = u[:id]
if user_id_from_imported_user_id(import_id)
users_skipped += 1
if @lookup.user_id_from_imported_user_id(import_id)
skipped += 1
elsif u[:email].present?
new_user = create_user(u, import_id)
if new_user.valid? && new_user.user_profile.valid?
@existing_users[import_id.to_s] = new_user.id
users_created += 1
@lookup.add_user(import_id.to_s, new_user)
created += 1
else
@failed_users << u
failed += 1
puts "Failed to create user id: #{import_id}, username: #{new_user.username}, email: #{new_user.email}"
puts "user errors: #{new_user.errors.full_messages}"
puts "user_profile errors: #{new_user.user_profiler.errors.full_messages}"
end
else
@failed_users << u
failed += 1
puts "Skipping user id #{import_id} because email is blank"
end
end
print_status users_created + users_skipped + @failed_users.length + (opts[:offset] || 0), total
print_status created + skipped + failed + (opts[:offset] || 0), total
end
return [users_created, users_skipped]
[created, skipped]
end
def create_user(opts, import_id)
@ -334,29 +305,39 @@ class ImportScripts::Base
# create the Discourse category record.
# Optional attributes are position, description, and parent_category_id.
def create_categories(results)
created = 0
skipped = 0
total = results.size
results.each do |c|
params = yield(c)
# block returns nil to skip
next if params.nil? || category_id_from_imported_category_id(params[:id])
if params.nil? || @lookup.category_id_from_imported_category_id(params[:id])
skipped += 1
else
# Basic massaging on the category name
params[:name] = "Blank" if params[:name].blank?
params[:name].strip!
params[:name] = params[:name][0..49]
# Basic massaging on the category name
params[:name] = "Blank" if params[:name].blank?
params[:name].strip!
params[:name] = params[:name][0..49]
# make sure categories don't go more than 2 levels deep
if params[:parent_category_id]
top = Category.find_by_id(params[:parent_category_id])
top = top.parent_category while top && !top.parent_category.nil?
params[:parent_category_id] = top.id if top
end
puts "\t#{params[:name]}"
new_category = create_category(params, params[:id])
@lookup.add_category(params[:id], new_category)
# make sure categories don't go more than 2 levels deep
if params[:parent_category_id]
top = Category.find_by_id(params[:parent_category_id])
top = top.parent_category while top && !top.parent_category.nil?
params[:parent_category_id] = top.id if top
created += 1
end
new_category = create_category(params, params[:id])
@categories_lookup[params[:id]] = new_category.id
print_status created + skipped, total
end
[created, skipped]
end
def create_category(opts, import_id)
@ -396,6 +377,7 @@ class ImportScripts::Base
skipped = 0
created = 0
total = opts[:total] || results.size
start_time = get_start_time("posts-#{total}") # the post count should be unique enough to differentiate between posts and PMs
results.each do |r|
params = yield(r)
@ -406,18 +388,14 @@ class ImportScripts::Base
else
import_id = params.delete(:id).to_s
if post_id_from_imported_post_id(import_id)
if @lookup.post_id_from_imported_post_id(import_id)
skipped += 1 # already imported this post
else
begin
new_post = create_post(params, import_id)
if new_post.is_a?(Post)
@existing_posts[import_id] = new_post.id
@topic_lookup[new_post.id] = {
post_number: new_post.post_number,
topic_id: new_post.topic_id,
url: new_post.url,
}
@lookup.add_post(import_id, new_post)
@lookup.add_topic(new_post)
created_post(new_post)
@ -439,10 +417,10 @@ class ImportScripts::Base
end
end
print_status skipped + created + (opts[:offset] || 0), total
print_status(created + skipped + (opts[:offset] || 0), total, start_time)
end
return [created, skipped]
[created, skipped]
end
def create_post(opts, import_id)
@ -463,19 +441,8 @@ class ImportScripts::Base
post ? post : post_creator.errors.full_messages
end
# Creates an upload.
# Expects path to be the full path and filename of the source file.
def create_upload(user_id, path, source_filename)
tmp = Tempfile.new('discourse-upload')
src = File.open(path)
FileUtils.copy_stream(src, tmp)
src.close
tmp.rewind
Upload.create_for(user_id, tmp, source_filename, tmp.size)
ensure
tmp.close rescue nil
tmp.unlink rescue nil
@uploader.create_upload(user_id, path, source_filename)
end
# Iterate through a list of bookmark records to be imported.
@ -484,8 +451,8 @@ class ImportScripts::Base
# Required fields are :user_id and :post_id, where both ids are
# the values in the original datasource.
def create_bookmarks(results, opts={})
bookmarks_created = 0
bookmarks_skipped = 0
created = 0
skipped = 0
total = opts[:total] || results.size
user = User.new
@ -495,23 +462,29 @@ class ImportScripts::Base
params = yield(result)
# only the IDs are needed, so this should be enough
user.id = user_id_from_imported_user_id(params[:user_id])
post.id = post_id_from_imported_post_id(params[:post_id])
if user.id.nil? || post.id.nil?
bookmarks_skipped += 1
puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}"
if params.nil?
skipped += 1
else
begin
PostAction.act(user, post, PostActionType.types[:bookmark])
bookmarks_created += 1
rescue PostAction::AlreadyActed
bookmarks_skipped += 1
end
user.id = @lookup.user_id_from_imported_user_id(params[:user_id])
post.id = @lookup.post_id_from_imported_post_id(params[:post_id])
print_status bookmarks_created + bookmarks_skipped + (opts[:offset] || 0), total
if user.id.nil? || post.id.nil?
skipped += 1
puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}"
else
begin
PostAction.act(user, post, PostActionType.types[:bookmark])
created += 1
rescue PostAction::AlreadyActed
skipped += 1
end
end
end
print_status created + skipped + (opts[:offset] || 0), total
end
[created, skipped]
end
def close_inactive_topics(opts={})
@ -633,23 +606,26 @@ class ImportScripts::Base
end
def html_for_upload(upload, display_filename)
if FileHelper.is_image?(upload.url)
embedded_image_html(upload)
else
attachment_html(upload, display_filename)
end
@uploader.html_for_upload(upload, display_filename)
end
def embedded_image_html(upload)
%Q[<img src="#{upload.url}" width="#{[upload.width, 640].compact.min}" height="#{[upload.height,480].compact.min}"><br/>]
@uploader.embedded_image_html(upload)
end
def attachment_html(upload, display_filename)
"<a class='attachment' href='#{upload.url}'>#{display_filename}</a> (#{number_to_human_size(upload.filesize)})"
@uploader.attachment_html(upload, display_filename)
end
def print_status(current, max)
print "\r%9d / %d (%5.1f%%) " % [current, max, ((current.to_f / max.to_f) * 100).round(1)]
def print_status(current, max, start_time = nil)
if start_time.present?
elapsed_seconds = Time.now - start_time
elements_per_minute = '[%.0f items/min] ' % [current / elapsed_seconds.to_f * 60]
else
elements_per_minute = ''
end
print "\r%9d / %d (%5.1f%%) %s" % [current, max, current / max.to_f * 100, elements_per_minute]
end
def print_spinner
@ -658,6 +634,10 @@ class ImportScripts::Base
print "\b#{@spinner_chars[0]}"
end
def get_start_time(key)
@start_times.fetch(key) {|k| @start_times[k] = Time.now}
end
def batches(batch_size)
offset = 0
loop do

View File

@ -0,0 +1,99 @@
module ImportScripts
class LookupContainer
def initialize
puts 'loading existing groups...'
@groups = {}
GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id|
@groups[import_id] = group_id
end
puts 'loading existing users...'
@users = {}
UserCustomField.where(name: 'import_id').pluck(:user_id, :value).each do |user_id, import_id|
@users[import_id] = user_id
end
puts 'loading existing categories...'
@categories = {}
CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id|
@categories[import_id] = category_id
end
puts 'loading existing posts...'
@posts = {}
PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id|
@posts[import_id] = post_id
end
puts 'loading existing topics...'
@topics = {}
Post.joins(:topic).pluck('posts.id, posts.topic_id, posts.post_number, topics.slug').each do |p|
@topics[p[0]] = {
topic_id: p[1],
post_number: p[2],
url: Post.url(p[3], p[1], p[2])
}
end
end
# Get the Discourse Post id based on the id of the source record
def post_id_from_imported_post_id(import_id)
@posts[import_id] || @posts[import_id.to_s]
end
# Get the Discourse topic info (a hash) based on the id of the source record
def topic_lookup_from_imported_post_id(import_id)
post_id = post_id_from_imported_post_id(import_id)
post_id ? @topics[post_id] : nil
end
# Get the Discourse Group id based on the id of the source group
def group_id_from_imported_group_id(import_id)
@groups[import_id] || @groups[import_id.to_s] || find_group_by_import_id(import_id).try(:id)
end
# Get the Discourse Group based on the id of the source group
def find_group_by_import_id(import_id)
GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group)
end
# Get the Discourse User id based on the id of the source user
def user_id_from_imported_user_id(import_id)
@users[import_id] || @users[import_id.to_s] || find_user_by_import_id(import_id).try(:id)
end
# Get the Discourse User based on the id of the source user
def find_user_by_import_id(import_id)
UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user)
end
# Get the Discourse Category id based on the id of the source category
def category_id_from_imported_category_id(import_id)
@categories[import_id] || @categories[import_id.to_s]
end
def add_group(import_id, group)
@groups[import_id] = group.id
end
def add_user(import_id, user)
@users[import_id] = user.id
end
def add_category(import_id, category)
@categories[import_id] = category.id
end
def add_post(import_id, post)
@posts[import_id] = post.id
end
def add_topic(post)
@topics[post.id] = {
post_number: post.post_number,
topic_id: post.topic_id,
url: post.url,
}
end
end
end

View File

@ -0,0 +1,45 @@
require_dependency 'url_helper'
require_dependency 'file_helper'
module ImportScripts
class Uploader
include ActionView::Helpers::NumberHelper
# Creates an upload.
# Expects path to be the full path and filename of the source file.
# @return [Upload]
def create_upload(user_id, path, source_filename)
tmp = Tempfile.new('discourse-upload')
src = File.open(path)
FileUtils.copy_stream(src, tmp)
src.close
tmp.rewind
Upload.create_for(user_id, tmp, source_filename, tmp.size)
rescue => e
Rails.logger.error("Failed to create upload: #{e}")
nil
ensure
tmp.close rescue nil
tmp.unlink rescue nil
end
def html_for_upload(upload, display_filename)
if FileHelper.is_image?(upload.url)
embedded_image_html(upload)
else
attachment_html(upload, display_filename)
end
end
def embedded_image_html(upload)
image_width = [upload.width, SiteSetting.max_image_width].compact.min
image_height = [upload.height, SiteSetting.max_image_height].compact.min
%Q[<img src="#{upload.url}" width="#{image_width}" height="#{image_height}"><br/>]
end
def attachment_html(upload, display_filename)
"<a class='attachment' href='#{upload.url}'>#{display_filename}</a> (#{number_to_human_size(upload.filesize)})"
end
end
end