Improve the import script for "Get Satisfaction"
* Works with the current column layout exported as Excel file * Tries to fix invalid CSV when it wasn't exported from Excel * Imports categories * Imports topics into the correct category * Allows skipping archived topics * Allows skipping private topics * Makes use of the latest features from the base importer * Some minor fixes and documentation updates
This commit is contained in:
parent
c5d26992d4
commit
a0d5e243fd
|
@ -1,22 +1,41 @@
|
|||
# getsatisfaction importer
|
||||
#
|
||||
# pre-req: you will get a bunch of CSV files, be sure to rename them all so
|
||||
# pre-req: You will either get an Excel or a bunch of CSV files. Be sure to rename them all so that
|
||||
#
|
||||
# - users.csv is the users table export (it may come from getsatisfaction as Users-Table 1.csv
|
||||
# - users.csv is the users table export
|
||||
# - replies.csv is the reply table export
|
||||
# - topics.csv is the topics table export
|
||||
# - categories.csv is the categories table export
|
||||
# - topics_categories.csv is the mapping between the topics and categories table
|
||||
#
|
||||
# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use comma as column separator.
|
||||
# That's usually the case when you export Excel sheets as CSV.
|
||||
# When you get MalformedCSVError during the import, try converting the line endings of the CSV into the Unix format.
|
||||
# Mixed line endings in CSV files can create weird errors!
|
||||
#
|
||||
# note, the importer will import all topics into a new category called 'Old Forum' and optionally close all the topics
|
||||
#
|
||||
# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields.
|
||||
# The import script expects quotes to be escaped with "".
|
||||
|
||||
require 'csv'
|
||||
require 'set'
|
||||
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
||||
require 'reverse_markdown' # gem 'reverse_markdown'
|
||||
|
||||
# Call it like this:
|
||||
# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb
|
||||
# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME
|
||||
class ImportScripts::GetSatisfaction < ImportScripts::Base
|
||||
|
||||
IMPORT_ARCHIVED_TOPICS = false
|
||||
|
||||
# The script classifies each topic as private when at least one associated category
|
||||
# in "topics_categories.csv" is unknown (not included i "categories.csv").
|
||||
IMPORT_PRIVATE_TOPICS = false
|
||||
|
||||
# Should the creation of permalinks be skipped? Make sure you configure OLD_DOMAIN if you
|
||||
CREATE_PERMALINKS = true
|
||||
|
||||
# Replace "http://community.example.com/" with the URL of your community for permalinks
|
||||
OLD_DOMAIN = "http://community.example.com/"
|
||||
BATCH_SIZE = 1000
|
||||
|
||||
def initialize(path)
|
||||
|
@ -24,116 +43,78 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
|
|||
super()
|
||||
@bbcode_to_md = true
|
||||
@topic_slug = {}
|
||||
|
||||
puts "loading post mappings..."
|
||||
@post_number_map = {}
|
||||
Post.pluck(:id, :post_number).each do |post_id, post_number|
|
||||
@post_number_map[post_id] = post_number
|
||||
end
|
||||
end
|
||||
|
||||
def created_post(post)
|
||||
@post_number_map[post.id] = post.post_number
|
||||
super
|
||||
@topic_categories = {}
|
||||
@skipped_topics = Set.new
|
||||
end
|
||||
|
||||
def execute
|
||||
c = Category.find_by(name: 'Old Forum') ||
|
||||
Category.create!(name: 'Old Forum', user: Discourse.system_user)
|
||||
# TODO Remove the call to fix_quotes_in_csv() if your replies.csv uses the double quotes ("").
|
||||
# That's usually the case when you exported the file from Excel.
|
||||
fix_quotes_in_csv("replies")
|
||||
|
||||
import_users
|
||||
import_posts(c)
|
||||
import_categories
|
||||
import_topics
|
||||
import_posts
|
||||
|
||||
create_permalinks
|
||||
|
||||
# uncomment if you want to close all the topics
|
||||
# Topic.where(category: c).update_all(closed: true)
|
||||
create_permalinks if CREATE_PERMALINKS
|
||||
end
|
||||
|
||||
class RowResolver
|
||||
def load(row)
|
||||
@row = row
|
||||
def csv_filename(table_name, use_fixed: true)
|
||||
if use_fixed
|
||||
filename = File.join(@path, "#{table_name}_fixed.csv")
|
||||
return filename if File.exists?(filename)
|
||||
end
|
||||
|
||||
def self.create(cols)
|
||||
Class.new(RowResolver).new(cols)
|
||||
File.join(@path, "#{table_name}.csv")
|
||||
end
|
||||
|
||||
def initialize(cols)
|
||||
cols.each_with_index do |col, idx|
|
||||
self.class.send(:define_method, col) do
|
||||
@row[idx]
|
||||
end
|
||||
def fix_quotes_in_csv(*table_names)
|
||||
puts "", "fixing CSV files"
|
||||
|
||||
table_names.each do |table_name|
|
||||
source_filename = csv_filename(table_name, use_fixed: false)
|
||||
target_filename = csv_filename("#{table_name}_fixed", use_fixed: false)
|
||||
|
||||
previous_line = nil
|
||||
|
||||
File.open(target_filename, "w") do |file|
|
||||
File.open(source_filename).each_line do |line|
|
||||
line.gsub!(/(?<![^\\]\\)\\"/, '""')
|
||||
line.gsub!(/\\\\/, '\\')
|
||||
|
||||
if previous_line
|
||||
previous_line << "\n" unless line.starts_with?(",")
|
||||
line = "#{previous_line}#{line}"
|
||||
previous_line = nil
|
||||
end
|
||||
|
||||
if line.gsub!(/,\+1\\\R$/m, ',"+1"').present?
|
||||
previous_line = line
|
||||
else
|
||||
file.puts(line)
|
||||
end
|
||||
end
|
||||
|
||||
def load_user_batch!(users, offset, total)
|
||||
if users.length > 0
|
||||
create_users(users, offset: offset, total: total) do |user|
|
||||
user
|
||||
file.puts(previous_line) if previous_line
|
||||
end
|
||||
users.clear
|
||||
end
|
||||
end
|
||||
|
||||
def csv_parse(name)
|
||||
filename = "#{@path}/#{name}.csv"
|
||||
first = true
|
||||
row = nil
|
||||
|
||||
current_row = "";
|
||||
double_quote_count = 0
|
||||
|
||||
# In case of Excel export file, I converted it to CSV and used:
|
||||
# CSV.open(filename, encoding:'iso-8859-1:utf-8').each do |raw|
|
||||
File.open(filename).each_line do |line|
|
||||
|
||||
line.strip!
|
||||
|
||||
current_row << "\n" unless current_row.empty?
|
||||
current_row << line
|
||||
|
||||
raw = begin
|
||||
CSV.parse(current_row, col_sep: ";")
|
||||
rescue CSV::MalformedCSVError => e
|
||||
puts e.message
|
||||
puts "*" * 100
|
||||
puts "Bad row skipped, line is: #{line}"
|
||||
puts
|
||||
puts current_row
|
||||
puts
|
||||
puts "double quote count is : #{double_quote_count}"
|
||||
puts "*" * 100
|
||||
|
||||
current_row = ""
|
||||
double_quote_count = 0
|
||||
|
||||
next
|
||||
end[0]
|
||||
|
||||
if first
|
||||
row = RowResolver.create(raw)
|
||||
|
||||
current_row = ""
|
||||
double_quote_count = 0
|
||||
first = false
|
||||
next
|
||||
def csv_parse(table_name)
|
||||
CSV.foreach(csv_filename(table_name),
|
||||
headers: true,
|
||||
header_converters: :symbol,
|
||||
skip_blanks: true,
|
||||
encoding: 'bom|utf-8') { |row| yield row }
|
||||
end
|
||||
|
||||
row.load(raw)
|
||||
|
||||
yield row
|
||||
|
||||
current_row = ""
|
||||
double_quote_count = 0
|
||||
end
|
||||
end
|
||||
|
||||
def total_rows(table)
|
||||
# In case of Excel export file, I converted it to CSV and used:
|
||||
# CSV.foreach("#{@path}/#{table}.csv", encoding:'iso-8859-1:utf-8').inject(0) {|c, line| c+1} - 1
|
||||
File.foreach("#{@path}/#{table}.csv").inject(0) { |c, line| c + 1 } - 1
|
||||
def total_rows(table_name)
|
||||
CSV.foreach(csv_filename(table_name),
|
||||
headers: true,
|
||||
skip_blanks: true,
|
||||
encoding: 'bom|utf-8')
|
||||
.inject(0) { |c, _| c + 1 }
|
||||
end
|
||||
|
||||
def import_users
|
||||
|
@ -145,52 +126,44 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
|
|||
total = total_rows("users")
|
||||
|
||||
csv_parse("users") do |row|
|
||||
|
||||
if row.suspended_at
|
||||
puts "skipping suspended user"
|
||||
p row
|
||||
next
|
||||
end
|
||||
|
||||
id = row.user_id
|
||||
email = row.email
|
||||
|
||||
# fake it
|
||||
if row.email.blank? || row.email !~ /@/
|
||||
email = SecureRandom.hex << "@domain.com"
|
||||
end
|
||||
|
||||
name = row.real_name
|
||||
username = row.nick
|
||||
created_at = DateTime.parse(row.m_created)
|
||||
|
||||
username = name if username == "NULL"
|
||||
username = email.split("@")[0] if username.blank?
|
||||
name = email.split("@")[0] if name.blank?
|
||||
|
||||
users << {
|
||||
id: id,
|
||||
email: email,
|
||||
name: name,
|
||||
username: username,
|
||||
created_at: created_at,
|
||||
active: false
|
||||
id: row[:user_id],
|
||||
email: row[:email],
|
||||
name: row[:realname],
|
||||
username: row[:nickname],
|
||||
created_at: DateTime.parse(row[:joined_date]),
|
||||
active: true
|
||||
}
|
||||
|
||||
count += 1
|
||||
if count % BATCH_SIZE == 0
|
||||
load_user_batch! users, count - users.length, total
|
||||
import_users_batch!(users, count - users.length, total)
|
||||
end
|
||||
end
|
||||
|
||||
import_users_batch!(users, count - users.length, total)
|
||||
end
|
||||
|
||||
load_user_batch! users, count, total
|
||||
def import_users_batch!(users, offset, total)
|
||||
return if users.empty?
|
||||
|
||||
create_users(users, offset: offset, total: total) do |user|
|
||||
user
|
||||
end
|
||||
users.clear
|
||||
end
|
||||
|
||||
def import_categories
|
||||
puts "", "creating categories"
|
||||
|
||||
rows = []
|
||||
|
||||
csv_parse("categories") do |row|
|
||||
rows << { id: row.id, name: row.name, description: row.description }
|
||||
rows << {
|
||||
id: row[:category_id],
|
||||
name: row[:name],
|
||||
description: row[:description].present? ? normalize_raw!(row[:description]) : nil
|
||||
}
|
||||
end
|
||||
|
||||
create_categories(rows) do |row|
|
||||
|
@ -198,8 +171,159 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
|
|||
end
|
||||
end
|
||||
|
||||
def import_topic_id(topic_id)
|
||||
"T#{topic_id}"
|
||||
end
|
||||
|
||||
def import_topics
|
||||
read_topic_categories
|
||||
|
||||
puts "", "creating topics"
|
||||
|
||||
count = 0
|
||||
topics = []
|
||||
|
||||
total = total_rows("topics")
|
||||
|
||||
csv_parse("topics") do |row|
|
||||
topic = nil
|
||||
topic_id = import_topic_id(row[:topic_id])
|
||||
|
||||
if skip_topic?(row)
|
||||
@skipped_topics.add(topic_id)
|
||||
else
|
||||
topic = map_post(row)
|
||||
topic[:id] = topic_id
|
||||
topic[:title] = row[:subject].present? ? row[:subject].strip[0...255] : "Topic title missing"
|
||||
topic[:category] = category_id(row)
|
||||
topic[:archived] = row[:archived_at].present?
|
||||
|
||||
@topic_slug[topic[:id]] = row[:url] if CREATE_PERMALINKS
|
||||
end
|
||||
|
||||
topics << topic
|
||||
count += 1
|
||||
|
||||
if count % BATCH_SIZE == 0
|
||||
import_topics_batch!(topics, count - topics.length, total)
|
||||
end
|
||||
end
|
||||
|
||||
import_topics_batch!(topics, count - topics.length, total)
|
||||
end
|
||||
|
||||
def skip_topic?(row)
|
||||
return true if row[:removed] == "1"
|
||||
return true unless IMPORT_ARCHIVED_TOPICS || row[:archived_at].blank?
|
||||
|
||||
unless IMPORT_PRIVATE_TOPICS
|
||||
categories = @topic_categories[row[:topic_id]]
|
||||
return true if categories && categories[:has_unknown_category]
|
||||
end
|
||||
|
||||
false
|
||||
end
|
||||
|
||||
def category_id(row)
|
||||
categories = @topic_categories[row[:topic_id]]
|
||||
return categories[:category_ids].last if categories
|
||||
|
||||
SiteSetting.uncategorized_category_id
|
||||
end
|
||||
|
||||
def read_topic_categories
|
||||
puts "", "reading topic_categories"
|
||||
|
||||
count = 0
|
||||
total = total_rows("topics_categories")
|
||||
|
||||
csv_parse("topics_categories") do |row|
|
||||
topic_id = row[:topic_id]
|
||||
category_id = category_id_from_imported_category_id(row[:category_id])
|
||||
|
||||
@topic_categories[topic_id] ||= { category_ids: [], has_unknown_category: false }
|
||||
|
||||
if category_id.nil?
|
||||
@topic_categories[topic_id][:has_unknown_category] = true
|
||||
else
|
||||
@topic_categories[topic_id][:category_ids] << category_id
|
||||
end
|
||||
|
||||
count += 1
|
||||
print_status(count, total)
|
||||
end
|
||||
end
|
||||
|
||||
def import_topics_batch!(topics, offset, total)
|
||||
return if topics.empty?
|
||||
|
||||
create_posts(topics, total: total, offset: offset) { |topic| topic }
|
||||
topics.clear
|
||||
end
|
||||
|
||||
def import_posts
|
||||
puts "", "creating posts"
|
||||
|
||||
count = 0
|
||||
posts = []
|
||||
|
||||
total = total_rows("replies")
|
||||
|
||||
csv_parse("replies") do |row|
|
||||
post = nil
|
||||
|
||||
if row[:removed] != "1"
|
||||
parent = topic_lookup_from_imported_post_id(row[:parent_id]) if row[:parent_id] != "NULL"
|
||||
|
||||
post = map_post(row)
|
||||
post[:id] = row[:reply_id]
|
||||
post[:topic_id] = import_topic_id(row[:topic_id])
|
||||
post[:reply_to_post_number] = parent[:post_number] if parent
|
||||
end
|
||||
|
||||
posts << post
|
||||
count += 1
|
||||
|
||||
if count % BATCH_SIZE == 0
|
||||
import_posts_batch!(posts, count - posts.length, total)
|
||||
end
|
||||
end
|
||||
|
||||
import_posts_batch!(posts, count - posts.length, total)
|
||||
end
|
||||
|
||||
def import_posts_batch!(posts, offset, total)
|
||||
return if posts.empty?
|
||||
|
||||
create_posts(posts, total: total, offset: offset) do |post|
|
||||
next if post.nil? || @skipped_topics.include?(post[:topic_id])
|
||||
|
||||
topic = topic_lookup_from_imported_post_id(post[:topic_id])
|
||||
|
||||
if topic
|
||||
post[:topic_id] = topic[:topic_id]
|
||||
else
|
||||
p "MISSING TOPIC #{post[:topic_id]}"
|
||||
p post
|
||||
next
|
||||
end
|
||||
|
||||
post
|
||||
end
|
||||
|
||||
posts.clear
|
||||
end
|
||||
|
||||
def map_post(row)
|
||||
{
|
||||
user_id: user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id,
|
||||
created_at: DateTime.parse(row[:created_at]),
|
||||
raw: normalize_raw!(row[:formatted_content])
|
||||
}
|
||||
end
|
||||
|
||||
def normalize_raw!(raw)
|
||||
return "<missing>" if raw.nil?
|
||||
return "<missing>" if raw.blank?
|
||||
raw = raw.dup
|
||||
|
||||
# hoist code
|
||||
|
@ -229,120 +353,14 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
|
|||
raw
|
||||
end
|
||||
|
||||
def import_post_batch!(posts, topics, offset, total)
|
||||
create_posts(posts, total: total, offset: offset) do |post|
|
||||
|
||||
mapped = {}
|
||||
|
||||
mapped[:id] = post[:id]
|
||||
mapped[:user_id] = user_id_from_imported_user_id(post[:user_id]) || -1
|
||||
mapped[:raw] = post[:body]
|
||||
mapped[:created_at] = post[:created_at]
|
||||
|
||||
topic = topics[post[:topic_id]]
|
||||
|
||||
unless topic
|
||||
p "MISSING TOPIC #{post[:topic_id]}"
|
||||
p post
|
||||
next
|
||||
end
|
||||
|
||||
unless topic[:post_id]
|
||||
mapped[:title] = post[:title] || "Topic title missing"
|
||||
topic[:post_id] = post[:id]
|
||||
mapped[:category] = post[:category]
|
||||
else
|
||||
parent = topic_lookup_from_imported_post_id(topic[:post_id])
|
||||
next unless parent
|
||||
|
||||
mapped[:topic_id] = parent[:topic_id]
|
||||
|
||||
reply_to_post_id = post_id_from_imported_post_id(post[:reply_id])
|
||||
if reply_to_post_id
|
||||
reply_to_post_number = @post_number_map[reply_to_post_id]
|
||||
if reply_to_post_number && reply_to_post_number > 1
|
||||
mapped[:reply_to_post_number] = reply_to_post_number
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
next if topic[:deleted] || post[:deleted]
|
||||
|
||||
mapped
|
||||
end
|
||||
|
||||
posts.clear
|
||||
end
|
||||
|
||||
def import_posts(category)
|
||||
puts "", "creating topics and posts"
|
||||
|
||||
topic_map = {}
|
||||
|
||||
csv_parse("topics") do |topic|
|
||||
@topic_slug[topic.id.to_i] = topic.url
|
||||
|
||||
topic_map[topic.id] = {
|
||||
id: topic.id,
|
||||
topic_id: topic.id,
|
||||
title: topic.subject,
|
||||
deleted: topic.removed == "1",
|
||||
closed: true,
|
||||
body: normalize_raw!(topic.additional_detail || topic.subject || "<missing>"),
|
||||
created_at: DateTime.parse(topic.created_at),
|
||||
user_id: topic.UserId,
|
||||
category: category.name
|
||||
}
|
||||
end
|
||||
|
||||
total = total_rows("replies")
|
||||
|
||||
posts = []
|
||||
count = 0
|
||||
|
||||
topic_map.each do |_, topic|
|
||||
# a bit lazy
|
||||
posts << topic if topic[:body]
|
||||
end
|
||||
|
||||
csv_parse("replies") do |row|
|
||||
|
||||
unless row.created_at
|
||||
puts "NO CREATION DATE FOR POST"
|
||||
p row
|
||||
next
|
||||
end
|
||||
|
||||
row = {
|
||||
id: row.id,
|
||||
topic_id: row.topic_id,
|
||||
reply_id: row.parent_id,
|
||||
user_id: row.UserId,
|
||||
body: normalize_raw!(row.content),
|
||||
created_at: DateTime.parse(row.created_at)
|
||||
}
|
||||
posts << row
|
||||
count += 1
|
||||
|
||||
if posts.length > 0 && posts.length % BATCH_SIZE == 0
|
||||
import_post_batch!(posts, topic_map, count - posts.length, total)
|
||||
end
|
||||
end
|
||||
|
||||
import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0
|
||||
end
|
||||
|
||||
def create_permalinks
|
||||
puts '', 'Creating Permalinks...', ''
|
||||
|
||||
topic_mapping = []
|
||||
|
||||
Topic.listable_topics.find_each do |topic|
|
||||
tcf = topic.first_post.custom_fields
|
||||
if tcf && tcf["import_id"]
|
||||
slug = @topic_slug[tcf["import_id"].to_i]
|
||||
# TODO: replace "http://community.example.com/" with the URL of your community
|
||||
slug = slug.gsub("http://community.example.com/", "")
|
||||
slug = @topic_slug[tcf["import_id"]]
|
||||
slug = slug.gsub(OLD_DOMAIN, "")
|
||||
Permalink.create(url: slug, topic_id: topic.id)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue