diff --git a/script/import_scripts/getsatisfaction.rb b/script/import_scripts/getsatisfaction.rb index ffa0b0388e8..4c6af219389 100644 --- a/script/import_scripts/getsatisfaction.rb +++ b/script/import_scripts/getsatisfaction.rb @@ -1,22 +1,41 @@ # getsatisfaction importer # -# pre-req: you will get a bunch of CSV files, be sure to rename them all so +# pre-req: You will either get an Excel or a bunch of CSV files. Be sure to rename them all so that # -# - users.csv is the users table export (it may come from getsatisfaction as Users-Table 1.csv +# - users.csv is the users table export # - replies.csv is the reply table export # - topics.csv is the topics table export +# - categories.csv is the categories table export +# - topics_categories.csv is the mapping between the topics and categories table # +# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use comma as column separator. +# That's usually the case when you export Excel sheets as CSV. +# When you get MalformedCSVError during the import, try converting the line endings of the CSV into the Unix format. +# Mixed line endings in CSV files can create weird errors! # -# note, the importer will import all topics into a new category called 'Old Forum' and optionally close all the topics -# +# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields. +# The import script expects quotes to be escaped with "". + require 'csv' +require 'set' require File.expand_path(File.dirname(__FILE__) + "/base.rb") require 'reverse_markdown' # gem 'reverse_markdown' # Call it like this: -# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb +# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME class ImportScripts::GetSatisfaction < ImportScripts::Base + IMPORT_ARCHIVED_TOPICS = false + + # The script classifies each topic as private when at least one associated category + # in "topics_categories.csv" is unknown (not included i "categories.csv"). + IMPORT_PRIVATE_TOPICS = false + + # Should the creation of permalinks be skipped? Make sure you configure OLD_DOMAIN if you + CREATE_PERMALINKS = true + + # Replace "http://community.example.com/" with the URL of your community for permalinks + OLD_DOMAIN = "http://community.example.com/" BATCH_SIZE = 1000 def initialize(path) @@ -24,116 +43,78 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base super() @bbcode_to_md = true @topic_slug = {} - - puts "loading post mappings..." - @post_number_map = {} - Post.pluck(:id, :post_number).each do |post_id, post_number| - @post_number_map[post_id] = post_number - end - end - - def created_post(post) - @post_number_map[post.id] = post.post_number - super + @topic_categories = {} + @skipped_topics = Set.new end def execute - c = Category.find_by(name: 'Old Forum') || - Category.create!(name: 'Old Forum', user: Discourse.system_user) + # TODO Remove the call to fix_quotes_in_csv() if your replies.csv uses the double quotes (""). + # That's usually the case when you exported the file from Excel. + fix_quotes_in_csv("replies") import_users - import_posts(c) + import_categories + import_topics + import_posts - create_permalinks - - # uncomment if you want to close all the topics - # Topic.where(category: c).update_all(closed: true) + create_permalinks if CREATE_PERMALINKS end - class RowResolver - def load(row) - @row = row + def csv_filename(table_name, use_fixed: true) + if use_fixed + filename = File.join(@path, "#{table_name}_fixed.csv") + return filename if File.exists?(filename) end - def self.create(cols) - Class.new(RowResolver).new(cols) - end + File.join(@path, "#{table_name}.csv") + end - def initialize(cols) - cols.each_with_index do |col, idx| - self.class.send(:define_method, col) do - @row[idx] + def fix_quotes_in_csv(*table_names) + puts "", "fixing CSV files" + + table_names.each do |table_name| + source_filename = csv_filename(table_name, use_fixed: false) + target_filename = csv_filename("#{table_name}_fixed", use_fixed: false) + + previous_line = nil + + File.open(target_filename, "w") do |file| + File.open(source_filename).each_line do |line| + line.gsub!(/(? 0 - create_users(users, offset: offset, total: total) do |user| - user - end - users.clear - end + def csv_parse(table_name) + CSV.foreach(csv_filename(table_name), + headers: true, + header_converters: :symbol, + skip_blanks: true, + encoding: 'bom|utf-8') { |row| yield row } end - def csv_parse(name) - filename = "#{@path}/#{name}.csv" - first = true - row = nil - - current_row = ""; - double_quote_count = 0 - - # In case of Excel export file, I converted it to CSV and used: - # CSV.open(filename, encoding:'iso-8859-1:utf-8').each do |raw| - File.open(filename).each_line do |line| - - line.strip! - - current_row << "\n" unless current_row.empty? - current_row << line - - raw = begin - CSV.parse(current_row, col_sep: ";") - rescue CSV::MalformedCSVError => e - puts e.message - puts "*" * 100 - puts "Bad row skipped, line is: #{line}" - puts - puts current_row - puts - puts "double quote count is : #{double_quote_count}" - puts "*" * 100 - - current_row = "" - double_quote_count = 0 - - next - end[0] - - if first - row = RowResolver.create(raw) - - current_row = "" - double_quote_count = 0 - first = false - next - end - - row.load(raw) - - yield row - - current_row = "" - double_quote_count = 0 - end - end - - def total_rows(table) - # In case of Excel export file, I converted it to CSV and used: - # CSV.foreach("#{@path}/#{table}.csv", encoding:'iso-8859-1:utf-8').inject(0) {|c, line| c+1} - 1 - File.foreach("#{@path}/#{table}.csv").inject(0) { |c, line| c + 1 } - 1 + def total_rows(table_name) + CSV.foreach(csv_filename(table_name), + headers: true, + skip_blanks: true, + encoding: 'bom|utf-8') + .inject(0) { |c, _| c + 1 } end def import_users @@ -145,52 +126,44 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base total = total_rows("users") csv_parse("users") do |row| - - if row.suspended_at - puts "skipping suspended user" - p row - next - end - - id = row.user_id - email = row.email - - # fake it - if row.email.blank? || row.email !~ /@/ - email = SecureRandom.hex << "@domain.com" - end - - name = row.real_name - username = row.nick - created_at = DateTime.parse(row.m_created) - - username = name if username == "NULL" - username = email.split("@")[0] if username.blank? - name = email.split("@")[0] if name.blank? - users << { - id: id, - email: email, - name: name, - username: username, - created_at: created_at, - active: false + id: row[:user_id], + email: row[:email], + name: row[:realname], + username: row[:nickname], + created_at: DateTime.parse(row[:joined_date]), + active: true } count += 1 if count % BATCH_SIZE == 0 - load_user_batch! users, count - users.length, total + import_users_batch!(users, count - users.length, total) end - end - load_user_batch! users, count, total + import_users_batch!(users, count - users.length, total) + end + + def import_users_batch!(users, offset, total) + return if users.empty? + + create_users(users, offset: offset, total: total) do |user| + user + end + users.clear end def import_categories + puts "", "creating categories" + rows = [] + csv_parse("categories") do |row| - rows << { id: row.id, name: row.name, description: row.description } + rows << { + id: row[:category_id], + name: row[:name], + description: row[:description].present? ? normalize_raw!(row[:description]) : nil + } end create_categories(rows) do |row| @@ -198,8 +171,159 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base end end + def import_topic_id(topic_id) + "T#{topic_id}" + end + + def import_topics + read_topic_categories + + puts "", "creating topics" + + count = 0 + topics = [] + + total = total_rows("topics") + + csv_parse("topics") do |row| + topic = nil + topic_id = import_topic_id(row[:topic_id]) + + if skip_topic?(row) + @skipped_topics.add(topic_id) + else + topic = map_post(row) + topic[:id] = topic_id + topic[:title] = row[:subject].present? ? row[:subject].strip[0...255] : "Topic title missing" + topic[:category] = category_id(row) + topic[:archived] = row[:archived_at].present? + + @topic_slug[topic[:id]] = row[:url] if CREATE_PERMALINKS + end + + topics << topic + count += 1 + + if count % BATCH_SIZE == 0 + import_topics_batch!(topics, count - topics.length, total) + end + end + + import_topics_batch!(topics, count - topics.length, total) + end + + def skip_topic?(row) + return true if row[:removed] == "1" + return true unless IMPORT_ARCHIVED_TOPICS || row[:archived_at].blank? + + unless IMPORT_PRIVATE_TOPICS + categories = @topic_categories[row[:topic_id]] + return true if categories && categories[:has_unknown_category] + end + + false + end + + def category_id(row) + categories = @topic_categories[row[:topic_id]] + return categories[:category_ids].last if categories + + SiteSetting.uncategorized_category_id + end + + def read_topic_categories + puts "", "reading topic_categories" + + count = 0 + total = total_rows("topics_categories") + + csv_parse("topics_categories") do |row| + topic_id = row[:topic_id] + category_id = category_id_from_imported_category_id(row[:category_id]) + + @topic_categories[topic_id] ||= { category_ids: [], has_unknown_category: false } + + if category_id.nil? + @topic_categories[topic_id][:has_unknown_category] = true + else + @topic_categories[topic_id][:category_ids] << category_id + end + + count += 1 + print_status(count, total) + end + end + + def import_topics_batch!(topics, offset, total) + return if topics.empty? + + create_posts(topics, total: total, offset: offset) { |topic| topic } + topics.clear + end + + def import_posts + puts "", "creating posts" + + count = 0 + posts = [] + + total = total_rows("replies") + + csv_parse("replies") do |row| + post = nil + + if row[:removed] != "1" + parent = topic_lookup_from_imported_post_id(row[:parent_id]) if row[:parent_id] != "NULL" + + post = map_post(row) + post[:id] = row[:reply_id] + post[:topic_id] = import_topic_id(row[:topic_id]) + post[:reply_to_post_number] = parent[:post_number] if parent + end + + posts << post + count += 1 + + if count % BATCH_SIZE == 0 + import_posts_batch!(posts, count - posts.length, total) + end + end + + import_posts_batch!(posts, count - posts.length, total) + end + + def import_posts_batch!(posts, offset, total) + return if posts.empty? + + create_posts(posts, total: total, offset: offset) do |post| + next if post.nil? || @skipped_topics.include?(post[:topic_id]) + + topic = topic_lookup_from_imported_post_id(post[:topic_id]) + + if topic + post[:topic_id] = topic[:topic_id] + else + p "MISSING TOPIC #{post[:topic_id]}" + p post + next + end + + post + end + + posts.clear + end + + def map_post(row) + { + user_id: user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id, + created_at: DateTime.parse(row[:created_at]), + raw: normalize_raw!(row[:formatted_content]) + } + end + def normalize_raw!(raw) - return "" if raw.nil? + return "" if raw.blank? raw = raw.dup # hoist code @@ -229,120 +353,14 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base raw end - def import_post_batch!(posts, topics, offset, total) - create_posts(posts, total: total, offset: offset) do |post| - - mapped = {} - - mapped[:id] = post[:id] - mapped[:user_id] = user_id_from_imported_user_id(post[:user_id]) || -1 - mapped[:raw] = post[:body] - mapped[:created_at] = post[:created_at] - - topic = topics[post[:topic_id]] - - unless topic - p "MISSING TOPIC #{post[:topic_id]}" - p post - next - end - - unless topic[:post_id] - mapped[:title] = post[:title] || "Topic title missing" - topic[:post_id] = post[:id] - mapped[:category] = post[:category] - else - parent = topic_lookup_from_imported_post_id(topic[:post_id]) - next unless parent - - mapped[:topic_id] = parent[:topic_id] - - reply_to_post_id = post_id_from_imported_post_id(post[:reply_id]) - if reply_to_post_id - reply_to_post_number = @post_number_map[reply_to_post_id] - if reply_to_post_number && reply_to_post_number > 1 - mapped[:reply_to_post_number] = reply_to_post_number - end - end - end - - next if topic[:deleted] || post[:deleted] - - mapped - end - - posts.clear - end - - def import_posts(category) - puts "", "creating topics and posts" - - topic_map = {} - - csv_parse("topics") do |topic| - @topic_slug[topic.id.to_i] = topic.url - - topic_map[topic.id] = { - id: topic.id, - topic_id: topic.id, - title: topic.subject, - deleted: topic.removed == "1", - closed: true, - body: normalize_raw!(topic.additional_detail || topic.subject || ""), - created_at: DateTime.parse(topic.created_at), - user_id: topic.UserId, - category: category.name - } - end - - total = total_rows("replies") - - posts = [] - count = 0 - - topic_map.each do |_, topic| - # a bit lazy - posts << topic if topic[:body] - end - - csv_parse("replies") do |row| - - unless row.created_at - puts "NO CREATION DATE FOR POST" - p row - next - end - - row = { - id: row.id, - topic_id: row.topic_id, - reply_id: row.parent_id, - user_id: row.UserId, - body: normalize_raw!(row.content), - created_at: DateTime.parse(row.created_at) - } - posts << row - count += 1 - - if posts.length > 0 && posts.length % BATCH_SIZE == 0 - import_post_batch!(posts, topic_map, count - posts.length, total) - end - end - - import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0 - end - def create_permalinks puts '', 'Creating Permalinks...', '' - topic_mapping = [] - Topic.listable_topics.find_each do |topic| tcf = topic.first_post.custom_fields if tcf && tcf["import_id"] - slug = @topic_slug[tcf["import_id"].to_i] - # TODO: replace "http://community.example.com/" with the URL of your community - slug = slug.gsub("http://community.example.com/", "") + slug = @topic_slug[tcf["import_id"]] + slug = slug.gsub(OLD_DOMAIN, "") Permalink.create(url: slug, topic_id: topic.id) end end