Improve the import script for "Get Satisfaction"

* Works with the current column layout exported as an Excel file
* Tries to fix invalid CSV when it wasn't exported from Excel
* Imports categories
* Imports topics into the correct category
* Allows skipping archived topics
* Allows skipping private topics
* Makes use of the latest features from the base importer
* Some minor fixes and documentation updates
2018-03-19 13:51:01 +01:00 · 2018-03-19 13:51:01 +01:00 · a0d5e243fd
parent c5d26992d4
commit a0d5e243fd
1 changed files with 257 additions and 239 deletions
--- a/script/import_scripts/getsatisfaction.rb
+++ b/script/import_scripts/getsatisfaction.rb
@ -1,22 +1,41 @@
 # getsatisfaction importer
 #
-# pre-req: you will get a bunch of CSV files, be sure to rename them all so
+# pre-req: You will get either an Excel file or a bunch of CSV files. Be sure to rename them all so that
 #
-# - users.csv is the users table export (it may come from getsatisfaction as Users-Table 1.csv
+# - users.csv is the users table export
 # - replies.csv is the reply table export
 # - topics.csv is the topics table export
+# - categories.csv is the categories table export
+# - topics_categories.csv is the mapping between the topics and categories tables
 #
+# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use comma as column separator.
+# That's usually the case when you export Excel sheets as CSV.
+# If you get a MalformedCSVError during the import, try converting the line endings of the CSV files to the Unix format.
+# Mixed line endings in CSV files can create weird errors!
 #
-# note, the importer will import all topics into a new category called 'Old Forum' and optionally close all the topics
-#
+# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields.
+# The import script expects quotes to be escaped with "".
+
 require 'csv'
+require 'set'
 require File.expand_path(File.dirname(__FILE__) + "/base.rb")
 require 'reverse_markdown' # gem 'reverse_markdown'

 # Call it like this:
-#   RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb
+#   RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME
 class ImportScripts::GetSatisfaction < ImportScripts::Base

+  IMPORT_ARCHIVED_TOPICS = false
+
+  # The script classifies a topic as private when at least one of its associated categories
+  # in "topics_categories.csv" is unknown (not included in "categories.csv").
+  IMPORT_PRIVATE_TOPICS = false
+
+  # Should permalinks be created? Make sure you configure OLD_DOMAIN if you enable this.
+  CREATE_PERMALINKS = true
+
+  # Replace "http://community.example.com/" with the URL of your community for permalinks
+  OLD_DOMAIN = "http://community.example.com/"
  BATCH_SIZE = 1000

  def initialize(path)
@ -24,116 +43,78 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
    super()
    @bbcode_to_md = true
    @topic_slug = {}
-
-    puts "loading post mappings..."
-    @post_number_map = {}
-    Post.pluck(:id, :post_number).each do |post_id, post_number|
-      @post_number_map[post_id] = post_number
-    end
-  end
-
-  def created_post(post)
-    @post_number_map[post.id] = post.post_number
-    super
+    @topic_categories = {}
+    @skipped_topics = Set.new
  end

  def execute
-    c = Category.find_by(name: 'Old Forum') ||
-      Category.create!(name: 'Old Forum', user: Discourse.system_user)
+    # TODO Remove the call to fix_quotes_in_csv() if your replies.csv already escapes quotes by doubling them ("").
+    # That's usually the case when the file was exported from Excel.
+    fix_quotes_in_csv("replies")

    import_users
-    import_posts(c)
+    import_categories
+    import_topics
+    import_posts

-    create_permalinks
-
-    # uncomment if you want to close all the topics
-    # Topic.where(category: c).update_all(closed: true)
+    create_permalinks if CREATE_PERMALINKS
  end

-  class RowResolver
-    def load(row)
-      @row = row
+  def csv_filename(table_name, use_fixed: true)
+    if use_fixed
+      filename = File.join(@path, "#{table_name}_fixed.csv")
+      return filename if File.exists?(filename)
    end

-    def self.create(cols)
-      Class.new(RowResolver).new(cols)
+    File.join(@path, "#{table_name}.csv")
  end

-    def initialize(cols)
-      cols.each_with_index do |col, idx|
-        self.class.send(:define_method, col) do
-          @row[idx]
-        end
+  def fix_quotes_in_csv(*table_names)
+    puts "", "fixing CSV files"
+
+    table_names.each do |table_name|
+      source_filename = csv_filename(table_name, use_fixed: false)
+      target_filename = csv_filename("#{table_name}_fixed", use_fixed: false)
+
+      previous_line = nil
+
+      File.open(target_filename, "w") do |file|
+        File.open(source_filename).each_line do |line|
+          line.gsub!(/(?<![^\\]\\)\\"/, '""')
+          line.gsub!(/\\\\/, '\\')
+
+          if previous_line
+            previous_line << "\n" unless line.starts_with?(",")
+            line = "#{previous_line}#{line}"
+            previous_line = nil
          end
+
+          if line.gsub!(/,\+1\\\R$/m, ',"+1"').present?
+            previous_line = line
+          else
+            file.puts(line)
          end
        end

-  def load_user_batch!(users, offset, total)
-    if users.length > 0
-      create_users(users, offset: offset, total: total) do |user|
-        user
+        file.puts(previous_line) if previous_line
      end
-      users.clear
    end
  end

-  def csv_parse(name)
-    filename = "#{@path}/#{name}.csv"
-    first = true
-    row = nil
-
-    current_row = "";
-    double_quote_count = 0
-
-    # In case of Excel export file, I converted it to CSV and used:
-    # CSV.open(filename, encoding:'iso-8859-1:utf-8').each do |raw|
-    File.open(filename).each_line do |line|
-
-      line.strip!
-
-      current_row << "\n" unless current_row.empty?
-      current_row << line
-
-      raw = begin
-              CSV.parse(current_row, col_sep: ";")
-            rescue CSV::MalformedCSVError => e
-              puts e.message
-              puts "*" * 100
-              puts "Bad row skipped, line is: #{line}"
-              puts
-              puts current_row
-              puts
-              puts "double quote count is : #{double_quote_count}"
-              puts "*" * 100
-
-              current_row = ""
-              double_quote_count = 0
-
-              next
-            end[0]
-
-      if first
-        row = RowResolver.create(raw)
-
-        current_row = ""
-        double_quote_count = 0
-        first = false
-        next
+  def csv_parse(table_name)
+    CSV.foreach(csv_filename(table_name),
+                headers: true,
+                header_converters: :symbol,
+                skip_blanks: true,
+                encoding: 'bom|utf-8') { |row| yield row }
  end

-      row.load(raw)
-
-      yield row
-
-      current_row = ""
-      double_quote_count = 0
-    end
-  end
-
-  def total_rows(table)
-    # In case of Excel export file, I converted it to CSV and used:
-    # CSV.foreach("#{@path}/#{table}.csv", encoding:'iso-8859-1:utf-8').inject(0) {|c, line| c+1} - 1
-    File.foreach("#{@path}/#{table}.csv").inject(0) { |c, line| c + 1 } - 1
+  def total_rows(table_name)
+    CSV.foreach(csv_filename(table_name),
+                headers: true,
+                skip_blanks: true,
+                encoding: 'bom|utf-8')
+      .inject(0) { |c, _| c + 1 }
  end

  def import_users
@ -145,52 +126,44 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
    total = total_rows("users")

    csv_parse("users") do |row|
-
-      if row.suspended_at
-        puts "skipping suspended user"
-        p row
-        next
-      end
-
-      id = row.user_id
-      email = row.email
-
-      # fake it
-      if row.email.blank? || row.email !~ /@/
-        email = SecureRandom.hex << "@domain.com"
-      end
-
-      name = row.real_name
-      username = row.nick
-      created_at = DateTime.parse(row.m_created)
-
-      username = name if username == "NULL"
-      username = email.split("@")[0] if username.blank?
-      name = email.split("@")[0] if name.blank?
-
      users << {
-        id: id,
-        email: email,
-        name: name,
-        username: username,
-        created_at: created_at,
-        active: false
+        id: row[:user_id],
+        email: row[:email],
+        name: row[:realname],
+        username: row[:nickname],
+        created_at: DateTime.parse(row[:joined_date]),
+        active: true
      }

      count += 1
      if count % BATCH_SIZE == 0
-        load_user_batch! users, count - users.length, total
+        import_users_batch!(users, count - users.length, total)
+      end
    end

+    import_users_batch!(users, count - users.length, total)
  end

-    load_user_batch! users, count, total
+  def import_users_batch!(users, offset, total)
+    return if users.empty?
+
+    create_users(users, offset: offset, total: total) do |user|
+      user
+    end
+    users.clear
  end

  def import_categories
+    puts "", "creating categories"
+
    rows = []
+
    csv_parse("categories") do |row|
-      rows << { id: row.id, name: row.name, description: row.description }
+      rows << {
+        id: row[:category_id],
+        name: row[:name],
+        description: row[:description].present? ? normalize_raw!(row[:description]) : nil
+      }
    end

    create_categories(rows) do |row|
@ -198,8 +171,159 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
    end
  end

+  def import_topic_id(topic_id)
+    "T#{topic_id}"
+  end
+
+  def import_topics
+    read_topic_categories
+
+    puts "", "creating topics"
+
+    count = 0
+    topics = []
+
+    total = total_rows("topics")
+
+    csv_parse("topics") do |row|
+      topic = nil
+      topic_id = import_topic_id(row[:topic_id])
+
+      if skip_topic?(row)
+        @skipped_topics.add(topic_id)
+      else
+        topic = map_post(row)
+        topic[:id] = topic_id
+        topic[:title] = row[:subject].present? ? row[:subject].strip[0...255] : "Topic title missing"
+        topic[:category] = category_id(row)
+        topic[:archived] = row[:archived_at].present?
+
+        @topic_slug[topic[:id]] = row[:url] if CREATE_PERMALINKS
+      end
+
+      topics << topic
+      count += 1
+
+      if count % BATCH_SIZE == 0
+        import_topics_batch!(topics, count - topics.length, total)
+      end
+    end
+
+    import_topics_batch!(topics, count - topics.length, total)
+  end
+
+  def skip_topic?(row)
+    return true if row[:removed] == "1"
+    return true unless IMPORT_ARCHIVED_TOPICS || row[:archived_at].blank?
+
+    unless IMPORT_PRIVATE_TOPICS
+      categories = @topic_categories[row[:topic_id]]
+      return true if categories && categories[:has_unknown_category]
+    end
+
+    false
+  end
+
+  def category_id(row)
+    categories = @topic_categories[row[:topic_id]]
+    return categories[:category_ids].last if categories
+
+    SiteSetting.uncategorized_category_id
+  end
+
+  def read_topic_categories
+    puts "", "reading topic_categories"
+
+    count = 0
+    total = total_rows("topics_categories")
+
+    csv_parse("topics_categories") do |row|
+      topic_id = row[:topic_id]
+      category_id = category_id_from_imported_category_id(row[:category_id])
+
+      @topic_categories[topic_id] ||= { category_ids: [], has_unknown_category: false }
+
+      if category_id.nil?
+        @topic_categories[topic_id][:has_unknown_category] = true
+      else
+        @topic_categories[topic_id][:category_ids] << category_id
+      end
+
+      count += 1
+      print_status(count, total)
+    end
+  end
+
+  def import_topics_batch!(topics, offset, total)
+    return if topics.empty?
+
+    create_posts(topics, total: total, offset: offset) { |topic| topic }
+    topics.clear
+  end
+
+  def import_posts
+    puts "", "creating posts"
+
+    count = 0
+    posts = []
+
+    total = total_rows("replies")
+
+    csv_parse("replies") do |row|
+      post = nil
+
+      if row[:removed] != "1"
+        parent = topic_lookup_from_imported_post_id(row[:parent_id]) if row[:parent_id] != "NULL"
+
+        post = map_post(row)
+        post[:id] = row[:reply_id]
+        post[:topic_id] = import_topic_id(row[:topic_id])
+        post[:reply_to_post_number] = parent[:post_number] if parent
+      end
+
+      posts << post
+      count += 1
+
+      if count % BATCH_SIZE == 0
+        import_posts_batch!(posts, count - posts.length, total)
+      end
+    end
+
+    import_posts_batch!(posts, count - posts.length, total)
+  end
+
+  def import_posts_batch!(posts, offset, total)
+    return if posts.empty?
+
+    create_posts(posts, total: total, offset: offset) do |post|
+      next if post.nil? || @skipped_topics.include?(post[:topic_id])
+
+      topic = topic_lookup_from_imported_post_id(post[:topic_id])
+
+      if topic
+        post[:topic_id] = topic[:topic_id]
+      else
+        p "MISSING TOPIC #{post[:topic_id]}"
+        p post
+        next
+      end
+
+      post
+    end
+
+    posts.clear
+  end
+
+  def map_post(row)
+    {
+      user_id: user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id,
+      created_at: DateTime.parse(row[:created_at]),
+      raw: normalize_raw!(row[:formatted_content])
+    }
+  end
+
  def normalize_raw!(raw)
-    return "<missing>" if raw.nil?
+    return "<missing>" if raw.blank?
    raw = raw.dup

    # hoist code
@ -229,120 +353,14 @@ class ImportScripts::GetSatisfaction < ImportScripts::Base
    raw
  end

-  def import_post_batch!(posts, topics, offset, total)
-    create_posts(posts, total: total, offset: offset) do |post|
-
-      mapped = {}
-
-      mapped[:id] = post[:id]
-      mapped[:user_id] = user_id_from_imported_user_id(post[:user_id]) || -1
-      mapped[:raw] = post[:body]
-      mapped[:created_at] = post[:created_at]
-
-      topic = topics[post[:topic_id]]
-
-      unless topic
-        p "MISSING TOPIC #{post[:topic_id]}"
-        p post
-        next
-      end
-
-      unless topic[:post_id]
-        mapped[:title] = post[:title] || "Topic title missing"
-        topic[:post_id] = post[:id]
-        mapped[:category] = post[:category]
-      else
-        parent = topic_lookup_from_imported_post_id(topic[:post_id])
-        next unless parent
-
-        mapped[:topic_id] = parent[:topic_id]
-
-        reply_to_post_id = post_id_from_imported_post_id(post[:reply_id])
-        if reply_to_post_id
-          reply_to_post_number = @post_number_map[reply_to_post_id]
-          if reply_to_post_number && reply_to_post_number > 1
-            mapped[:reply_to_post_number] = reply_to_post_number
-          end
-        end
-      end
-
-      next if topic[:deleted] || post[:deleted]
-
-      mapped
-    end
-
-      posts.clear
-  end
-
-  def import_posts(category)
-    puts "", "creating topics and posts"
-
-    topic_map = {}
-
-    csv_parse("topics") do |topic|
-      @topic_slug[topic.id.to_i] = topic.url
-
-      topic_map[topic.id] = {
-        id: topic.id,
-        topic_id: topic.id,
-        title: topic.subject,
-        deleted: topic.removed == "1",
-        closed: true,
-        body: normalize_raw!(topic.additional_detail || topic.subject || "<missing>"),
-        created_at: DateTime.parse(topic.created_at),
-        user_id: topic.UserId,
-        category: category.name
-      }
-    end
-
-    total = total_rows("replies")
-
-    posts = []
-    count = 0
-
-    topic_map.each do |_, topic|
-      # a bit lazy
-      posts << topic if topic[:body]
-    end
-
-    csv_parse("replies") do |row|
-
-      unless row.created_at
-        puts "NO CREATION DATE FOR POST"
-        p row
-        next
-      end
-
-      row = {
-        id: row.id,
-        topic_id: row.topic_id,
-        reply_id: row.parent_id,
-        user_id: row.UserId,
-        body: normalize_raw!(row.content),
-        created_at: DateTime.parse(row.created_at)
-      }
-      posts << row
-      count += 1
-
-      if posts.length > 0 && posts.length % BATCH_SIZE == 0
-        import_post_batch!(posts, topic_map, count - posts.length, total)
-      end
-    end
-
-    import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0
-  end
-
  def create_permalinks
    puts '', 'Creating Permalinks...', ''

-    topic_mapping = []
-
    Topic.listable_topics.find_each do |topic|
      tcf = topic.first_post.custom_fields
      if tcf && tcf["import_id"]
-        slug = @topic_slug[tcf["import_id"].to_i]
-        # TODO: replace "http://community.example.com/" with the URL of your community
-        slug = slug.gsub("http://community.example.com/", "")
+        slug = @topic_slug[tcf["import_id"]]
+        slug = slug.gsub(OLD_DOMAIN, "")
        Permalink.create(url: slug, topic_id: topic.id)
      end
    end