Adds import script for Zendesk

It also adds a generic SQLite database that can be used when the data needs some transformation before the actual import.
This commit is contained in:
Gerhard Schlager 2018-05-22 21:42:49 +02:00
parent eceeef8413
commit 2f0e230dba
2 changed files with 468 additions and 0 deletions

View File

@ -0,0 +1,220 @@
require 'sqlite3'
module ImportScripts
  # A generic SQLite database used as a staging area by import scripts.
  #
  # Raw data (categories, users, topics, posts) is inserted first, can be
  # transformed or cleaned up with plain SQL (see #execute_sql), and is then
  # fetched in batches of +batch_size+ rows for the actual import.
  class GenericDatabase
    # @param directory [String] directory in which the index.db file is created
    # @param batch_size [Integer] number of rows returned by each fetch_* call
    # @param recreate [Boolean] delete any existing database file when true
    def initialize(directory, batch_size:, recreate: false)
      filename = "#{directory}/index.db"
      # File.exists? was deprecated and removed in Ruby 3.2 -- use File.exist?
      File.delete(filename) if recreate && File.exist?(filename)

      # results_as_hash makes every fetched row a Hash keyed by column name.
      @db = SQLite3::Database.new(filename, results_as_hash: true)
      @batch_size = batch_size

      configure_database
      create_category_table
      create_user_table
      create_topic_table
      create_post_table
    end

    # Inserts or replaces a category. +category+ is a Hash whose keys match
    # the named placeholders below; values are normalized by #prepare.
    def insert_category(category)
      @db.execute(<<-SQL, prepare(category))
        INSERT OR REPLACE INTO category (id, name, description, position, url)
        VALUES (:id, :name, :description, :position, :url)
      SQL
    end

    # Inserts or replaces a user. Missing keys are stored as NULL.
    def insert_user(user)
      @db.execute(<<-SQL, prepare(user))
        INSERT OR REPLACE INTO user (id, email, username, name, created_at, last_seen_at, active)
        VALUES (:id, :email, :username, :name, :created_at, :last_seen_at, :active)
      SQL
    end

    # Inserts or replaces a topic.
    def insert_topic(topic)
      @db.execute(<<-SQL, prepare(topic))
        INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url)
        VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url)
      SQL
    end

    # Inserts or replaces a post.
    def insert_post(post)
      @db.execute(<<-SQL, prepare(post))
        INSERT OR REPLACE INTO post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url)
        VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url)
      SQL
    end

    # Rebuilds the post_order table so that its implicit ROWID reflects the
    # chronological order of posts. #fetch_posts then pages through posts in
    # that order via the ROWID.
    def sort_posts_by_created_at
      @db.execute 'DELETE FROM post_order'

      @db.execute <<-SQL
        INSERT INTO post_order (id)
        SELECT id
        FROM post
        ORDER BY created_at, topic_id, id
      SQL
    end

    # Returns all categories ordered by position and name.
    def fetch_categories
      @db.execute(<<-SQL)
        SELECT *
        FROM category
        ORDER BY position, name
      SQL
    end

    # Total number of users in the staging database.
    def count_users
      @db.get_first_value(<<-SQL)
        SELECT COUNT(*)
        FROM user
      SQL
    end

    # Fetches the next batch of users after +last_id+.
    # @return [Array(Array<Hash>, Object)] the rows and the last id of the
    #   batch (nil when the batch is empty), to be passed to the next call.
    def fetch_users(last_id)
      rows = @db.execute(<<-SQL, last_id)
        SELECT *
        FROM user
        WHERE id > :last_id
        ORDER BY id
        LIMIT #{@batch_size}
      SQL

      add_last_column_value(rows, 'id')
    end

    # Total number of topics in the staging database.
    def count_topics
      @db.get_first_value(<<-SQL)
        SELECT COUNT(*)
        FROM topic
      SQL
    end

    # Fetches the next batch of topics after +last_id+. See #fetch_users for
    # the return value shape.
    def fetch_topics(last_id)
      rows = @db.execute(<<-SQL, last_id)
        SELECT *
        FROM topic
        WHERE id > :last_id
        ORDER BY id
        LIMIT #{@batch_size}
      SQL

      add_last_column_value(rows, 'id')
    end

    # Total number of posts in the staging database.
    def count_posts
      @db.get_first_value(<<-SQL)
        SELECT COUNT(*)
        FROM post
      SQL
    end

    # Fetches the next batch of posts in chronological order (as defined by
    # #sort_posts_by_created_at), keyed by the post_order ROWID.
    # @param last_row_id [Integer] ROWID of the last post of the previous batch
    # @return [Array(Array<Hash>, Object)] the rows and the last ROWID
    def fetch_posts(last_row_id)
      rows = @db.execute(<<-SQL, last_row_id)
        SELECT o.ROWID, p.*
        FROM post p
          JOIN post_order o USING (id)
        WHERE o.ROWID > :last_row_id
        ORDER BY o.ROWID
        LIMIT #{@batch_size}
      SQL

      add_last_column_value(rows, 'rowid')
    end

    # Executes arbitrary SQL against the staging database. Useful for
    # transforming or deleting rows before the import.
    def execute_sql(sql)
      @db.execute(sql)
    end

    private

    # The database is a throwaway scratch file, so durability is traded for
    # speed: no journal and an exclusive lock.
    def configure_database
      @db.execute 'PRAGMA journal_mode = OFF'
      @db.execute 'PRAGMA locking_mode = EXCLUSIVE'
    end

    def create_category_table
      @db.execute <<-SQL
        CREATE TABLE IF NOT EXISTS category (
          id TEXT NOT NULL PRIMARY KEY,
          name TEXT NOT NULL,
          description TEXT,
          position INTEGER,
          url TEXT
        )
      SQL
    end

    def create_user_table
      @db.execute <<-SQL
        CREATE TABLE IF NOT EXISTS user (
          id TEXT NOT NULL PRIMARY KEY,
          email TEXT,
          username TEXT,
          name TEXT,
          created_at DATETIME,
          last_seen_at DATETIME,
          active BOOLEAN NOT NULL DEFAULT true
        )
      SQL
    end

    def create_topic_table
      @db.execute <<-SQL
        CREATE TABLE IF NOT EXISTS topic (
          id TEXT NOT NULL PRIMARY KEY,
          title TEXT,
          raw TEXT,
          category_id TEXT NOT NULL,
          closed BOOLEAN NOT NULL DEFAULT false,
          user_id TEXT NOT NULL,
          created_at DATETIME,
          url TEXT
        )
      SQL

      @db.execute 'CREATE INDEX IF NOT EXISTS topic_by_user_id ON topic (user_id)'
    end

    def create_post_table
      @db.execute <<-SQL
        CREATE TABLE IF NOT EXISTS post (
          id TEXT NOT NULL PRIMARY KEY,
          raw TEXT,
          topic_id TEXT NOT NULL,
          user_id TEXT NOT NULL,
          created_at DATETIME,
          reply_to_post_id TEXT,
          url TEXT
        )
      SQL

      @db.execute 'CREATE INDEX IF NOT EXISTS post_by_user_id ON post (user_id)'

      # Auxiliary table whose ROWID encodes the chronological order of posts.
      @db.execute <<-SQL
        CREATE TABLE IF NOT EXISTS post_order (
          id TEXT NOT NULL PRIMARY KEY
        )
      SQL
    end

    # Normalizes a Hash for SQLite binding, mutating it in place:
    # booleans become 0/1 and Date/DateTime values become strings.
    def prepare(hash)
      hash.each do |key, value|
        if value.is_a?(TrueClass) || value.is_a?(FalseClass)
          hash[key] = value ? 1 : 0
        elsif value.is_a?(Date) # DateTime is a subclass of Date
          hash[key] = value.to_s
        end
      end
    end

    # Returns [rows, last_value, ...] where each trailing element is the value
    # of the corresponding column in the last row (nil when rows is empty).
    # This lets callers destructure: rows, last_id = fetch_users(last_id)
    def add_last_column_value(rows, *last_columns)
      return rows if last_columns.empty?

      result = [rows]
      last_row = rows.last
      last_columns.each { |column| result.push(last_row ? last_row[column] : nil) }
      result
    end
  end
end

View File

@ -0,0 +1,248 @@
# Zendesk importer
#
# You will need a bunch of CSV files:
#
# - users.csv
# - topics.csv (topics in Zendesk are categories in Discourse)
# - posts.csv (posts in Zendesk are topics in Discourse)
# - comments.csv (comments in Zendesk are posts in Discourse)
require 'csv'
require 'reverse_markdown'
require_relative 'base'
require_relative 'base/generic_database'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk.rb DIRNAME
# Imports a Zendesk community export into Discourse.
#
# Zendesk terminology maps to Discourse as follows:
#   topics.csv   -> categories
#   posts.csv    -> topics
#   comments.csv -> posts
class ImportScripts::Zendesk < ImportScripts::Base
  # Domain of the old Zendesk forum; stripped from URLs to build permalinks.
  OLD_DOMAIN = "https://support.example.com"
  BATCH_SIZE = 1000

  def initialize(path)
    super()
    @path = path
    # recreate: true -- the staging DB is rebuilt from the CSV files each run.
    @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
  end

  def execute
    read_csv_files

    import_categories
    import_users
    import_topics
    import_posts
  end

  # Loads all CSV files into the staging database and prunes users that
  # created neither a topic nor a post.
  def read_csv_files
    puts "", "reading CSV files"

    csv_parse("topics") do |row|
      @db.insert_category(
        id: row[:id],
        name: row[:name],
        description: row[:description],
        position: row[:position],
        url: row[:htmlurl]
      )
    end

    csv_parse("users") do |row|
      @db.insert_user(
        id: row[:id],
        email: row[:email],
        name: row[:name],
        created_at: parse_datetime(row[:createdat]),
        last_seen_at: parse_datetime(row[:lastloginat]),
        active: true
      )
    end

    csv_parse("posts") do |row|
      @db.insert_topic(
        id: row[:id],
        title: row[:title],
        raw: row[:details],
        category_id: row[:topicid],
        closed: row[:closed] == "TRUE",
        user_id: row[:authorid],
        created_at: parse_datetime(row[:createdat]),
        url: row[:htmlurl]
      )
    end

    csv_parse("comments") do |row|
      @db.insert_post(
        id: row[:id],
        raw: row[:body],
        topic_id: row[:postid],
        user_id: row[:authorid],
        created_at: parse_datetime(row[:createdat]),
        url: row[:htmlurl]
      )
    end

    # Skip users without any content -- there is no point in importing them.
    @db.execute_sql(<<~SQL)
      DELETE FROM user
      WHERE NOT EXISTS(
          SELECT 1
          FROM topic
          WHERE topic.user_id = user.id
      ) AND NOT EXISTS(
          SELECT 1
          FROM post
          WHERE post.user_id = user.id
      )
    SQL

    @db.sort_posts_by_created_at
  end

  # Parses a timestamp from the CSV export. Zendesk exports use the literal
  # string "null" for missing values.
  def parse_datetime(text)
    return nil if text.blank? || text == "null"
    DateTime.parse(text)
  end

  def import_categories
    puts "", "creating categories"
    rows = @db.fetch_categories

    create_categories(rows) do |row|
      {
        id: row['id'],
        name: row['name'],
        description: row['description'],
        position: row['position'],
        post_create_action: proc do |category|
          url = remove_domain(row['url'])
          Permalink.create(url: url, category_id: category.id) unless permalink_exists?(url)
        end
      }
    end
  end

  def batches
    super(BATCH_SIZE)
  end

  def import_users
    puts "", "creating users"
    total_count = @db.count_users
    last_id = ''

    batches do |offset|
      rows, last_id = @db.fetch_users(last_id)
      break if rows.empty?

      next if all_records_exist?(:users, rows.map { |row| row['id'] })

      create_users(rows, total: total_count, offset: offset) do |row|
        {
          id: row['id'],
          email: row['email'],
          name: row['name'],
          created_at: row['created_at'],
          last_seen_at: row['last_seen_at'],
          active: row['active'] == 1
        }
      end
    end
  end

  def import_topics
    puts "", "creating topics"
    total_count = @db.count_topics
    last_id = ''

    batches do |offset|
      rows, last_id = @db.fetch_topics(last_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row['id']) })

      create_posts(rows, total: total_count, offset: offset) do |row|
        {
          id: import_topic_id(row['id']),
          title: row['title'].present? ? row['title'].strip[0...255] : "Topic title missing",
          raw: normalize_raw(row['raw']),
          category: category_id_from_imported_category_id(row['category_id']),
          user_id: user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id,
          created_at: row['created_at'],
          closed: row['closed'] == 1,
          post_create_action: proc do |post|
            url = remove_domain(row['url'])
            Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
          end
        }
      end
    end
  end

  # Prefixes Zendesk post ids with "T" so they cannot collide with comment
  # ids in Discourse's shared post custom-field namespace.
  def import_topic_id(topic_id)
    "T#{topic_id}"
  end

  def import_posts
    puts "", "creating posts"
    total_count = @db.count_posts
    last_row_id = 0

    batches do |offset|
      rows, last_row_id = @db.fetch_posts(last_row_id)
      break if rows.empty?

      # NOTE(review): this duplicate check uses the unprefixed row['id'] while
      # the record below is created with import_topic_id(row['id']) -- these
      # will never match, so the fast-path skip is ineffective. Confirm which
      # id scheme comments are meant to use before changing either side.
      next if all_records_exist?(:posts, rows.map { |row| row['id'] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        topic = topic_lookup_from_imported_post_id(import_topic_id(row['topic_id']))

        if topic.nil?
          p "MISSING TOPIC #{row['topic_id']}"
          p row
          next
        end

        {
          id: import_topic_id(row['id']),
          raw: normalize_raw(row['raw']),
          user_id: user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id,
          topic_id: topic[:topic_id],
          created_at: row['created_at'],
          post_create_action: proc do |post|
            url = remove_domain(row['url'])
            Permalink.create(url: url, post_id: post.id) unless permalink_exists?(url)
          end
        }
      end
    end
  end

  # Converts exported HTML into Markdown. Guards against NULL bodies
  # (empty comments in the CSV) which previously raised NoMethodError.
  def normalize_raw(raw)
    return "" if raw.nil?
    # The export contains literal backslash-n sequences, not newlines.
    raw = raw.gsub('\n', '')
    ReverseMarkdown.convert(raw)
  end

  def remove_domain(url)
    url.sub(OLD_DOMAIN, "")
  end

  # Truthy when a permalink for +url+ already exists (returns the record).
  def permalink_exists?(url)
    Permalink.find_by(url: url)
  end

  # Yields each row of "#{table_name}.csv" with symbolized, normalized
  # headers. The 'bom|utf-8' encoding strips a UTF-8 BOM if present.
  def csv_parse(table_name)
    CSV.foreach(File.join(@path, "#{table_name}.csv"),
                headers: true,
                header_converters: :symbol,
                skip_blanks: true,
                encoding: 'bom|utf-8') { |row| yield row }
  end
end
# Entry point: the single argument must be an existing directory containing
# the exported CSV files.
directory = ARGV[0]

if directory.nil? || !Dir.exist?(directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/zendesk.rb DIRNAME", ""
  exit 1
end

ImportScripts::Zendesk.new(directory).perform