# frozen_string_literal: true

# Zendesk importer
#
# You will need a bunch of CSV files:
#
# - users.csv
# - topics.csv (topics in Zendesk are categories in Discourse)
# - posts.csv (posts in Zendesk are topics in Discourse)
# - comments.csv (comments in Zendesk are posts in Discourse)
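#
# Column headers are normalized by csv_parse below (header_converters: :symbol
# downcases them and strips punctuation, so e.g. "htmlUrl" arrives as :htmlurl
# and "createdAt" as :createdat). Judging from the columns referenced in
# read_csv_files, the files need at least:
#
# - users.csv: id, email, name, createdAt, lastLoginAt
# - topics.csv: id, name, description, position, htmlUrl
# - posts.csv: id, title, details, topicId, closed, authorId, createdAt, htmlUrl
# - comments.csv: id, body, postId, authorId, createdAt, htmlUrl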
require "csv"
require "reverse_markdown"
require_relative "base"
require_relative "base/generic_database"

# Call it like this:
#   RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk.rb DIRNAME
class ImportScripts::Zendesk < ImportScripts::Base
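  # OLD_DOMAIN is stripped from Zendesk URLs when permalinks are created;
  # replace it with the domain of the Zendesk instance you are importing.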
  OLD_DOMAIN = "https://support.example.com"
  BATCH_SIZE = 1000

  def initialize(path)
    super()

    @path = path
    @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
  end

  def execute
    read_csv_files

    import_categories
    import_users
    import_topics
    import_posts
  end
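  # Reads all four CSV files into an intermediate database (see
  # base/generic_database.rb) so that the rows can be fetched back in
  # batches without re-parsing the CSV files.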
  def read_csv_files
    puts "", "reading CSV files"

    csv_parse("topics") do |row|
      @db.insert_category(
        id: row[:id],
        name: row[:name],
        description: row[:description],
        position: row[:position],
        url: row[:htmlurl],
      )
    end

    csv_parse("users") do |row|
      @db.insert_user(
        id: row[:id],
        email: row[:email],
        name: row[:name],
        created_at: parse_datetime(row[:createdat]),
        last_seen_at: parse_datetime(row[:lastloginat]),
        active: true,
      )
    end

    csv_parse("posts") do |row|
      @db.insert_topic(
        id: row[:id],
        title: row[:title],
        raw: row[:details],
        category_id: row[:topicid],
        closed: row[:closed] == "TRUE",
        user_id: row[:authorid],
        created_at: parse_datetime(row[:createdat]),
        url: row[:htmlurl],
      )
    end

    csv_parse("comments") do |row|
      @db.insert_post(
        id: row[:id],
        raw: row[:body],
        topic_id: row[:postid],
        user_id: row[:authorid],
        created_at: parse_datetime(row[:createdat]),
        url: row[:htmlurl],
      )
    end

    @db.delete_unused_users
    @db.sort_posts_by_created_at
  end
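  # Timestamp columns may contain the literal string "null"; treat it as a
  # missing value instead of letting DateTime.parse raise.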
  def parse_datetime(text)
    return nil if text.blank? || text == "null"
    DateTime.parse(text)
  end
  def import_categories
    puts "", "creating categories"
    rows = @db.fetch_categories

    create_categories(rows) do |row|
      {
        id: row["id"],
        name: row["name"],
        description: row["description"],
        position: row["position"],
        post_create_action:
          proc do |category|
            url = remove_domain(row["url"])
            Permalink.create(url: url, category_id: category.id) unless permalink_exists?(url)
          end,
      }
    end
  end
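  # Fixes the page size of all the fetch loops below to the same BATCH_SIZE
  # that was used when filling the intermediate database.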
  def batches
    super(BATCH_SIZE)
  end
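  # fetch_users pages by the last seen import id rather than an OFFSET, and
  # all_records_exist? lets a re-run skip batches that are already imported.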
  def import_users
    puts "", "creating users"
    total_count = @db.count_users
    last_id = ""

    batches do |offset|
      rows, last_id = @db.fetch_users(last_id)
      break if rows.empty?

      next if all_records_exist?(:users, rows.map { |row| row["id"] })

      create_users(rows, total: total_count, offset: offset) do |row|
        {
          id: row["id"],
          email: row["email"],
          name: row["name"],
          created_at: row["created_at"],
          last_seen_at: row["last_seen_at"],
          active: row["active"] == 1,
        }
      end
    end
  end
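  # Rows from posts.csv become Discourse topics, created through create_posts.
  # Titles are truncated to 255 characters, and rows whose author cannot be
  # mapped fall back to the system user.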
  def import_topics
    puts "", "creating topics"
    total_count = @db.count_topics
    last_id = ""

    batches do |offset|
      rows, last_id = @db.fetch_topics(last_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })

      create_posts(rows, total: total_count, offset: offset) do |row|
        {
          id: import_topic_id(row["id"]),
          title: row["title"].present? ? row["title"].strip[0...255] : "Topic title missing",
          raw: normalize_raw(row["raw"]),
          category: category_id_from_imported_category_id(row["category_id"]),
          user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
          created_at: row["created_at"],
          closed: row["closed"] == 1,
          post_create_action:
            proc do |post|
              url = remove_domain(row["url"])
              Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
            end,
        }
      end
    end
  end
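  # Topics and comments share the same import id space (both go through
  # create_posts), so topic ids get a "T" prefix to avoid collisions.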
  def import_topic_id(topic_id)
    "T#{topic_id}"
  end
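  # Rows from comments.csv become Discourse posts. Each row finds its topic
  # through the "T"-prefixed import id; rows whose topic is missing are
  # logged and skipped.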
  def import_posts
    puts "", "creating posts"
    total_count = @db.count_posts
    last_row_id = 0

    batches do |offset|
      rows, last_row_id = @db.fetch_sorted_posts(last_row_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| row["id"] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        topic = topic_lookup_from_imported_post_id(import_topic_id(row["topic_id"]))

        if topic.nil?
          p "MISSING TOPIC #{row["topic_id"]}"
          p row
          next
        end

        {
          id: row["id"],
          raw: normalize_raw(row["raw"]),
          user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
          topic_id: topic[:topic_id],
          created_at: row["created_at"],
          post_create_action:
            proc do |post|
              url = remove_domain(row["url"])
              Permalink.create(url: url, post_id: post.id) unless permalink_exists?(url)
            end,
        }
      end
    end
  end
  def normalize_raw(raw)
    return "<missing>" if raw.blank?

    # Remove literal "\n" sequences (a backslash followed by "n") left over
    # from the CSV export; the single-quoted '\n' is two characters, not a
    # newline, so real line breaks are preserved.
    raw = raw.gsub('\n', "")
    raw = ReverseMarkdown.convert(raw)
    raw
  end
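  # Strips the Zendesk domain so Permalink rows store site-relative URLs.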
  def remove_domain(url)
    url.sub(OLD_DOMAIN, "")
  end
  def permalink_exists?(url)
    Permalink.find_by(url: url)
  end
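  # "bom|utf-8" tells Ruby to skip a UTF-8 byte order mark if the export tool
  # wrote one, and header_converters: :symbol normalizes headers such as
  # "htmlUrl" to :htmlurl.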
  def csv_parse(table_name)
    CSV.foreach(
      File.join(@path, "#{table_name}.csv"),
      headers: true,
      header_converters: :symbol,
      skip_blanks: true,
      encoding: "bom|utf-8",
    ) { |row| yield row }
  end
end

unless ARGV[0] && Dir.exist?(ARGV[0])
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/zendesk.rb DIRNAME", ""
  exit 1
end

ImportScripts::Zendesk.new(ARGV[0]).perform