discourse/script/import_scripts/telligent.rb

# frozen_string_literal: true

require_relative 'base'
require 'tiny_tds'

# Import script for Telligent communities
#
# It's really hard to find all attachments, but the script tries to do it anyway.

class ImportScripts::Telligent < ImportScripts::Base
  BATCH_SIZE ||= 1000
  LOCAL_AVATAR_REGEX ||= /\A~\/.*(?<directory>communityserver-components-(?:selectable)?avatars)\/(?<path>[^\/]+)\/(?<filename>.+)/i
  REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i
  EMBEDDED_ATTACHMENT_REGEX ||= /<a href="\/cfs-file(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename1>.+)">(?<filename2>.*?)<\/a>/i

  CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
  TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'

  def initialize
    super()

    @client = TinyTds::Client.new(
      host: ENV["DB_HOST"],
      username: ENV["DB_USERNAME"],
      password: ENV["DB_PASSWORD"],
      database: ENV["DB_NAME"],
      timeout: 60 # the user query is very slow
    )
  end

  def execute
    add_permalink_normalizations
    import_users
    import_categories
    import_topics
    import_posts
    mark_topics_as_solved
  end

  def import_users
    puts "", "Importing users..."

    user_conditions = <<~SQL
      (
        EXISTS(SELECT 1
               FROM te_Forum_Threads t
               WHERE t.UserId = u.UserID) OR
        EXISTS(SELECT 1
               FROM te_Forum_ThreadReplies r
               WHERE r.UserId = u.UserID)
      )
    SQL

    last_user_id = -1
    total_count = count(<<~SQL)
      SELECT COUNT(1) AS count
      FROM cs_Users u
      WHERE #{user_conditions}
    SQL
    import_count = 0

    loop do
      rows = query(<<~SQL)
        SELECT *
        FROM (
            SELECT TOP #{BATCH_SIZE}
                u.UserID,
                u.Email,
                u.UserName,
                u.CreateDate,
                p.PropertyName,
                p.PropertyValue
            FROM cs_Users u
                LEFT OUTER JOIN (
                    SELECT NULL AS UserID, ap.UserId AS MembershipID, x.PropertyName, x.PropertyValue
                    FROM aspnet_Profile ap
                             CROSS APPLY dbo.GetProperties(ap.PropertyNames, ap.PropertyValuesString) x
                    WHERE ap.PropertyNames NOT LIKE '%:-1%' AND
                        x.PropertyName IN ('bio', 'commonName', 'location', 'webAddress')
                    UNION
                    SELECT up.UserID, NULL AS MembershipID, x.PropertyName, CAST(x.PropertyValue AS NVARCHAR) AS PropertyValue
                    FROM cs_UserProfile up
                             CROSS APPLY dbo.GetProperties(up.PropertyNames, up.PropertyValues) x
                    WHERE up.PropertyNames NOT LIKE '%:-1%' AND
                        x.PropertyName IN ('avatarUrl', 'BannedUntil', 'UserBanReason')
                ) p ON p.UserID = u.UserID OR p.MembershipID = u.MembershipID
            WHERE u.UserID > #{last_user_id} AND #{user_conditions}
            ORDER BY u.UserID
        ) x
            PIVOT (
              MAX(PropertyValue)
              FOR PropertyName
              IN (bio, commonName, location, webAddress, avatarUrl, BannedUntil, UserBanReason)
            ) Y
        ORDER BY UserID
      SQL

      break if rows.blank?
      last_user_id = rows[-1]["UserID"]

      if all_records_exist?(:users, rows.map { |row| row["UserID"] })
        import_count += rows.size
        next
      end

      create_users(rows, total: total_count, offset: import_count) do |row|
        {
          id: row["UserID"],
          email: row["Email"],
          username: row["UserName"],
          name: row["commonName"],
          created_at: row["CreateDate"],
          bio_raw: html_to_markdown(row["bio"]),
          location: row["location"],
          website: row["webAddress"],
          post_create_action: proc do |user|
            import_avatar(user, row["avatarUrl"])
            suspend_user(user, row["BannedUntil"], row["UserBanReason"])
          end
        }
      end

      import_count += rows.size
    end
  end

  # TODO move into base importer (create_user) and use consistent error handling
  def import_avatar(user, avatar_url)
    return if ENV["FILE_BASE_DIR"].blank? || avatar_url.blank? || avatar_url.include?("anonymous")

    if match_data = avatar_url.match(LOCAL_AVATAR_REGEX)
      avatar_path = File.join(ENV["FILE_BASE_DIR"],
                              match_data[:directory].gsub("-", "."),
                              match_data[:path].split("-"),
                              match_data[:filename])

      if File.exists?(avatar_path)
        @uploader.create_avatar(user, avatar_path)
      else
        STDERR.puts "Could not find avatar: #{avatar_path}"
      end
    elsif avatar_url.match?(REMOTE_AVATAR_REGEX)
      UserAvatar.import_url_for_user(avatar_url, user) rescue nil
    end
  end

  def suspend_user(user, banned_until, ban_reason)
    return if banned_until.blank?

    if banned_until = DateTime.parse(banned_until) > DateTime.now
      user.suspended_till = banned_until
      user.suspended_at = DateTime.now
      user.save!

      StaffActionLogger.new(Discourse.system_user).log_user_suspend(user, ban_reason)
    end
  end

  def import_categories
    puts "", "Importing parent categories..."
    parent_categories = query(<<~SQL)
      SELECT
        GroupID,
        Name, HtmlDescription,
        DateCreated, SortOrder
      FROM cs_Groups g
      WHERE (SELECT COUNT(1)
             FROM te_Forum_Forums f
             WHERE f.GroupId = g.GroupID) > 1
      ORDER BY SortOrder, Name
    SQL

    create_categories(parent_categories) do |row|
      {
        id: "G#{row['GroupID']}",
        name: clean_category_name(row["Name"]),
        description: html_to_markdown(row["HtmlDescription"]),
        position: row["SortOrder"]
      }
    end

    puts "", "Importing child categories..."
    child_categories = query(<<~SQL)
      SELECT
        ForumId, GroupId,
        Name, Description,
        DateCreated, SortOrder
      FROM te_Forum_Forums
      ORDER BY GroupId, SortOrder, Name
    SQL

    create_categories(child_categories) do |row|
      parent_category_id = parent_category_id_for(row)

      if category_id = replace_with_category_id(child_categories, parent_category_id)
        add_category(row['ForumId'], Category.find_by_id(category_id))
        url = "f/#{row['ForumId']}"
        Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url)
        nil
      else
        {
          id: row['ForumId'],
          parent_category_id: parent_category_id,
          name: clean_category_name(row["Name"]),
          description: html_to_markdown(row["Description"]),
          position: row["SortOrder"]
        }
      end
    end
  end

  def parent_category_id_for(row)
    category_id_from_imported_category_id("G#{row['GroupId']}") if row.key?("GroupId")
  end

  def replace_with_category_id(child_categories, parent_category_id)
    parent_category_id if only_child?(child_categories, parent_category_id)
  end

  def only_child?(child_categories, parent_category_id)
    count = 0

    child_categories.each do |row|
      count += 1 if parent_category_id_for(row) == parent_category_id
    end

    count == 1
  end

  def clean_category_name(name)
    CGI.unescapeHTML(name)
      .strip
  end

  def import_topics
    puts "", "Importing topics..."

    last_topic_id = -1
    total_count = count("SELECT COUNT(1) AS count FROM te_Forum_Threads")

    batches do |offset|
      rows = query(<<~SQL)
        SELECT TOP #{BATCH_SIZE}
          t.ThreadId, t.ForumId, t.UserId,
          t.Subject, t.Body, t.DateCreated, t.IsLocked, t.StickyDate,
          a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName
        FROM te_Forum_Threads t
          LEFT JOIN te_Attachments a
            ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = t.ThreadId AND
                a.ApplicationContentTypeId = 0)
        WHERE t.ThreadId > #{last_topic_id}
        ORDER BY t.ThreadId
      SQL

      break if rows.blank?
      last_topic_id = rows[-1]["ThreadId"]
      next if all_records_exist?(:post, rows.map { |row| import_topic_id(row["ThreadId"]) })

      create_posts(rows, total: total_count, offset: offset) do |row|
        user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID

        post = {
          id: import_topic_id(row["ThreadId"]),
          title: CGI.unescapeHTML(row["Subject"]),
          raw: raw_with_attachment(row, user_id),
          category: category_id_from_imported_category_id(row["ForumId"]),
          user_id: user_id,
          created_at: row["DateCreated"],
          closed: row["IsLocked"],
          post_create_action: proc do |action_post|
            topic = action_post.topic
            Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until
            url = "f/#{row['ForumId']}/t/#{row['ThreadId']}"
            Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url)
          end
        }

        if row["StickyDate"] > Time.now
          post[:pinned_until] = row["StickyDate"]
          post[:pinned_at] = row["DateCreated"]
        end

        post
      end
    end
  end

  def import_topic_id(topic_id)
    "T#{topic_id}"
  end

  def import_posts
    puts "", "Importing posts..."

    last_post_id = -1
    total_count = count("SELECT COUNT(1) AS count FROM te_Forum_ThreadReplies")

    batches do |offset|
      rows = query(<<~SQL)
        SELECT TOP #{BATCH_SIZE}
          tr.ThreadReplyId, tr.ThreadId, tr.UserId, tr.ParentReplyId,
          tr.Body, tr.ThreadReplyDate,
          CONVERT(BIT,
                  CASE WHEN tr.AnswerVerifiedUtcDate IS NOT NULL AND NOT EXISTS(
                      SELECT 1
                      FROM te_Forum_ThreadReplies x
                      WHERE
                        x.ThreadId = tr.ThreadId AND x.ThreadReplyId < tr.ThreadReplyId AND x.AnswerVerifiedUtcDate IS NOT NULL
                  )
                    THEN 1
                  ELSE 0 END) AS IsFirstVerifiedAnswer,
          a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName
        FROM te_Forum_ThreadReplies tr
          JOIN te_Forum_Threads t ON (tr.ThreadId = t.ThreadId)
          LEFT JOIN te_Attachments a
            ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = tr.ThreadReplyId AND
                a.ApplicationContentTypeId = 1)
        WHERE tr.ThreadReplyId > #{last_post_id}
        ORDER BY tr.ThreadReplyId
      SQL

      break if rows.blank?
      last_post_id = rows[-1]["ThreadReplyId"]
      next if all_records_exist?(:post, rows.map { |row| row["ThreadReplyId"] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        imported_parent_id = row["ParentReplyId"] > 0 ? row["ParentReplyId"] : import_topic_id(row["ThreadId"])
        parent_post = topic_lookup_from_imported_post_id(imported_parent_id)
        user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID

        if parent_post
          post = {
            id: row["ThreadReplyId"],
            raw: raw_with_attachment(row, user_id),
            user_id: user_id,
            topic_id: parent_post[:topic_id],
            created_at: row["ThreadReplyDate"],
            reply_to_post_number: parent_post[:post_number]
          }

          post[:custom_fields] = { is_accepted_answer: "true" } if row["IsFirstVerifiedAnswer"]
          post
        else
          puts "Failed to import post #{row['ThreadReplyId']}. Parent was not found."
        end
      end
    end
  end

  def raw_with_attachment(row, user_id)
    raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id)
    raw = html_to_markdown(raw) || ""

    filename = row["FileName"]
    return raw if ENV["FILE_BASE_DIR"].blank? || filename.blank?

    path = File.join(
      ENV["FILE_BASE_DIR"],
      "telligent.evolution.components.attachments",
      "%02d" % row["ApplicationTypeId"],
      "%02d" % row["ApplicationId"],
      "%02d" % row["ApplicationContentTypeId"],
      ("%010d" % row["ContentId"]).scan(/.{2}/),
      clean_filename(filename)
    )

    unless embedded_paths.include?(path)
      if File.exists?(path)
        upload = @uploader.create_upload(user_id, path, filename)

        if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
          raw = "#{raw}\n#{@uploader.html_for_upload(upload, filename)}"
        end
      else
        STDERR.puts "Could not find file: #{path}"
      end
    end

    raw
  end

  def replace_embedded_attachments(raw, user_id)
    paths = []
    upload_ids = []

    raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do
      filename, path = attachment_path(Regexp.last_match)

      if File.exists?(path)
        upload = @uploader.create_upload(user_id, path, filename)

        if upload.present? && upload.persisted?
          paths << path
          upload_ids << upload.id
          @uploader.html_for_upload(upload, filename)
        end
      else
        STDERR.puts "Could not find file: #{path}"
      end
    end

    [raw, paths, upload_ids]
  end

  def clean_filename(filename)
    CGI.unescapeHTML(filename)
      .gsub(/[\x00\/\\:\*\?\"<>\|]/, '_')
      .gsub(/_(?:2B00|2E00|2D00|5B00|5D00|5F00)/, '')
  end

  def attachment_path(match_data)
    filename, path = join_attachment_path(match_data, filename_index: 2)
    filename, path = join_attachment_path(match_data, filename_index: 1) unless File.exists?(path)
    [filename, path]
  end

  # filenames are a total mess - try to guess the correct filename
  # works for 70% of all files
  def join_attachment_path(match_data, filename_index:)
    filename = clean_filename(match_data[:"filename#{filename_index}"])
    base_path = File.join(
      ENV["FILE_BASE_DIR"],
      match_data[:directory].gsub("-", "."),
      match_data[:path].split("-")
    )

    path = File.join(base_path, filename)
    return [filename, path] if File.exists?(path)

    original_filename = filename.dup

    filename = filename.gsub("-", " ")
    path = File.join(base_path, filename)
    return [filename, path] if File.exists?(path)

    filename = filename.gsub("_", "-")
    path = File.join(base_path, filename)
    return [filename, path] if File.exists?(path)

    [original_filename, File.join(base_path, original_filename)]
  end

  def mark_topics_as_solved
    puts "", "Marking topics as solved..."

    DB.exec <<~SQL
      INSERT INTO topic_custom_fields (name, value, topic_id, created_at, updated_at)
      SELECT 'accepted_answer_post_id', pcf.post_id, p.topic_id, p.created_at, p.created_at
        FROM post_custom_fields pcf
        JOIN posts p ON p.id = pcf.post_id
       WHERE pcf.name = 'is_accepted_answer' AND pcf.value = 'true'
    SQL
  end

  def html_to_markdown(html)
    HtmlToMarkdown.new(html).to_markdown if html.present?
  end

  def add_permalink_normalizations
    normalizations = SiteSetting.permalink_normalizations
    normalizations = normalizations.blank? ? [] : normalizations.split('|')

    add_normalization(normalizations, CATEGORY_LINK_NORMALIZATION)
    add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)

    SiteSetting.permalink_normalizations = normalizations.join('|')
  end

  def add_normalization(normalizations, normalization)
    normalizations << normalization unless normalizations.include?(normalization)
  end

  def batches
    super(BATCH_SIZE)
  end

  def query(sql)
    @client.execute(sql).to_a
  end

  def count(sql)
    query(sql).first["count"]
  end
end

ImportScripts::Telligent.new.perform