# discourse/script/import_scripts/nabble.rb

# frozen_string_literal: true

require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require "pg"
require_relative "base/uploader"

=begin
 If you want to create mock users for posts made by anonymous participants,
 run the following SQL before importing.

-- first attribute any anonymous posts to existing users (if any)

UPDATE node
SET owner_id = p.user_id, anonymous_name = NULL
FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p
WHERE p.name = lower(node.anonymous_name)
  AND owner_id IS NULL;

-- then create mock users

INSERT INTO user_ (email, name, joined, registered)
  SELECT lower(anonymous_name) || '@dummy.com', MIN(anonymous_name), MIN(when_created), MIN(when_created)
  FROM node
  WHERE anonymous_name IS NOT NULL
  GROUP BY lower(anonymous_name);

-- then move these posts to the new users
-- (yes, this is intentionally the same query as the first one)

UPDATE node
SET owner_id = p.user_id, anonymous_name = NULL
FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p
WHERE p.name = lower(node.anonymous_name)
  AND owner_id IS NULL;

=end

class ImportScripts::Nabble < ImportScripts::Base
  # CHANGE THESE BEFORE RUNNING THE IMPORTER

  BATCH_SIZE = 1000

  DB_NAME = "nabble"
  CATEGORY_ID = 6

  # Builds the importer's collaborators after the base class has initialised:
  # the upload helper, the PostgreSQL connection to the DB_NAME database and
  # the decoder used to turn text timestamps from that database into times.
  def initialize
    super

    @uploader = ImportScripts::Uploader.new
    @client = PG.connect(dbname: DB_NAME)
    @td = PG::TextDecoder::TimestampWithTimeZone.new
    # NOTE(review): @tagmap is not read anywhere in this file.
    @tagmap = []
  end

  # Runs the import phases in order: users first, then topics, then replies
  # (replies look up the topics created by the previous phase).
  def execute
    %i[import_users create_forum_topics import_replies].each { |step| send(step) }
  end

  # Imports the rows of the Nabble `user_` table as users, BATCH_SIZE rows at
  # a time.
  #
  # A batch is skipped when all_records_exist? reports that every id in it is
  # already known. Rows without an email get one from fake_email, and each
  # created user has its avatar imported through import_avatar.
  def import_users
    puts "", "importing users"

    total_count = @client.exec("SELECT COUNT(user_id) FROM user_")[0]["count"].to_i

    batches(BATCH_SIZE) do |offset|
      # `joined` alone is not a unique sort key; without a tie-breaker
      # LIMIT/OFFSET paging may repeat or skip rows that share a timestamp.
      # user_id is assumed to be unique per row.
      users = @client.exec(<<~SQL)
        SELECT user_id, name, email, joined
          FROM user_
         ORDER BY joined, user_id
         LIMIT #{BATCH_SIZE}
        OFFSET #{offset}
      SQL

      break if users.ntuples < 1

      next if all_records_exist?(:users, users.map { |u| u["user_id"].to_i })

      create_users(users, total: total_count, offset: offset) do |row|
        {
          id: row["user_id"],
          email: row["email"] || fake_email,
          created_at: Time.zone.at(@td.decode(row["joined"])),
          name: row["name"],
          post_create_action: proc { |user| import_avatar(user, row["user_id"]) },
        }
      end
    end
  end

  # Copies a user's `avatar100.png` from the Nabble `file_avatar` table into
  # an upload and assigns it as the user's avatar.
  #
  # user   - the newly created user record.
  # org_id - the user's id in the Nabble database.
  #
  # Returns nil without touching the file system when no avatar row exists.
  # A failure to persist the upload is logged, not raised.
  def import_avatar(user, org_id)
    # Bind the id as a query parameter instead of interpolating it.
    res =
      @client.exec_params(
        "SELECT content FROM file_avatar WHERE name = 'avatar100.png' AND user_id = $1 LIMIT 1",
        [org_id],
      )
    return if res.ntuples < 1

    # The scratch directory may not exist on a fresh machine.
    tmp_dir = "/tmp/nab"
    Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

    filename = "avatar#{org_id}"
    path = File.join(tmp_dir, filename)
    File.binwrite(path, PG::Connection.unescape_bytea(res[0]["content"]))

    upload = @uploader.create_upload(user.id, path, filename)

    if upload.persisted?
      # import_mode is switched off only around create_user_avatar.
      user.import_mode = false
      user.create_user_avatar
      user.import_mode = true
      user.user_avatar.update(custom_upload_id: upload.id)
      user.update(uploaded_avatar_id: upload.id)
    else
      Rails.logger.error("Could not persist avatar for user #{user.username}")
    end
  end

  # Extracts the body of a raw email message and returns it as UTF-8.
  #
  # msg - the raw email source as a String.
  #
  # Returns the body selected by Email::Receiver#select_body transcoded to
  # UTF-8, or nil when no body was selected (callers skip posts without a
  # body). Exceptions raised by Email::Receiver propagate; body_from rescues
  # Email::Receiver::EmptyEmailError.
  def parse_email(msg)
    # Only the first element of select_body's result is needed here.
    body, = Email::Receiver.new(msg).select_body
    # NOTE(review): assumes select_body can yield a nil body — confirm.
    return if body.nil?

    body.encode("UTF-8")
  end

  # Imports every node whose parent is the application node (the `node` row
  # flagged `is_app`) as a topic in category CATEGORY_ID, BATCH_SIZE rows at
  # a time.
  #
  # A batch is skipped when all_records_exist? reports that every id in it is
  # already known. Nodes for which body_from returns nothing are skipped.
  # Posts whose owner cannot be mapped to an imported user are attributed to
  # the system user.
  def create_forum_topics
    puts "", "creating forum topics"

    app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]["node_id"]
    count_sql = "SELECT COUNT(node_id) AS count FROM node WHERE parent_id = #{app_node_id}"
    topic_count = @client.exec(count_sql)[0]["count"].to_i

    batches(BATCH_SIZE) do |offset|
      # `when_created` alone is not a unique sort key; without a tie-breaker
      # LIMIT/OFFSET paging may repeat or skip nodes that share a timestamp.
      topics = @client.exec(<<~SQL)
        SELECT n.node_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
        FROM node AS n
        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
        WHERE n.parent_id = #{app_node_id}
        ORDER BY n.when_created, n.node_id
        LIMIT #{BATCH_SIZE}
        OFFSET #{offset}
      SQL

      break if topics.ntuples < 1

      next if all_records_exist?(:posts, topics.map { |t| t["node_id"].to_i })

      create_posts(topics, total: topic_count, offset: offset) do |t|
        raw = body_from(t)
        next unless raw
        raw = process_content(raw)
        raw = process_attachments(raw, t["node_id"])

        {
          id: t["node_id"],
          title: t["subject"],
          user_id: user_id_from_imported_user_id(t["owner_id"]) || Discourse::SYSTEM_USER_ID,
          created_at: Time.zone.at(@td.decode(t["when_created"])),
          category: CATEGORY_ID,
          raw: raw,
          cook_method: Post.cook_methods[:regular],
        }
      end
    end
  end

  # Returns the text of a node row.
  #
  # Rows whose `msg_fmt` is "m" or "s" hold a raw email and go through
  # parse_email; any other row's `message` is returned unchanged. An empty
  # email is reported on stdout and yields nil so the caller can skip it.
  def body_from(p)
    message = p["message"]
    return message unless %w[m s].include?(p["msg_fmt"])

    parse_email(message)
  rescue Email::Receiver::EmptyEmailError
    puts "Skipped #{p["node_id"]}"
    nil
  end

  # Rewrites Nabble markup in +txt+ in place and returns the same string:
  #
  # * <quote author="X"> ... </quote> becomes [quote="X"] ... [/quote]
  # * the content of <raw> ... </raw> is indented by four spaces and
  #   surrounded by newlines
  # * a newline directly followed by "#" is replaced by a space, so such
  #   lines (comments, not headings) are not rendered as Markdown headings
  def process_content(txt)
    simple_rewrites = [
      [/\<quote author="(.*?)"\>/, '[quote="\1"]'],
      [%r{\</quote\>}, "[/quote]"],
    ]
    simple_rewrites.each { |pattern, replacement| txt.gsub!(pattern, replacement) }

    txt.gsub!(%r{\<raw\>(.*?)\</raw\>}m) do
      indented = Regexp.last_match(1).indent(4)
      "\n#{indented}\n"
    end

    txt.gsub!(/\n#/m, " #")

    # In the languagetool forum a lot of XML was not marked as raw, so
    # <rule...>...</rule> and <category...>...</category> can be treated as
    # raw as well. Uncomment the lines below to enable this.

    #txt.gsub!(/<rule(.*?)>(.*?<\/rule>)/m) do |match|
    #   c = Regexp.last_match[2].indent(4);
    #   "\n    <rule#{Regexp.last_match[1]}>#{c}\n"
    #end
    #txt.gsub!(/<category(.*?)>(.*?<\/category>)/m) do |match|
    #   c = Regexp.last_match[2].indent(4);
    #   "\n    <category#{Regexp.last_match[1]}>#{c}\n"
    #end
    txt
  end

  # Replaces Nabble attachment tags in +txt+ (in place) with upload markup
  # and returns the same string.
  #
  # txt    - the post text.
  # postid - the Nabble node id the attachments belong to.
  #
  # <nabble_img> tags become embedded image markup and <nabble_a> links
  # become attachment markup. When get_attachment_upload yields nothing for
  # a file, the tag is replaced by an empty string.
  def process_attachments(txt, postid)
    image_tag = /<nabble_img src="(.*?)" (.*?)>/m
    link_tag = %r{<nabble_a href="(.*?)">(.*?)</nabble_a>}m

    txt.gsub!(image_tag) do
      file_name = Regexp.last_match(1)
      get_attachment_upload(file_name, postid) { |upload| @uploader.embedded_image_html(upload) }
    end

    txt.gsub!(link_tag) do
      file_name = Regexp.last_match(1)
      get_attachment_upload(file_name, postid) { |upload| @uploader.attachment_html(upload, file_name) }
    end

    txt
  end

  # Looks up an attachment of a node in the Nabble `file_node` table, turns
  # it into an upload and yields that upload to the given block.
  #
  # basename - the attachment name as it appears in the post markup.
  # postid   - the Nabble node id the attachment belongs to.
  #
  # Returns the block's result, or nil (without yielding) when no such
  # attachment exists.
  def get_attachment_upload(basename, postid)
    # basename is taken from post content, so it is bound as a parameter
    # rather than interpolated into the SQL text.
    contents =
      @client.exec_params(
        "SELECT content FROM file_node WHERE name = $1 AND node_id = $2",
        [basename, postid],
      )
    return if contents.ntuples < 1

    # The scratch directory may not exist on a fresh machine.
    tmp_dir = "/tmp/nab"
    Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

    # File.basename keeps a name containing path separators inside tmp_dir.
    fn = File.join(tmp_dir, File.basename(basename))
    File.binwrite(fn, PG::Connection.unescape_bytea(contents[0]["content"]))

    yield @uploader.create_upload(0, fn, basename)
  end

  # Imports every node whose parent is not the application node (the `node`
  # row flagged `is_app`) as a reply, BATCH_SIZE rows at a time.
  #
  # The topic of a reply is that of its parent node: taken from the nodes
  # already seen in this run, otherwise from
  # topic_lookup_from_imported_post_id. Replies whose topic cannot be
  # determined, or for which body_from returns nothing, are skipped. A batch
  # is skipped when all_records_exist? reports that every id in it is
  # already known.
  def import_replies
    puts "", "creating topic replies"

    app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]["node_id"]
    count_sql = "SELECT COUNT(node_id) AS count FROM node WHERE parent_id != #{app_node_id}"
    post_count = @client.exec(count_sql)[0]["count"].to_i

    # Nabble node id => topic id, for the nodes handled in this run.
    topic_ids = {}

    batches(BATCH_SIZE) do |offset|
      # `when_created` alone is not a unique sort key; without a tie-breaker
      # LIMIT/OFFSET paging may repeat or skip nodes that share a timestamp.
      posts = @client.exec(<<~SQL)
        SELECT n.node_id, n.parent_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
        FROM node AS n
        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
        WHERE n.parent_id != #{app_node_id}
        ORDER BY n.when_created, n.node_id
        LIMIT #{BATCH_SIZE}
        OFFSET #{offset}
      SQL

      break if posts.ntuples < 1

      next if all_records_exist?(:posts, posts.map { |p| p["node_id"].to_i })

      create_posts(posts, total: post_count, offset: offset) do |p|
        parent_id = p["parent_id"]
        id = p["node_id"]

        topic_id = topic_ids[parent_id]
        unless topic_id
          topic = topic_lookup_from_imported_post_id(parent_id)
          topic_id = topic[:topic_id] if topic
        end
        next unless topic_id

        # Recorded before the body check so that children of a skipped reply
        # can still find their topic.
        topic_ids[id] = topic_id

        raw = body_from(p)
        next unless raw
        raw = process_content(raw)
        raw = process_attachments(raw, id)
        {
          id: id,
          topic_id: topic_id,
          user_id: user_id_from_imported_user_id(p["owner_id"]) || Discourse::SYSTEM_USER_ID,
          created_at: Time.zone.at(@td.decode(p["when_created"])),
          raw: raw,
          cook_method: Post.cook_methods[:regular],
        }
      end
    end
  end
end

# Adds String#indent, used by the importer to turn <raw> sections into
# indented blocks.
# NOTE(review): ActiveSupport also ships a String#indent; this reopening
# presumably replaces it while the script runs — confirm that is acceptable.
class String
  # Returns a copy of the string in which every line, including empty ones,
  # is prefixed with +char+ repeated +count+ times. Line endings are kept and
  # nothing is appended after a trailing newline.
  def indent(count, char = " ")
    gsub(/([^\n]*)(\n|$)/) do
      text, terminator = Regexp.last_match.captures
      if text.empty? && terminator.empty?
        # Zero-width match at the very end of the string: emit nothing.
        ""
      else
        "#{char * count}#{text}#{terminator}"
      end
    end
  end
end

# Start the import as soon as this script is loaded. #perform is not defined
# in this file (presumably inherited from ImportScripts::Base).
ImportScripts::Nabble.new.perform
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-02 18:17:27 -04:00			`# frozen_string_literal: true`

Importer for nabble 2015-05-22 15:40:26 -04:00			`require File.expand_path(File.dirname(__FILE__) + "/base.rb")`
			`require "pg"`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`require_relative "base/uploader"`

			`=begin`
			`if you want to create mock users for posts made by anonymous participants,`
			`run the following SQL prior to importing.`

			`-- first attribute any anonymous posts to existing users (if any)`

			`UPDATE node`
			`SET owner_id = p.user_id, anonymous_name = NULL`
			`FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p`
			`WHERE p.name = lower(node.anonymous_name)`
			`AND owner_id IS NULL;`

			`-- then create mock users`

			`INSERT INTO user_ (email, name, joined, registered)`
			`SELECT lower(anonymous_name) \|\| '@dummy.com', MIN(anonymous_name), MIN(when_created), MIN(when_created)`
			`FROM node`
			`WHERE anonymous_name IS NOT NULL`
			`GROUP BY lower(anonymous_name);`

			`-- then move these posts to the new users`
			`-- (yes, this is the same query as the first one indeed)`

			`UPDATE node`
			`SET owner_id = p.user_id, anonymous_name = NULL`
			`FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p`
			`WHERE p.name = lower(node.anonymous_name)`
			`AND owner_id IS NULL;`

			`=end`

Importer for mbox format 2015-07-23 18:37:40 -04:00			`class ImportScripts::Nabble < ImportScripts::Base`
Importer for nabble 2015-05-22 15:40:26 -04:00			`# CHANGE THESE BEFORE RUNNING THE IMPORTER`

			`BATCH_SIZE = 1000`

			`DB_NAME = "nabble"`
			`CATEGORY_ID = 6`

			`def initialize`
			`super`

			`@tagmap = []`
			`@td = PG::TextDecoder::TimestampWithTimeZone.new`
			`@client = PG.connect(dbname: DB_NAME)`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`@uploader = ImportScripts::Uploader.new`
Importer for nabble 2015-05-22 15:40:26 -04:00			`end`

			`def execute`
			`import_users`
			`create_forum_topics`
			`import_replies`
			`end`

			`def import_users`
			`puts "", "importing users"`

			`total_count = @client.exec("SELECT COUNT(user_id) FROM user_")[0]["count"]`

			`batches(BATCH_SIZE) do \|offset\|`
			`users = @client.query(<<-SQL)`
			`SELECT user_id, name, email, joined`
			`FROM user_`
			`ORDER BY joined`
			`LIMIT #{BATCH_SIZE}`
			`OFFSET #{offset}`
			`SQL`

			`break if users.ntuples() < 1`

FEATURE: Skip batches if all records exist Update all import scripts to take advantage of all_records_exist? 2015-09-21 19:48:42 -04:00			`next if all_records_exist? :users, users.map { \|u\| u["user_id"].to_i }`

Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`create_users(users, total: total_count, offset: offset) do \|row\|`
Importer for nabble 2015-05-22 15:40:26 -04:00			`{`
Update Rubocop to 0.60 2018-12-04 04:48:16 -05:00			`id: row["user_id"],`
Use an invalid domain for fake email addresses in importers 2019-05-30 16:02:10 -04:00			`email: row["email"] \|\| fake_email,`
Update Rubocop to 0.60 2018-12-04 04:48:16 -05:00			`created_at: Time.zone.at(@td.decode(row["joined"])),`
			`name: row["name"],`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`post_create_action: proc { \|user\| import_avatar(user, row["user_id"]) },`
Importer for nabble 2015-05-22 15:40:26 -04:00			`}`
			`end`
			`end`
			`end`

Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`def import_avatar(user, org_id)`
			`filename = "avatar" + org_id.to_s`
			`path = File.join("/tmp/nab", filename)`
			`res =`
			`@client.exec(`
			`"SELECT content FROM file_avatar WHERE name='avatar100.png' AND user_id = #{org_id} LIMIT 1",`
			`)`
			`return if res.ntuples() < 1`

			`binary = res[0]["content"]`
			`File.open(path, "wb") { \|f\| f.write(PG::Connection.unescape_bytea(binary)) }`

			`upload = @uploader.create_upload(user.id, path, filename)`

			`if upload.persisted?`
			`user.import_mode = false`
			`user.create_user_avatar`
			`user.import_mode = true`
			`user.user_avatar.update(custom_upload_id: upload.id)`
			`user.update(uploaded_avatar_id: upload.id)`
			`else`
			`Rails.logger.error("Could not persist avatar for user #{user.username}")`
			`end`
			`end`

Importer for nabble 2015-05-22 15:40:26 -04:00			`def parse_email(msg)`
FIX: Update Nabble importer to use Email.Receiver new API Email.Receiver API changed in 30836573587079c5e663d7b3122957fc8c70dafe 2016-08-22 08:04:01 -04:00			`receiver = Email::Receiver.new(msg)`
Importer for nabble 2015-05-22 15:40:26 -04:00			`mail = Mail.read_from_string(msg)`
Simple "cook" for email imports from mailing lists 2015-06-05 11:46:21 -04:00			`mail.body`

FIX: Update Nabble importer to use Email.Receiver new API Email.Receiver API changed in 30836573587079c5e663d7b3122957fc8c70dafe 2016-08-22 08:04:01 -04:00			`body, elided = receiver.select_body`
			`body.force_encoding(body.encoding).encode("UTF-8")`
Importer for nabble 2015-05-22 15:40:26 -04:00			`end`

			`def create_forum_topics`
			`puts "", "creating forum topics"`

			`app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]["node_id"]`
			`topic_count =`
			`@client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id = #{app_node_id}")[0][`
			`"count"`
			`]`

			`batches(BATCH_SIZE) do \|offset\|`
			`topics = @client.exec <<-SQL`
			`SELECT n.node_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt`
			`FROM node AS n`
			`INNER JOIN node_msg AS nm ON nm.node_id = n.node_id`
			`WHERE n.parent_id = #{app_node_id}`
			`ORDER BY n.when_created`
			`LIMIT #{BATCH_SIZE}`
			`OFFSET #{offset}`
			`SQL`

			`break if topics.ntuples() < 1`

FEATURE: Skip batches if all records exist Update all import scripts to take advantage of all_records_exist? 2015-09-21 19:48:42 -04:00			`next if all_records_exist? :posts, topics.map { \|t\| t["node_id"].to_i }`

Importer for nabble 2015-05-22 15:40:26 -04:00			`create_posts(topics, total: topic_count, offset: offset) do \|t\|`
			`raw = body_from(t)`
			`next unless raw`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`raw = process_content(raw)`
			`raw = process_attachments(raw, t["node_id"])`
Importer for nabble 2015-05-22 15:40:26 -04:00
Update Rubocop to 0.60 2018-12-04 04:48:16 -05:00			`{`
			`id: t["node_id"],`
Importer for nabble 2015-05-22 15:40:26 -04:00			`title: t["subject"],`
			`user_id: user_id_from_imported_user_id(t["owner_id"]) \|\| Discourse::SYSTEM_USER_ID,`
			`created_at: Time.zone.at(@td.decode(t["when_created"])),`
			`category: CATEGORY_ID,`
Simple "cook" for email imports from mailing lists 2015-06-05 11:46:21 -04:00			`raw: raw,`
Update Rubocop to 0.60 2018-12-04 04:48:16 -05:00			`cook_method: Post.cook_methods[:regular],`
			`}`
Importer for nabble 2015-05-22 15:40:26 -04:00			`end`
			`end`
			`end`

			`def body_from(p)`
			`%w[m s].include?(p["msg_fmt"]) ? parse_email(p["message"]) : p["message"]`
			`rescue Email::Receiver::EmptyEmailError`
			`puts "Skipped #{p["node_id"]}"`
			`end`

Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`def process_content(txt)`
			`txt.gsub! /\<quote author="(.*?)"\>/, '[quote="\1"]'`
			`txt.gsub! %r{\</quote\>}, "[/quote]"`
			`txt.gsub!(%r{\<raw\>(.*?)\</raw\>}m) do \|match\|`
DEV: Enable `Style/SingleLineMethods` and `Style/Semicolon` in Rubocop (#6717) 2018-12-03 22:48:13 -05:00			`c = Regexp.last_match[1].indent(4)`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`"\n#{c}\n"`
			`end`

			`# lines starting with # are comments, not headings, insert a space to prevent markdown`
			`txt.gsub! /\n#/m, " #"`

			`# in the languagetool forum, quite a lot of XML was not marked as raw`
			`# so we treat <rule...>...</rule> and <category...>...</category> as raw`

			`# uncomment below if you want to use this`

			`#txt.gsub!(/<rule(.?)>(.?<\/rule>)/m) do \|match\|`
			`# c = Regexp.last_match[2].indent(4);`
			`# "\n <rule#{Regexp.last_match[1]}>#{c}\n"`
			`#end`
			`#txt.gsub!(/<category(.?)>(.?<\/category>)/m) do \|match\|`
			`# c = Regexp.last_match[2].indent(4);`
			`# "\n <rule#{Regexp.last_match[1]}>#{c}\n"`
			`#end`
			`txt`
			`end`

			`def process_attachments(txt, postid)`
			`txt.gsub!(/<nabble_img src="(.?)" (.?)>/m) do \|match\|`
			`basename = Regexp.last_match[1]`
FIX: Attachments can be not found 2016-08-23 13:57:48 -04:00			`get_attachment_upload(basename, postid) { \|upload\| @uploader.embedded_image_html(upload) }`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`end`

			`txt.gsub!(%r{<nabble_a href="(.?)">(.?)</nabble_a>}m) do \|match\|`
			`basename = Regexp.last_match[1]`
FIX: Attachments can be not found 2016-08-23 13:57:48 -04:00			`get_attachment_upload(basename, postid) do \|upload\|`
			`@uploader.attachment_html(upload, basename)`
			`end`
			`end`
			`txt`
			`end`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00
FIX: Attachments can be not found 2016-08-23 13:57:48 -04:00			`def get_attachment_upload(basename, postid)`
			`contents =`
			`@client.exec("SELECT content FROM file_node WHERE name='#{basename}' AND node_id = #{postid}")`
			`if contents.any?`
			`binary = contents[0]["content"]`
			`fn = File.join("/tmp/nab", basename)`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`File.open(fn, "wb") { \|f\| f.write(PG::Connection.unescape_bytea(binary)) }`
FIX: Attachments can be not found 2016-08-23 13:57:48 -04:00			`yield @uploader.create_upload(0, fn, basename)`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`end`
			`end`

Importer for nabble 2015-05-22 15:40:26 -04:00			`def import_replies`
			`puts "", "creating topic replies"`

			`app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]["node_id"]`
			`post_count =`
			`@client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id != #{app_node_id}")[0][`
			`"count"`
			`]`

			`topic_ids = {}`

			`batches(BATCH_SIZE) do \|offset\|`
			`posts = @client.exec <<-SQL`
			`SELECT n.node_id, n.parent_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt`
			`FROM node AS n`
			`INNER JOIN node_msg AS nm ON nm.node_id = n.node_id`
			`WHERE n.parent_id != #{app_node_id}`
			`ORDER BY n.when_created`
			`LIMIT #{BATCH_SIZE}`
			`OFFSET #{offset}`
			`SQL`

			`break if posts.ntuples() < 1`

FEATURE: Skip batches if all records exist Update all import scripts to take advantage of all_records_exist? 2015-09-21 19:48:42 -04:00			`next if all_records_exist? :posts, posts.map { \|p\| p["node_id"].to_i }`

Importer for nabble 2015-05-22 15:40:26 -04:00			`create_posts(posts, total: post_count, offset: offset) do \|p\|`
			`parent_id = p["parent_id"]`
			`id = p["node_id"]`

			`topic_id = topic_ids[parent_id]`
			`unless topic_id`
			`topic = topic_lookup_from_imported_post_id(parent_id)`
			`topic_id = topic[:topic_id] if topic`
			`end`
			`next unless topic_id`

			`topic_ids[id] = topic_id`

			`raw = body_from(p)`
			`next unless raw`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`raw = process_content(raw)`
			`raw = process_attachments(raw, id)`
Importer for nabble 2015-05-22 15:40:26 -04:00			`{`
			`id: id,`
			`topic_id: topic_id,`
			`user_id: user_id_from_imported_user_id(p["owner_id"]) \|\| Discourse::SYSTEM_USER_ID,`
			`created_at: Time.zone.at(@td.decode(p["when_created"])),`
Simple "cook" for email imports from mailing lists 2015-06-05 11:46:21 -04:00			`raw: raw,`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`cook_method: Post.cook_methods[:regular],`
			`}`
Importer for nabble 2015-05-22 15:40:26 -04:00			`end`
			`end`
			`end`
			`end`

Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`class String`
			`def indent(count, char = " ")`
			`gsub(/([^\n]*)(\n\|$)/) do \|match\|`
			`last_iteration = ($1 == "" && $2 == "")`
Make import scripts work with frozen strings 2019-05-30 16:20:57 -04:00			`line = +""`
Extended Nabble importer functionality 2016-01-03 15:26:12 -05:00			`line << (char * count) unless last_iteration`
			`line << $1`
			`line << $2`
			`line`
			`end`
			`end`
			`end`

Importer for mbox format 2015-07-23 18:37:40 -04:00			`ImportScripts::Nabble.new.perform`