Improve Vanilla bulk import script

2018-08-16 22:00:26 +05:30 · 2018-08-16 22:00:26 +05:30 · 0e04e3990e
parent 93201d8dbe
commit 0e04e3990e
2 changed files with 155 additions and 18 deletions
--- a/script/bulk_import/base.rb
+++ b/script/bulk_import/base.rb
@ -58,6 +58,7 @@ class BulkImport::Base
    db = ActiveRecord::Base.connection_config
    @encoder = PG::TextEncoder::CopyRow.new
    @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
+    # @raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port], password: "discourse")
    @uploader = ImportScripts::Uploader.new
    @html_entities = HTMLEntities.new
    @encoding = CHARSET_MAP[charset]
@ -580,13 +581,18 @@ class BulkImport::Base

    @raw_connection.copy_data(sql, @encoder) do
      rows.each do |row|
-        mapped = yield(row)
-        next unless mapped
-        processed = send(process_method_name, mapped)
-        imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
-        imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
-        @raw_connection.put_copy_data columns.map { |c| processed[c] }
-        print "\r%7d - %6d/sec".freeze % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
+        begin
+          mapped = yield(row)
+          next unless mapped
+          processed = send(process_method_name, mapped)
+          imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
+          imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
+          @raw_connection.put_copy_data columns.map { |c| processed[c] }
+          print "\r%7d - %6d/sec".freeze % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
+        rescue => e
+          puts "\n"
+          puts "ERROR: #{e.inspect}"
+        end
      end
    end

@ -624,6 +630,10 @@ class BulkImport::Base
    @uploader.create_upload(user_id, path, source_filename)
  end

+  # Thin wrapper around @uploader.html_for_upload; returns whatever the uploader
+  # returns for +upload+ and +display_filename+. import_attachments substitutes
+  # that return value for an "[attachment=...]" tag in a post's raw.
+  def html_for_upload(upload, display_filename)
+    @uploader.html_for_upload(upload, display_filename)
+  end
+
  def fix_name(name)
    name.scrub! if name.valid_encoding? == false
    return if name.blank?
--- a/script/bulk_import/vanilla.rb
+++ b/script/bulk_import/vanilla.rb
@ -18,7 +18,9 @@ class BulkImport::Vanilla < BulkImport::Base
    @client = Mysql2::Client.new(
      host: "localhost",
      username: "root",
-      database: VANILLA_DB
+      database: VANILLA_DB,
+      password: "",
+      reconnect: true
    )

    @import_tags = false
@ -42,6 +44,7 @@ class BulkImport::Vanilla < BulkImport::Base

    import_avatars # slow
    create_permalinks # TODO: do it bulk style
+    import_attachments # slow
  end

  def execute
@ -54,8 +57,14 @@ class BulkImport::Vanilla < BulkImport::Base
    # other good ones:

    # SiteSetting.port = 3000
+    # SiteSetting.permalink_normalizations = "/discussion\/(\d+)\/.*/discussion/\1"
    # SiteSetting.automatic_backups_enabled = false
    # SiteSetting.disable_emails = "non-staff"
+    # SiteSetting.authorized_extensions = '*'
+    # SiteSetting.max_image_size_kb = 102400
+    # SiteSetting.max_attachment_size_kb = 102400
+    # SiteSetting.clean_up_uploads = false
+    # SiteSetting.clean_orphan_uploads_grace_period_hours = 43200
    # etc.

    import_users
@ -250,6 +259,86 @@ class BulkImport::Vanilla < BulkImport::Base
    end
  end

+  # Re-hosts Vanilla attachments referenced from already-imported posts.
+  #
+  # Scans every post whose raw contains a Vanilla CDN URL or an
+  # "[attachment=<MediaID>:<name>]" tag, uploads the matching file found under
+  # ATTACHMENTS_BASE_DIR and rewrites the post:
+  #   * attachment tags are replaced by the return value of html_for_upload
+  #   * CDN URLs are replaced by the URL of the new upload
+  #
+  # A reference whose file is missing or whose upload fails is left untouched
+  # in the post (the gsub! block must return the original match for that;
+  # returning nil would erase the reference from the post).
+  #
+  # Does nothing unless ATTACHMENTS_BASE_DIR is set and exists on disk.
+  def import_attachments
+    return unless ATTACHMENTS_BASE_DIR && File.exist?(ATTACHMENTS_BASE_DIR)
+
+    puts "", "importing attachments"
+
+    start = Time.now
+    count = 0
+
+    # https://us.v-cdn.net/1234567/uploads/editor/xyz/image.jpg
+    # NOTE(review): the dots in this pattern are unescaped, so it is slightly
+    # looser than the sample URL suggests -- left as is to keep the same matches.
+    cdn_regex = /https:\/\/us.v-cdn.net\/1234567\/uploads\/(\S+\/(\w|-)+.\w+)/i
+    # [attachment=10109:Screen Shot 2012-04-01 at 3.47.35 AM.png]
+    attachment_regex = /\[attachment=(\d+):(.*?)\]/i
+
+    Post.where("raw LIKE '%/us.v-cdn.net/%' OR raw LIKE '%[attachment%'").find_each do |post|
+      count += 1
+      print "\r%7d - %6d/sec".freeze % [count, count.to_f / (Time.now - start)]
+      new_raw = post.raw.dup
+
+      new_raw.gsub!(attachment_regex) do |tag|
+        # grab the captures right away, before any other regex work resets them
+        attachment_id = $1
+        file_name = $2
+
+        # attachment_id is digits only (\d+ above), so interpolating it is safe
+        r = mysql_query("SELECT Path, Name FROM #{TABLE_PREFIX}Media WHERE MediaID = #{attachment_id};").first
+        next tag if r.nil? || r["Path"].blank?
+
+        path = r["Path"].gsub("s3://content/", "").gsub("s3://uploads/", "")
+        file_path = "#{ATTACHMENTS_BASE_DIR}/#{path}"
+
+        upload = upload_attachment_file(post, file_path, attachment_id)
+        next tag unless upload
+
+        filename = r["Name"] || file_name || File.basename(file_path)
+        html_for_upload(upload, normalize_text(filename))
+      end
+
+      new_raw.gsub!(cdn_regex) do |url|
+        # the first capture is the path below ".../uploads/"
+        relative_path = $1
+        upload = upload_attachment_file(post, "#{ATTACHMENTS_BASE_DIR}/#{relative_path}", relative_path)
+        upload ? upload.url : url
+      end
+
+      next if new_raw == post.raw
+
+      begin
+        PostRevisor.new(post).revise!(post.user, { raw: new_raw }, skip_revision: true, skip_validations: true, bypass_bump: true)
+      rescue
+        # fall back to writing the new raw directly when the revisor raises
+        puts "PostRevisor error for #{post.id}"
+        post.raw = new_raw
+        post.save(validate: false)
+      end
+    end
+  end
+
+  # Internal helper for import_attachments: uploads +file_path+ on behalf of
+  # the post's author (post.user_id, so a post without a user record does not
+  # raise here).
+  #
+  # +attachment_id+ is only used in the log messages.
+  # Returns the upload when it was created without errors; otherwise logs the
+  # reason and returns nil.
+  def upload_attachment_file(post, file_path, attachment_id)
+    unless File.exist?(file_path)
+      puts "Couldn't find file for #{attachment_id}. Skipping."
+      return
+    end
+
+    upload = create_upload(post.user_id, file_path, File.basename(file_path))
+    return upload if upload && upload.errors.empty?
+
+    puts "Error: Upload did not persist for #{post.id} #{attachment_id}!"
+    nil
+  end
+
  def find_photo_file(path, base_filename)
    base_guess = base_filename.dup
    full_guess = File.join(path, base_guess) # often an exact match exists
@ -538,16 +627,18 @@ class BulkImport::Vanilla < BulkImport::Base
        pcf = post.custom_fields
        if pcf && pcf["import_id"]
          topic = post.topic
-          id = pcf["import_id"].split('-').last
-          if post.post_number == 1
-            slug = Slug.for(topic.title) # probably matches what vanilla would do...
-            @raw_connection.put_copy_data(
-              ["discussion/#{id}/#{slug}", topic.id, nil, now, now]
-            )
-          else
-            @raw_connection.put_copy_data(
-              ["discussion/comment/#{id}", nil, post.id, now, now]
-            )
+          if topic.present?
+            id = pcf["import_id"].split('-').last
+            if post.post_number == 1
+              slug = Slug.for(topic.title) # probably matches what vanilla would do...
+              @raw_connection.put_copy_data(
+                ["discussion/#{id}/#{slug}", topic.id, nil, now, now]
+              )
+            else
+              @raw_connection.put_copy_data(
+                ["discussion/comment/#{id}", nil, post.id, now, now]
+              )
+            end
          end
        end

@ -559,10 +650,46 @@ class BulkImport::Vanilla < BulkImport::Base
  def clean_up(raw)
+    # Cleans one imported post body and returns the result of process_raw_text.
+    # Note: the gsub! below edits the caller's string in place; the later steps
+    # work on new strings.
    # post id is sometimes prefixed with "c-"
    raw.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }
+    # strip NUL characters (presumably because they cannot be stored in the
+    # target database -- TODO confirm)
+    raw = raw.delete("\u0000")
+    raw = process_raw_text(raw)

    raw
  end

+  # Normalizes imported post markup.
+  #
+  # raw - the post text (may be nil or blank).
+  #
+  # Returns "" for blank input; otherwise a new string (the argument itself is
+  # not modified) with HTML entities decoded and the rewrites below applied.
+  def process_raw_text(raw)
+    return "" if raw.blank?
+    # unescapeHTML hands back a fresh string, so the gsub! calls below never touch +raw+
+    text = CGI.unescapeHTML(raw)
+
+    # drop the ":xxxxxxxx" (8 word characters) suffix that sits right before a closing bracket
+    text.gsub!(/:(?:\w{8})\]/, ']')
+
+    # Some links look like this: <!-- m --><a class="postlink" href="http://www.onegameamonth.com">http://www.onegameamonth.com</a><!-- m -->
+    # The quantifiers are lazy on purpose: with greedy ones, two such links on
+    # the same line were collapsed into a single markdown link and the first
+    # link plus the text between them was lost.
+    text.gsub!(/<!-- \w --><a(?:.+?)href="(\S+?)"(?:.*?)>(.+?)<\/a><!-- \w -->/i, '[\2](\1)')
+
+    # phpBB shortens link text like this, which breaks our markdown processing:
+    #   [http://answers.yahoo.com/question/index ... 223AAkkPli](http://answers.yahoo.com/question/index?qid=20070920134223AAkkPli)
+    #
+    # Work around it for now:
+    text.gsub!(/\[http(s)?:\/\/(www\.)?/i, '[')
+
+    # convert list tags to ul and list=1 tags to ol
+    # list=a is not supported, so handle it like list=1
+    # list=9 and list=x have the same result as list=1 and list=a
+    text.gsub!(/\[list\](.*?)\[\/list:u\]/mi, '[ul]\1[/ul]')
+    text.gsub!(/\[list=.*?\](.*?)\[\/list:o\]/mi, '[ol]\1[/ol]')
+
+    # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
+    text.gsub!(/\[\*\](.*?)\[\/\*:m\]/mi, '[li]\1[/li]')
+
+    # [QUOTE="<username>"] -- add newline
+    text.gsub!(/(\[quote="[a-zA-Z\d]+"\])/i) { "#{$1}\n" }
+
+    # [/QUOTE] -- add newline
+    text.gsub!(/(\[\/quote\])/i) { "\n#{$1}" }
+
+    text
+  end
+
  def staff_guardian
+    # Builds a Guardian for Discourse.system_user on first use and caches it in
+    # @_staff_guardian for later calls.
    @_staff_guardian ||= Guardian.new(Discourse.system_user)
  end