# frozen_string_literal: true

# `require_dependency` only exists once ActiveSupport is loaded; the guard keeps
# this file loadable on its own (e.g. from a plain Ruby script).
require_dependency "pretty_text" if respond_to?(:require_dependency, true)

# Converts full upload URLs found in a post's raw Markdown (HTML `<a>`/`<img>`
# tags, BBCode `[img]` tags and Markdown links/images) into the short
# `upload://` form.
#
# The raw Markdown is walked twice: once cooked (to learn which upload links are
# really rendered, in document order) and once uncooked (to locate the source
# text to rewrite). The two walks are paired positionally through a queue.
class InlineUploads
  # Path of an original upload below the store root, e.g. "1X/<sha1>.png".
  ORIGINAL_PATH_REGEXP = /(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*)/

  # BBCode image tag; the capture is the URL between the tags.
  BBCODE_IMG_REGEXP = /\[img\]\s?(.+)\s?\[\/img\]/

  # Markdown link or image; the capture is the URL. The character class is
  # spelled out (`A-Z` plus `_`) instead of the accidental `A-z` range, which
  # also admitted `[`, `\`, `]`, `^` and the backtick.
  MARKDOWN_LINK_REGEXP = /!?\[[a-z0-9|]+\]\(([a-zA-Z0-9_\.\/]+)\)/

  private_constant :ORIGINAL_PATH_REGEXP, :BBCODE_IMG_REGEXP, :MARKDOWN_LINK_REGEXP

  # Returns a copy of +markdown+ in which links to known uploads are replaced by
  # their short URL. The argument itself is never mutated.
  #
  # @param markdown [String] raw post content
  # @param on_missing [#call, nil] invoked with the link when a link looks like
  #   an upload but `Upload.get_from_url` finds no record for it
  # @return [String] the rewritten Markdown
  def self.process(markdown, on_missing: nil)
    markdown = markdown.dup
    link_occurrences = cooked_link_occurrences(markdown)

    Nokogiri::HTML::fragment(markdown).traverse do |node|
      # Only leaf-like nodes are inspected so that the same upload is not
      # reported again by every ancestor that contains it.
      unless node.name == "img"
        kids = node.children
        next unless kids.count == 0 || (kids.count == 1 && kids[0].children.blank?)
      end

      matches = matched_uploads(node)
      next if matches.blank?

      matches.zip(extract_links(node)).each do |_match, link|
        # One queue entry is consumed per match, even when it ends up skipped,
        # to keep the raw and cooked walks aligned.
        seen_link, is_valid = link_occurrences.shift
        next unless link && is_valid && link.include?(seen_link)

        uri =
          begin
            URI(link)
          rescue URI::Error
            nil # Unparseable links are treated as host-less.
          end

        # With a local store, links pointing at another host are not ours.
        next if !Discourse.store.external? && uri&.host && uri.host != Discourse.current_hostname

        upload = Upload.get_from_url(link)

        if upload
          replace_link!(markdown, node, link, upload)
        else
          on_missing&.call(link)
        end
      end
    end

    markdown
  end

  # Cooks +markdown+ and collects, in document order, one `[link, valid]` pair
  # per cooked node that references an upload. `valid` is true when the node
  # carries the link in an `href`/`src` attribute.
  def self.cooked_link_occurrences(markdown)
    occurrences = []

    Nokogiri::HTML::fragment(PrettyText.cook(markdown)).traverse do |node|
      unless node.name == "img"
        kids = node.children
        next unless kids.count == 1 && kids[0].name != "img" && kids[0].children.blank?
      end

      seen_link = matched_uploads(node).first
      next unless seen_link

      actual_link = node.attributes["href"]&.value || node.attributes["src"]&.value
      occurrences << (actual_link ? [actual_link, true] : [seen_link, false])
    end

    occurrences
  end
  private_class_method :cooked_link_occurrences

  # Rewrites, in place, the first occurrence in +markdown+ of the construct that
  # +node+ represents so that it points at +upload+'s short URL.
  #
  # Replacements are passed to `sub!` as blocks so that backslashes in link text
  # or alt text are copied literally instead of being read as back-references.
  def self.replace_link!(markdown, node, link, upload)
    case node.name
    when "a"
      attachment_postfix =
        if node.attributes["class"]&.value&.split(" ")&.include?("attachment")
          "|attachment"
        else
          ""
        end

      text = node.children.text.strip.gsub("\n", "").gsub(/ +/, " ")
      markdown.sub!(node.to_s) { "[#{text}#{attachment_postfix}](#{upload.short_url})" }
    when "img"
      text = node.attributes["alt"]&.value
      width = node.attributes["width"]&.value
      height = node.attributes["height"]&.value
      text = "#{text}|#{width}x#{height}" if width && height
      markdown.sub!(node.to_s) { "![#{text}](#{upload.short_url})" }
    else
      # The link is matched literally: URLs contain regexp metacharacters, and
      # only the construct that points at this very link may be rewritten.
      escaped_link = Regexp.escape(link)

      if (bbcode = markdown[/\[img\]\s?#{escaped_link}\s?\[\/img\]/])
        markdown.sub!(bbcode) { "![](#{upload.short_url})" }
      elsif (found = markdown.match(/(!?\[([a-z0-9|]+)\]\(#{escaped_link}\))/))
        markdown.sub!(found[0]) { "![#{found[2]}](#{upload.short_url})" }
      end
    end
  end
  private_class_method :replace_link!

  # Returns every upload URL (short or full) found in the serialized +node+,
  # grouped by pattern and, within a pattern, in order of appearance.
  def self.matched_uploads(node)
    regexps = [
      /(upload:\/\/([a-zA-Z0-9]+)[a-z0-9\.]*)/,
      /(\/uploads\/short-url\/([a-zA-Z0-9]+)[a-z0-9\.]*)/,
    ]

    # Site settings and the db name are data, not patterns: escape them.
    db = Regexp.escape(RailsMultisite::ConnectionManagement.current_db.to_s)

    if Discourse.store.external?
      s3_base_url = Regexp.escape(SiteSetting.Upload.s3_base_url.to_s)
      s3_cdn_url = Regexp.escape(SiteSetting.Upload.s3_cdn_url.to_s)

      if Rails.configuration.multisite
        regexps << /(#{s3_base_url}\/uploads\/#{db}\/original\/#{ORIGINAL_PATH_REGEXP})/
        regexps << /(#{s3_cdn_url}\/uploads\/#{db}\/original\/#{ORIGINAL_PATH_REGEXP})/
      else
        regexps << /(#{s3_base_url}\/original\/#{ORIGINAL_PATH_REGEXP})/
        regexps << /(#{s3_cdn_url}\/original\/#{ORIGINAL_PATH_REGEXP})/
        regexps << /(\/uploads\/#{db}\/original\/#{ORIGINAL_PATH_REGEXP})/
      end
    else
      regexps << /(\/uploads\/#{db}\/original\/#{ORIGINAL_PATH_REGEXP})/
    end

    html = node.to_s
    regexps.flat_map { |regexp| html.scan(regexp).map(&:first) }
  end
  private_class_method :matched_uploads

  # Returns the candidate links of +node+: its `href`, its `src`, then the URLs
  # of BBCode images and of Markdown links/images found in its serialized form.
  def self.extract_links(node)
    html = node.to_s

    [
      node.attributes["href"]&.value,
      node.attributes["src"]&.value,
      html.scan(BBCODE_IMG_REGEXP),
      html.scan(MARKDOWN_LINK_REGEXP),
    ].flatten.compact
  end
  private_class_method :extract_links
end
end + +desc "Coverts full upload URLs in `Post#raw` to short upload url" +task 'posts:inline_uploads' => :environment do |_, args| + dry_run = ENV["DRY_RUN"] || true + + scope = Post.joins(:post_uploads) + .distinct("posts.id") + .where("raw LIKE '%class=\"attachment%' OR raw LIKE '%In Code Block`" + + expect(InlineUploads.process(md)).to eq(md) + + md = " In Code Block" + + expect(InlineUploads.process(md)).to eq(md) + end + + it "should correct bbcode img URLs to the short version" do + md = <<~MD + [img]#{upload.url}[/img] + + [img] + #{upload2.url} + [/img] + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + ![](#{upload.short_url}) + + ![](#{upload2.short_url}) + MD + end + + it "should correct image URLs to the short version" do + md = <<~MD + ![image|690x290](#{upload.short_url}) + + ![image](#{upload.url}) + ![image|100x100](#{upload.url}) + + some image + some imagesome image + + + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + ![image|690x290](#{upload.short_url}) + + ![image](#{upload.short_url}) + ![image|100x100](#{upload.short_url}) + + ![some image](#{upload.short_url}) + ![some image](#{upload.short_url})![some image](#{upload.short_url}) + + ![|5x4](#{upload.short_url}) + MD + end + + it "should correct attachment URLS with an upload before" do + md = <<~MD + ![image](#{upload.short_url}) + + test2 + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + ![image](#{upload.short_url}) + + [test2|attachment](#{upload2.short_url}) + MD + end + + it "should correct attachment URLs to the short version" do + md = <<~MD + + this + is + some + attachment + + + + - test2 + - test2 + - test2 + + test3 + test3test3 + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + [this is some attachment|attachment](#{upload.short_url}) + + - [test2|attachment](#{upload2.short_url}) + - [test2|attachment](#{upload2.short_url}) + - [test2|attachment](#{upload2.short_url}) + + [test3|attachment](#{upload3.short_url}) + 
[test3|attachment](#{upload3.short_url})[test3|attachment](#{upload3.short_url}) + MD + end + + it 'should correct full upload url to the shorter version' do + md = <<~MD + Some random text + + ![test](#{upload.short_url}) + [test|attachment](#{upload.short_url}) + + + test + + + `In Code Block` + + In Code Block + + newtest + newtest + + test + test + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + Some random text + + ![test](#{upload.short_url}) + [test|attachment](#{upload.short_url}) + + [test|attachment](#{upload.short_url}) + + `In Code Block` + + In Code Block + + [newtest](#{upload.short_url}) + [newtest](#{upload.short_url}) + + test + test + MD + end + + it 'accepts a block that yields when link does not match an upload in the db' do + url = "#{Discourse.base_url}#{upload.url}" + + md = <<~MD + some image + some image + MD + + upload.destroy! + + InlineUploads.process(md, on_missing: lambda { |link| + expect(link).to eq(url) + }) + end + end + + describe "s3 uploads" do + let(:upload) { Fabricate(:upload_s3) } + + before do + SiteSetting.enable_s3_uploads = true + SiteSetting.s3_upload_bucket = "s3-upload-bucket" + SiteSetting.s3_access_key_id = "some key" + SiteSetting.s3_secret_access_key = "some secret key" + SiteSetting.s3_cdn_url = "https://s3.cdn.com" + end + + it "should correct image URLs to the short version" do + md = <<~MD + some image + some image + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + ![some image](#{upload.short_url}) + ![some image](#{upload.short_url}) + MD + end + + it "should correct image URLs in multisite" do + begin + Rails.configuration.multisite = true + + md = <<~MD + some image + some image + MD + + expect(InlineUploads.process(md)).to eq(<<~MD) + ![some image](#{upload.short_url}) + ![some image](#{upload.short_url}) + MD + ensure + Rails.configuration.multisite = false + end + end + end + end +end