DEV: Class that converts MD with old attachment links to new MD.

This commit is contained in:
Guo Xiang Tan 2019-05-30 14:38:46 +08:00 committed by Guo Xiang Tan
parent 22abad4151
commit d93e5fb00d
6 changed files with 446 additions and 5 deletions

View File

@ -151,6 +151,7 @@ group :development do
gem 'bullet', require: !!ENV['BULLET'] gem 'bullet', require: !!ENV['BULLET']
gem 'better_errors' gem 'better_errors'
gem 'binding_of_caller' gem 'binding_of_caller'
gem 'diffy'
# waiting on 2.7.5 per: https://github.com/ctran/annotate_models/pull/595 # waiting on 2.7.5 per: https://github.com/ctran/annotate_models/pull/595
if rails_master? if rails_master?

View File

@ -90,6 +90,7 @@ GEM
crass (1.0.4) crass (1.0.4)
debug_inspector (0.0.3) debug_inspector (0.0.3)
diff-lcs (1.3) diff-lcs (1.3)
diffy (3.3.0)
discourse-ember-source (3.8.0.1) discourse-ember-source (3.8.0.1)
discourse_image_optim (0.26.2) discourse_image_optim (0.26.2)
exifr (~> 1.2, >= 1.2.2) exifr (~> 1.2, >= 1.2.2)
@ -435,6 +436,7 @@ DEPENDENCIES
certified certified
colored2 colored2
cppjieba_rb cppjieba_rb
diffy
discourse-ember-source (~> 3.8.0) discourse-ember-source (~> 3.8.0)
discourse_image_optim discourse_image_optim
email_reply_trimmer (~> 0.1) email_reply_trimmer (~> 0.1)

View File

@ -0,0 +1,151 @@
# frozen_string_literal: true
require_dependency "pretty_text"
# Rewrites upload references found in raw post Markdown -- HTML `<a>`/`<img>`
# tags, BBCode `[img]` tags and Markdown links/images that point at an upload
# URL -- into Markdown that uses the upload's `short_url`.
class InlineUploads
  # Portion of an upload URL that follows `original/`, shared by every
  # "original upload" pattern built in `matched_uploads`.
  UPLOAD_PATH_REGEXP = /\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*/
  private_constant :UPLOAD_PATH_REGEXP

  # Returns a copy of +markdown+ in which every recognised upload reference
  # that resolves to an Upload record has been replaced by its short-URL form.
  #
  # Two passes are made. The cooked HTML is traversed first to record, in
  # order, each node that carries an upload match; a match that is not backed
  # by an `href`/`src` attribute is recorded as invalid. The raw Markdown
  # (parsed as an HTML fragment) is then traversed and the recorded entries
  # are consumed in the same order to decide which raw links to rewrite.
  #
  # @param markdown [String] raw post Markdown; it is not mutated
  # @param on_missing [#call, nil] called with the link when
  #   `Upload.get_from_url` returns nothing for it
  # @return [String] the rewritten Markdown
  def self.process(markdown, on_missing: nil)
    markdown = markdown.dup
    cooked_fragment = Nokogiri::HTML::fragment(PrettyText.cook(markdown))
    link_occurrences = []

    cooked_fragment.traverse do |node|
      children = node.children
      wraps_single_leaf =
        children.count == 1 && children[0].name != "img" && children[0].children.blank?
      next unless node.name == "img" || wraps_single_leaf

      seen_link = matched_uploads(node).first
      next unless seen_link

      actual_link = node.attributes["href"]&.value || node.attributes["src"]&.value
      link_occurrences << (actual_link ? [actual_link, true] : [seen_link, false])
    end

    raw_fragment = Nokogiri::HTML::fragment(markdown)

    raw_fragment.traverse do |node|
      children = node.children
      leaf_like =
        children.count == 0 || (children.count == 1 && children[0].children.blank?)
      next unless node.name == "img" || leaf_like

      matches = matched_uploads(node)
      next if matches.blank?

      matches.zip(extract_links(node)).each do |_match, link|
        # Always consume one recorded occurrence per match so both passes stay
        # aligned, even when this particular link ends up being skipped.
        seen_link, is_valid = link_occurrences.shift
        next unless link && is_valid
        next unless link.include?(seen_link)

        uri =
          begin
            URI(link)
          rescue URI::Error
            nil
          end

        # With a local store, links naming another host are left untouched.
        if !Discourse.store.external? && uri&.host && uri.host != Discourse.current_hostname
          next
        end

        upload = Upload.get_from_url(link)

        if upload
          inline_upload!(markdown, node, link, upload)
        elsif on_missing
          on_missing.call(link)
        end
      end
    end

    markdown
  end

  # Replaces, in +markdown+, the first textual occurrence of the reference
  # represented by +node+ / +link+ with Markdown using `upload.short_url`.
  def self.inline_upload!(markdown, node, link, upload)
    case node.name
    when "a"
      css_classes = node.attributes["class"]&.value&.split(" ")
      attachment_postfix = css_classes&.include?("attachment") ? "|attachment" : ""
      text = node.children.text.strip.gsub("\n", "").gsub(/ +/, " ")
      replace_first!(markdown, node.to_s, "[#{text}#{attachment_postfix}](#{upload.short_url})")
    when "img"
      text = node.attributes["alt"]&.value
      width = node.attributes["width"]&.value
      height = node.attributes["height"]&.value
      text = "#{text}|#{width}x#{height}" if width && height
      replace_first!(markdown, node.to_s, "![#{text}](#{upload.short_url})")
    else
      # The link is user supplied text: escape it so characters such as `.`,
      # `?` or `+` are matched literally instead of acting as metacharacters.
      escaped_link = Regexp.escape(link)

      if (bbcode = markdown[/\[img\]\s?#{escaped_link}\s?\[\/img\]/])
        replace_first!(markdown, bbcode, "![](#{upload.short_url})")
      elsif (found = markdown.match(/(!?\[([a-z0-9|]+)\]\(#{escaped_link}\))/))
        # Anchor on this very link; matching any path-like Markdown link could
        # rewrite an unrelated link that happens to appear earlier in the post.
        replace_first!(markdown, found[0], "![#{found[2]}](#{upload.short_url})")
      end
    end
  end
  private_class_method :inline_upload!

  # Substitutes the first literal occurrence of +target+ in +markdown+.
  # The block form is used so that backslash sequences such as `\0` or `\1`
  # inside +replacement+ are inserted verbatim rather than being interpreted
  # as back-references.
  def self.replace_first!(markdown, target, replacement)
    markdown.sub!(target) { replacement }
  end
  private_class_method :replace_first!

  # Returns every upload URL found in the serialised +node+, grouped by
  # pattern in the order the patterns are listed below.
  def self.matched_uploads(node)
    regexps = [
      /(upload:\/\/([a-zA-Z0-9]+)[a-z0-9\.]*)/,
      /(\/uploads\/short-url\/([a-zA-Z0-9]+)[a-z0-9\.]*)/,
    ]

    # Escaped because these values are configuration/data, not patterns.
    # `to_s` keeps an unset value behaving like the empty interpolation it
    # produced before.
    db = Regexp.escape(RailsMultisite::ConnectionManagement.current_db.to_s)
    local_original = /(\/uploads\/#{db}\/original\/(#{UPLOAD_PATH_REGEXP}))/

    if Discourse.store.external?
      hosts = [SiteSetting.Upload.s3_base_url, SiteSetting.Upload.s3_cdn_url]
        .map { |host| Regexp.escape(host.to_s) }

      if Rails.configuration.multisite
        hosts.each do |host|
          regexps << /(#{host}\/uploads\/#{db}\/original\/(#{UPLOAD_PATH_REGEXP}))/
        end
      else
        hosts.each do |host|
          regexps << /(#{host}\/original\/(#{UPLOAD_PATH_REGEXP}))/
        end
        regexps << local_original
      end
    else
      regexps << local_original
    end

    html = node.to_s
    regexps.flat_map { |regexp| html.scan(regexp).map(&:first) }
  end
  private_class_method :matched_uploads

  # Collects the candidate links of +node+: its `href` and `src` attributes,
  # followed by BBCode `[img]` targets and Markdown link/image targets found
  # in its serialised form. Missing values are dropped.
  def self.extract_links(node)
    html = node.to_s
    links = [node.attributes["href"]&.value, node.attributes["src"]&.value]
    links.concat(html.scan(/\[img\]\s?(.+)\s?\[\/img\]/))
    # `A-Z` replaces the former `A-z` range, which also matched `[ \ ] ^` and
    # the backtick; `_` is listed explicitly so it is still accepted.
    links.concat(html.scan(/!?\[[a-z0-9|]+\]\(([a-zA-Z0-9_\.\/]+)\)/))
    links.flatten.compact
  end
  private_class_method :extract_links
end

View File

@ -649,3 +649,44 @@ task 'posts:invalidate_broken_images' => :environment do
puts puts
puts "", "#{rebaked} posts rebaked!" puts "", "#{rebaked} posts rebaked!"
end end
desc "Converts full upload URLs in `Post#raw` to short upload url"
task 'posts:inline_uploads' => :environment do
  # Dry run stays the default. Only an explicit falsy value disables it; the
  # previous `ENV["DRY_RUN"] || true` was truthy for every possible value.
  # NOTE(review): neither mode persists `new_raw` anywhere in this task; the
  # non-dry-run branch only prints progress dots.
  dry_run = !%w[0 false no].include?(ENV["DRY_RUN"].to_s.strip.downcase)

  scope = Post.joins(:post_uploads)
    .distinct
    .where("raw LIKE '%class=\"attachment%' OR raw LIKE '%<img src=\"%'")

  affected_posts_count = scope.count
  fixed_count = 0
  not_corrected_post_ids = []

  # Configure the diff output once instead of on every changed post.
  Diffy::Diff.default_format = :color if dry_run

  scope.find_each do |post|
    new_raw = InlineUploads.process(post.raw)

    if post.raw == new_raw
      not_corrected_post_ids << post.id
      next
    end

    if dry_run
      puts "Post id #{post.id} raw changed!"
      puts Diffy::Diff.new(post.raw, new_raw, context: 1)
    else
      putc "."
    end

    fixed_count += 1
  end

  puts "#{fixed_count} out of #{affected_posts_count} affected posts corrected"

  if fixed_count != affected_posts_count
    puts "Ids of posts that were not corrected: #{not_corrected_post_ids}"
  end
end

View File

@ -24,12 +24,15 @@ end
Fabricator(:upload_s3, from: :upload) do Fabricator(:upload_s3, from: :upload) do
url do |attrs| url do |attrs|
sequence(:url) do |n| sequence(:url) do |n|
File.join( path = +Discourse.store.get_path_for(
Discourse.store.absolute_base_url, "original", n + 1, attrs[:sha1], ".#{attrs[:extension]}"
Discourse.store.get_path_for(
"original", n + 1, attrs[:sha1], ".#{attrs[:extension]}"
)
) )
if Rails.configuration.multisite
path.prepend(File.join(Discourse.store.upload_path, "/"))
end
File.join(Discourse.store.absolute_base_url, path)
end end
end end
end end

View File

@ -0,0 +1,243 @@
require 'rails_helper'
# Specs for InlineUploads.process.
#
# NOTE(review): the fixture text below is reproduced exactly as it appears in
# this view, where leading whitespace and blank lines look collapsed. Examples
# that rely on indentation (indented code blocks, multi-line link text) should
# be checked against the committed file.
RSpec.describe InlineUploads do
  before do
    @original_asset_host = Rails.configuration.action_controller.asset_host
    Rails.configuration.action_controller.asset_host = "https://cdn.discourse.org/stuff"
  end

  after do
    Rails.configuration.action_controller.asset_host = @original_asset_host
  end

  describe '.process' do
    describe 'local uploads' do
      fab!(:upload) { Fabricate(:upload) }
      fab!(:upload2) { Fabricate(:upload) }
      fab!(:upload3) { Fabricate(:upload) }

      it "should not correct existing inline uploads" do
        md = <<~MD
          ![test](#{upload.short_url})haha
          [test]#{upload.short_url}
        MD

        expect(InlineUploads.process(md)).to eq(md)

        md = <<~MD
          ![test](#{upload.short_url})
          [test|attachment](#{upload.short_url})
        MD

        expect(InlineUploads.process(md)).to eq(md)
      end

      it "should not escape existing content" do
        md = "1 > 2"

        expect(InlineUploads.process(md)).to eq(md)
      end

      it "should not escape invalid HTML tags" do
        md = "<x>.<y>"

        expect(InlineUploads.process(md)).to eq(md)
      end

      it "should not correct code blocks" do
        md = "`<a class=\"attachment\" href=\"#{upload2.url}\">In Code Block</a>`"

        expect(InlineUploads.process(md)).to eq(md)

        # NOTE(review): presumably meant to be an indented code block; confirm
        # the number of leading spaces in this literal.
        md = " <a class=\"attachment\" href=\"#{upload2.url}\">In Code Block</a>"

        expect(InlineUploads.process(md)).to eq(md)
      end

      it "should correct bbcode img URLs to the short version" do
        md = <<~MD
          [img]#{upload.url}[/img]
          [img]
          #{upload2.url}
          [/img]
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          ![](#{upload.short_url})
          ![](#{upload2.short_url})
        MD
      end

      it "should correct image URLs to the short version" do
        md = <<~MD
          ![image|690x290](#{upload.short_url})
          ![image](#{upload.url})
          ![image|100x100](#{upload.url})
          <img src="#{Discourse.base_url}#{upload.url}" alt="some image">
          <img src="#{Discourse.base_url}#{upload.url}" alt="some image"><img src="#{Discourse.base_url}#{upload.url}" alt="some image">
          <img src="#{upload.url}" width="5" height="4">
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          ![image|690x290](#{upload.short_url})
          ![image](#{upload.short_url})
          ![image|100x100](#{upload.short_url})
          ![some image](#{upload.short_url})
          ![some image](#{upload.short_url})![some image](#{upload.short_url})
          ![|5x4](#{upload.short_url})
        MD
      end

      it "should correct attachment URLS with an upload before" do
        md = <<~MD
          ![image](#{upload.short_url})
          <a class="attachment" href="#{upload2.url}">test2</a>
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          ![image](#{upload.short_url})
          [test2|attachment](#{upload2.short_url})
        MD
      end

      it "should correct attachment URLs to the short version" do
        md = <<~MD
          <a class="attachment" href="#{upload.url}">
          this
          is
          some
          attachment
          </a>
          - <a class="attachment" href="#{upload2.url}">test2</a>
          - <a class="attachment" href="#{upload2.url}">test2</a>
          - <a class="attachment" href="#{upload2.url}">test2</a>
          <a class="test attachment" href="#{upload3.url}">test3</a>
          <a class="test attachment" href="#{upload3.url}">test3</a><a class="test attachment" href="#{upload3.url}">test3</a>
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          [this is some attachment|attachment](#{upload.short_url})
          - [test2|attachment](#{upload2.short_url})
          - [test2|attachment](#{upload2.short_url})
          - [test2|attachment](#{upload2.short_url})
          [test3|attachment](#{upload3.short_url})
          [test3|attachment](#{upload3.short_url})[test3|attachment](#{upload3.short_url})
        MD
      end

      it 'should correct full upload url to the shorter version' do
        md = <<~MD
          Some random text
          ![test](#{upload.short_url})
          [test|attachment](#{upload.short_url})
          <a class="test attachment" href="#{upload.url}">
          test
          </a>
          `<a class="attachment" href="#{upload2.url}">In Code Block</a>`
          <a class="attachment" href="#{upload3.url}">In Code Block</a>
          <a href="#{upload.url}">newtest</a>
          <a href="#{Discourse.base_url_no_prefix}#{upload.url}">newtest</a>
          <a href="https://somerandomesite.com#{upload.url}">test</a>
          <a class="attachment" href="https://somerandom.com/url">test</a>
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          Some random text
          ![test](#{upload.short_url})
          [test|attachment](#{upload.short_url})
          [test|attachment](#{upload.short_url})
          `<a class="attachment" href="#{upload2.url}">In Code Block</a>`
          <a class="attachment" href="#{upload3.url}">In Code Block</a>
          [newtest](#{upload.short_url})
          [newtest](#{upload.short_url})
          <a href="https://somerandomesite.com#{upload.url}">test</a>
          <a class="attachment" href="https://somerandom.com/url">test</a>
        MD
      end

      it 'accepts a block that yields when link does not match an upload in the db' do
        url = "#{Discourse.base_url}#{upload.url}"

        md = <<~MD
          <img src="#{url}" alt="some image">
          <img src="#{upload2.url}" alt="some image">
        MD

        upload.destroy!

        missing_links = []
        InlineUploads.process(md, on_missing: lambda { |link| missing_links << link })

        # Assert outside the callback: an expectation placed inside it passes
        # vacuously when the callback is never invoked.
        expect(missing_links).not_to be_empty
        expect(missing_links).to all(eq(url))
      end
    end

    describe "s3 uploads" do
      let(:upload) { Fabricate(:upload_s3) }

      before do
        SiteSetting.enable_s3_uploads = true
        SiteSetting.s3_upload_bucket = "s3-upload-bucket"
        SiteSetting.s3_access_key_id = "some key"
        SiteSetting.s3_secret_access_key = "some secret key"
        SiteSetting.s3_cdn_url = "https://s3.cdn.com"
      end

      it "should correct image URLs to the short version" do
        md = <<~MD
          <img src="#{upload.url}" alt="some image">
          <img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload.url).path).to_s}" alt="some image">
        MD

        expect(InlineUploads.process(md)).to eq(<<~MD)
          ![some image](#{upload.short_url})
          ![some image](#{upload.short_url})
        MD
      end

      it "should correct image URLs in multisite" do
        # Remember the previous value so the example restores whatever was
        # configured instead of forcing the flag to false.
        original_multisite = Rails.configuration.multisite

        begin
          Rails.configuration.multisite = true

          md = <<~MD
            <img src="#{upload.url}" alt="some image">
            <img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload.url).path).to_s}" alt="some image">
          MD

          expect(InlineUploads.process(md)).to eq(<<~MD)
            ![some image](#{upload.short_url})
            ![some image](#{upload.short_url})
          MD
        ensure
          Rails.configuration.multisite = original_multisite
        end
      end
    end
  end
end