FEATURE: Update the pull hotlinked images job to use `Upload#short_url`.

This commit is contained in:
Guo Xiang Tan 2019-06-06 15:50:35 +08:00 committed by Guo Xiang Tan
parent 42ab016856
commit fb0a655e8a
2 changed files with 115 additions and 36 deletions

View File

@ -92,39 +92,40 @@ module Jobs
url = downloaded_urls[src] url = downloaded_urls[src]
escaped_src = Regexp.escape(original_src) escaped_src = Regexp.escape(original_src)
# there are 6 ways to insert an image in a post replace_raw = ->(match, match_src, replacement, _index) {
# HTML tag - <img src="http://..."> if src.include?(match_src)
raw.gsub!(/src=["']#{escaped_src}["']/i, "src='#{url}'") raw = raw.gsub(
match,
if (original_path = Upload.extract_url(original_src)&.to_s) && replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
Upload.extract_url(url)&.to_s
raw.gsub!(
/src=["']\S*#{Regexp.escape(original_path)}["']/i,
"src='#{url}'"
) )
end end
}
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace_raw)
# BBCode tag - [img]http://...[/img] # BBCode tag - [img]http://...[/img]
raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i, "[img]#{url}[/img]") InlineUploads.match_bbcode_img(raw, &replace_raw)
# Markdown linked image - [![alt](http://...)](http://...) # Markdown linked image - [![alt](http://...)](http://...)
raw.gsub!(/\[!\[([^\]]*)\]\(#{escaped_src}\)\]/) { "[<img src='#{url}' alt='#{$1}'>]" }
# Markdown inline - ![alt](http://...) # Markdown inline - ![alt](http://...)
raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{url})" }
# Markdown inline - ![](http://... "image title") # Markdown inline - ![](http://... "image title")
raw.gsub!(/!\[\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" }
# Markdown inline - ![alt](http://... "image title") # Markdown inline - ![alt](http://... "image title")
raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src} "([^\]]*)"\)/) { "![](#{url})" } InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
# Markdown reference - [x]: http://
raw.gsub!(/\[([^\]]+)\]:\s?#{escaped_src}/) { "[#{$1}]: #{url}" }
# Direct link # Direct link
raw.gsub!(/^#{escaped_src}(\s?)$/) { "<img src='#{url}'>#{$1}" } raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
end end
rescue => e rescue => e
if Rails.env.test?
raise e
else
log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n")) log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end end
end end
end end
end
large_images.uniq! large_images.uniq!
broken_images.uniq! broken_images.uniq!
@ -147,7 +148,10 @@ module Jobs
def extract_images_from(html) def extract_images_from(html)
doc = Nokogiri::HTML::fragment(html) doc = Nokogiri::HTML::fragment(html)
doc.css("img[src], a.lightbox[href]") - doc.css("img.avatar") - doc.css(".lightbox img[src]")
doc.css("img[src], a.lightbox[href], a.onebox[href]") -
doc.css("img.avatar") -
doc.css(".lightbox img[src]")
end end
def should_download_image?(src) def should_download_image?(src)

View File

@ -49,20 +49,22 @@ describe Jobs::PullHotlinkedImages do
it 'replaces images' do it 'replaces images' do
post = Fabricate(:post, raw: "<img src='#{image_url}'>") post = Fabricate(:post, raw: "<img src='#{image_url}'>")
expect do
Jobs::PullHotlinkedImages.new.execute(post_id: post.id) Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
post.reload end.to change { Upload.count }.by(1)
expect(post.raw).to match(/^<img src='\/uploads/) expect(post.reload.raw).to eq("![](#{Upload.last.short_url})")
end end
it 'replaces images without protocol' do it 'replaces images without protocol' do
url = image_url.sub(/^https?\:/, '') url = image_url.sub(/^https?\:/, '')
post = Fabricate(:post, raw: "<img src='#{url}'>") post = Fabricate(:post, raw: "<img alt='test' src='#{url}'>")
expect do
Jobs::PullHotlinkedImages.new.execute(post_id: post.id) Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
post.reload end.to change { Upload.count }.by(1)
expect(post.raw).to match(/^<img src='\/uploads/) expect(post.reload.raw).to eq("![test](#{Upload.last.short_url})")
end end
it 'replaces images without extension' do it 'replaces images without extension' do
@ -70,10 +72,11 @@ describe Jobs::PullHotlinkedImages do
stub_request(:get, url).to_return(body: png, headers: { "Content-Type" => "image/png" }) stub_request(:get, url).to_return(body: png, headers: { "Content-Type" => "image/png" })
post = Fabricate(:post, raw: "<img src='#{url}'>") post = Fabricate(:post, raw: "<img src='#{url}'>")
expect do
Jobs::PullHotlinkedImages.new.execute(post_id: post.id) Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
post.reload end.to change { Upload.count }.by(1)
expect(post.raw).to match(/^<img src='\/uploads/) expect(post.reload.raw).to eq("![](#{Upload.last.short_url})")
end end
it 'replaces optimized images' do it 'replaces optimized images' do
@ -91,10 +94,73 @@ describe Jobs::PullHotlinkedImages do
upload = Upload.last upload = Upload.last
post.reload post.reload
expect(post.raw).to eq("<img src='#{upload.url}'>") expect(post.raw).to eq("![](#{upload.short_url})")
expect(post.uploads).to contain_exactly(upload) expect(post.uploads).to contain_exactly(upload)
end end
it 'replaces direct links' do
post = Fabricate(:post, raw: <<~MD)
#{image_url}
#{image_url}
MD
expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
.to change { Upload.count }.by(1)
post.reload
expect(post.raw).to eq(<<~MD.chomp)
![](#{Upload.last.short_url})
![](#{Upload.last.short_url})
MD
end
it 'replaces markdown image' do
post = Fabricate(:post, raw: <<~MD)
[![some test](#{image_url})](https://somelink.com)
![some test](#{image_url})
![](#{image_url})
![abcde](#{image_url} 'some test')
![](#{image_url} 'some test')
MD
expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
.to change { Upload.count }.by(1)
post.reload
expect(post.raw).to eq(<<~MD.chomp)
[![some test](#{Upload.last.short_url})](https://somelink.com)
![some test](#{Upload.last.short_url})
![](#{Upload.last.short_url})
![abcde](#{Upload.last.short_url} 'some test')
![](#{Upload.last.short_url} 'some test')
MD
end
it 'replaces bbcode images' do
post = Fabricate(:post, raw: <<~MD)
[img]
#{image_url}
[/img]
[img]
#{image_url}
[/img]
MD
expect { Jobs::PullHotlinkedImages.new.execute(post_id: post.id) }
.to change { Upload.count }.by(1)
post.reload
expect(post.raw).to eq(<<~MD.chomp)
![](#{Upload.last.short_url})
![](#{Upload.last.short_url})
MD
end
describe 'onebox' do describe 'onebox' do
let(:media) { "File:Brisbane_May_2013201.jpg" } let(:media) { "File:Brisbane_May_2013201.jpg" }
let(:url) { "https://commons.wikimedia.org/wiki/#{media}" } let(:url) { "https://commons.wikimedia.org/wiki/#{media}" }
@ -104,6 +170,7 @@ describe Jobs::PullHotlinkedImages do
Jobs.run_later! Jobs.run_later!
stub_request(:head, url) stub_request(:head, url)
stub_request(:get, url).to_return(body: '') stub_request(:get, url).to_return(body: '')
stub_request(:get, api_url).to_return(body: "{ stub_request(:get, api_url).to_return(body: "{
\"query\": { \"query\": {
\"pages\": { \"pages\": {
@ -139,12 +206,20 @@ describe Jobs::PullHotlinkedImages do
<a href='#{url}'><img src='#{large_image_url}'></a> <a href='#{url}'><img src='#{large_image_url}'></a>
BODY BODY
2.times do
Jobs::ProcessPost.new.execute(post_id: post.id) Jobs::ProcessPost.new.execute(post_id: post.id)
Jobs::PullHotlinkedImages.new.execute(post_id: post.id) Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
Jobs::ProcessPost.new.execute(post_id: post.id) end
Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
post.reload post.reload
expect(post.raw).to eq(<<~MD.chomp)
![](upload://z2QSs1KJWoj51uYhDjb6ifCzxH6.gif)
https://commons.wikimedia.org/wiki/File:Brisbane_May_2013201.jpg
<img src='#{broken_image_url}'>
<a href='#{url}'><img src='#{large_image_url}'></a>
MD
expect(post.cooked).to match(/<p><img src=.*\/uploads/) expect(post.cooked).to match(/<p><img src=.*\/uploads/)
expect(post.cooked).to match(/<img src=.*\/uploads.*\ class="thumbnail"/) expect(post.cooked).to match(/<img src=.*\/uploads.*\ class="thumbnail"/)
expect(post.cooked).to match(/<span class="broken-image/) expect(post.cooked).to match(/<span class="broken-image/)
@ -231,7 +306,7 @@ describe Jobs::PullHotlinkedImages do
post.reload post.reload
expect(post.raw).to eq("<img src='#{Upload.last.url}'>") expect(post.raw).to eq("![](#{Upload.last.short_url})")
expect(post.uploads.count).to eq(1) expect(post.uploads.count).to eq(1)
end end