FIX: properly handle too large & broken images in posts

This commit is contained in:
Régis Hanol 2017-11-16 15:45:07 +01:00
parent 6f2a3cb026
commit 678e28794a
5 changed files with 78 additions and 107 deletions

View File

@ -5,11 +5,8 @@ require_dependency 'upload_creator'
module Jobs
class PullHotlinkedImages < Jobs::Base
sidekiq_options queue: 'low'
LARGE_IMAGES = "large_images".freeze
# Stores the configured maximum image size in @max_size
# (SiteSetting.max_image_size_kb run through #kilobytes) so it can be
# compared against the size of each downloaded file.
def initialize
  limit_in_kb = SiteSetting.max_image_size_kb
  @max_size = limit_in_kb.kilobytes
end
@ -47,26 +44,25 @@ module Jobs
raw = post.raw.dup
start_raw = raw.dup
downloaded_urls = {}
large_images = post.custom_fields[LARGE_IMAGES].presence || []
# recover from bad custom field silently
unless Array === large_images
large_images = []
end
large_images = JSON.parse(post.custom_fields[Post::LARGE_IMAGES].presence || "[]") rescue []
broken_images = JSON.parse(post.custom_fields[Post::BROKEN_IMAGES].presence || "[]") rescue []
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}") rescue {}
broken_images, new_large_images = [], []
has_new_large_image = false
has_new_broken_image = false
has_downloaded_image = false
extract_images_from(post.cooked).each do |image|
src = original_src = image['src']
if src.start_with?("//")
src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}"
end
src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
if is_valid_image_url(src)
begin
# have we already downloaded that file?
unless downloaded_urls.include?(src) || large_images.include?(src) || broken_images.include?(src)
unless downloaded_images.include?(src) || large_images.include?(src) || broken_images.include?(src)
if hotlinked = download(src)
if File.size(hotlinked.path) <= @max_size
filename = File.basename(URI.parse(src).path)
@ -74,15 +70,18 @@ module Jobs
upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id)
if upload.persisted?
downloaded_urls[src] = upload.url
downloaded_images[src.sub(/^https?:/i, "")] = upload.id
has_downloaded_image = true
else
log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
end
else
large_images << original_src
new_large_images << original_src
large_images << original_src.sub(/^https?:/i, "")
has_new_large_image = true
end
else
broken_images << original_src
broken_images << original_src.sub(/^https?:/i, "")
has_new_broken_image = true
end
end
# have we successfully downloaded that file?
@ -111,42 +110,24 @@ module Jobs
log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end
end
end
if new_large_images.length > 0
post.custom_fields[LARGE_IMAGES] = large_images
post.save_custom_fields
end
large_images.uniq!
broken_images.uniq!
post.custom_fields[Post::LARGE_IMAGES] = large_images.to_json if large_images.present?
post.custom_fields[Post::BROKEN_IMAGES] = broken_images.to_json if broken_images.present?
post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
# only save custom fields if there are any
post.save_custom_fields if large_images.present? || broken_images.present? || downloaded_images.present?
post.reload
if start_raw == post.raw && raw != post.raw
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
# we never want that job to bump the topic
options = { bypass_bump: true }
post.revise(Discourse.system_user, changes, options)
elsif downloaded_urls.present? || new_large_images.present?
post.revise(Discourse.system_user, changes, bypass_bump: true)
elsif has_downloaded_image || has_new_large_image || has_new_broken_image
post.trigger_post_process(true)
elsif broken_images.present?
start_html = post.cooked
doc = Nokogiri::HTML::fragment(start_html)
images = doc.css("img[src]") - doc.css("img.avatar")
images.each do |tag|
src = tag['src']
if broken_images.include?(src)
tag.name = 'span'
tag.set_attribute('class', 'broken-image fa fa-chain-broken')
tag.set_attribute('title', I18n.t('post.image_placeholder.broken'))
tag.remove_attribute('src')
tag.remove_attribute('width')
tag.remove_attribute('height')
end
end
if start_html == post.cooked && doc.to_html != post.cooked
post.update_column(:cooked, doc.to_html)
post.publish_change_to_clients! :revised
end
end
end

View File

@ -58,7 +58,11 @@ class Post < ActiveRecord::Base
# We can pass several creating options to a post via attributes
attr_accessor :image_sizes, :quoted_post_numbers, :no_bump, :invalidate_oneboxes, :cooking_options, :skip_unique_check
SHORT_POST_CHARS = 1200
LARGE_IMAGES ||= "large_images".freeze
BROKEN_IMAGES ||= "broken_images".freeze
DOWNLOADED_IMAGES ||= "downloaded_images".freeze
SHORT_POST_CHARS ||= 1200
scope :private_posts_for_user, ->(user) {
where("posts.topic_id IN (SELECT topic_id

View File

@ -31,9 +31,9 @@ class CookedPostProcessor
def post_process(bypass_bump = false)
DistributedMutex.synchronize("post_process_#{@post.id}") do
DiscourseEvent.trigger(:before_post_process_cooked, @doc, @post)
keep_reverse_index_up_to_date
post_process_images
post_process_oneboxes
post_process_images
keep_reverse_index_up_to_date
optimize_urls
update_post_image
enforce_nofollow
@ -65,28 +65,30 @@ class CookedPostProcessor
end
end
upload_ids |= oneboxed_image_uploads.pluck(:id)
upload_ids |= downloaded_images.values.select { |id| Upload.exists?(id) }
values = upload_ids.map { |u| "(#{@post.id},#{u})" }.join(",")
PostUpload.transaction do
PostUpload.where(post_id: @post.id).delete_all
if upload_ids.length > 0
if upload_ids.size > 0
PostUpload.exec_sql("INSERT INTO post_uploads (post_id, upload_id) VALUES #{values}")
end
end
end
def post_process_images
images = extract_images
return if images.blank?
images.each do |img|
next if large_images.include?(img["src"]) && add_large_image_placeholder!(img)
extract_images.each do |img|
src = img["src"].sub(/^https?:/i, "")
if large_images.include?(src)
add_large_image_placeholder!(img)
elsif broken_images.include?(src)
add_broken_image_placeholder!(img)
else
limit_size!(img)
convert_to_link!(img)
end
end
end
def add_large_image_placeholder!(img)
url = img["src"]
@ -125,32 +127,48 @@ class CookedPostProcessor
end
img.remove
true
end
# Turns +img+ into a "broken image" placeholder in place: the node is renamed
# to a span, given the placeholder CSS classes and a translated title, and
# stripped of its src/width/height attributes.
def add_broken_image_placeholder!(img)
  img.name = "span"

  img.set_attribute("class", "broken-image fa fa-chain-broken")
  img.set_attribute("title", I18n.t("post.image_placeholder.broken"))

  # drop the attributes in order; the result of the last removal is returned
  %w[src width height].map { |attribute| img.remove_attribute(attribute) }.last
end
# Memoized list of image sources recorded as too large for this post, read
# from the post's custom fields.
# NOTE(review): the two assignments below look like the removed and the added
# line of the diff shown back to back — confirm which one is meant to remain.
def large_images
# raw custom field value, or [] when it is blank; this always assigns a truthy
# value, so the JSON-parsing assignment that follows never gets to assign
@large_images ||= @post.custom_fields[Jobs::PullHotlinkedImages::LARGE_IMAGES].presence || []
# parses the custom field as JSON ("[]" when blank), falling back to [] if an error is raised
@large_images ||= JSON.parse(@post.custom_fields[Post::LARGE_IMAGES].presence || "[]") rescue []
end
# Memoized list of image sources flagged as broken for this post.
#
# Reads the Post::BROKEN_IMAGES custom field and parses it as JSON. A blank
# field, a value that cannot be parsed, or a payload that is not a JSON array
# all yield an empty array, so callers can always call #include? on the result.
def broken_images
  @broken_images ||= begin
    parsed = JSON.parse(@post.custom_fields[Post::BROKEN_IMAGES].presence || "[]")
    # the field may hold valid JSON of the wrong shape (object, null, number…)
    parsed.is_a?(Array) ? parsed : []
  rescue StandardError
    # deliberately best-effort: a bad custom field is recovered from silently
    []
  end
end
# Memoized map of downloaded image sources to upload ids for this post.
#
# Reads the Post::DOWNLOADED_IMAGES custom field and parses it as JSON. A
# blank field, a value that cannot be parsed, or a payload that is not a JSON
# object all yield an empty hash, so callers can always use #[] and #values.
def downloaded_images
  @downloaded_images ||= begin
    parsed = JSON.parse(@post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}")
    # the field may hold valid JSON of the wrong shape (array, null, number…)
    parsed.is_a?(Hash) ? parsed : {}
  rescue StandardError
    # deliberately best-effort: a bad custom field is recovered from silently
    {}
  end
end
# Every image in the document that has a src attribute, excluding images
# whose src starts with "data", emojis, oneboxed images and images that sit
# inside quotes.
def extract_images
  candidates = @doc.css("img[src]")
  data_images = @doc.css("img[src^='data']")
  emojis = @doc.css("img.emoji")
  oneboxed = oneboxed_images
  quoted = @doc.css(".quote img")

  [data_images, emojis, oneboxed, quoted].reduce(candidates) do |remaining, excluded|
    remaining - excluded
  end
end
# Every image in the document that has a src attribute, excluding emojis and
# images that sit inside quotes.
def extract_images_for_post
  candidates = @doc.css("img[src]")
  emojis = @doc.css("img.emoji")
  quoted = @doc.css(".quote img")

  [emojis, quoted].reduce(candidates) { |remaining, excluded| remaining - excluded }
end
@ -158,19 +176,6 @@ class CookedPostProcessor
@doc.css(".onebox-body img, .onebox img")
end
# Looks up the uploads whose origin matches a oneboxed image. Each image src
# is reduced to its scheme-less form and matched in three variants:
# scheme-less, "http:" and "https:".
def oneboxed_image_uploads
  origins = oneboxed_images.flat_map do |image|
    schemeless = image["src"].sub(/^https?:/i, "")
    [schemeless, "http:#{schemeless}", "https:#{schemeless}"]
  end

  # duplicates are dropped while keeping first-seen order
  Upload.where(origin: origins.uniq)
end
def limit_size!(img)
# retrieve the size from
# 1) the width/height attributes
@ -377,15 +382,16 @@ class CookedPostProcessor
Oneboxer.onebox(url, args)
end
uploads = oneboxed_image_uploads.select(:url, :origin)
oneboxed_images.each do |img|
if large_images.include?(img["src"])
src = img["src"].sub(/^https?:/i, "")
if large_images.include?(src) || broken_images.include?(src)
img.remove
next
end
url = img["src"].sub(/^https?:/i, "")
upload = uploads.find { |u| u.origin.sub(/^https?:/i, "") == url }
upload_id = downloaded_images[src]
upload = Upload.find(upload_id) if upload_id
img["src"] = upload.url if upload.present?
# make sure we grab dimensions for oneboxed images
@ -462,7 +468,7 @@ class CookedPostProcessor
# don't download remote images for posts that are more than n days old
return unless @post.created_at > (Date.today - SiteSetting.download_remote_images_max_days_old)
# we only want to run the job whenever it's changed by a user
return if @post.last_editor_id == Discourse.system_user.id
return if @post.last_editor_id && @post.last_editor_id <= 0
# make sure no other job is scheduled
Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id)
# schedule the job

View File

@ -10,9 +10,9 @@ describe CookedPostProcessor do
let(:post_process) { sequence("post_process") }
it "post process in sequence" do
cpp.expects(:keep_reverse_index_up_to_date).in_sequence(post_process)
cpp.expects(:post_process_images).in_sequence(post_process)
cpp.expects(:post_process_oneboxes).in_sequence(post_process)
cpp.expects(:post_process_images).in_sequence(post_process)
cpp.expects(:keep_reverse_index_up_to_date).in_sequence(post_process)
cpp.expects(:optimize_urls).in_sequence(post_process)
cpp.expects(:pull_hotlinked_images).in_sequence(post_process)
cpp.post_process

View File

@ -125,26 +125,6 @@ describe Jobs::PullHotlinkedImages do
end
end
describe 'replace' do
  # Fabricates a post whose raw is a single <img> pointing at +image_url+,
  # executes the job for it and returns the reloaded post's cooked HTML.
  def cooked_after_pull(image_url)
    hotlinking_post = Fabricate(:post, raw: "<img src='#{image_url}'>")
    Jobs::PullHotlinkedImages.new.execute(post_id: hotlinking_post.id)
    hotlinking_post.reload.cooked
  end

  it 'broken image with placeholder' do
    expect(cooked_after_pull(broken_image_url)).to match(/<span class="broken-image fa fa-chain-broken/)
  end

  it 'large image with placeholder' do
    expect(cooked_after_pull(large_image_url)).to match(/<div class="large-image-placeholder"><a href=.*\ target="_blank" .*\>/)
  end
end
describe '#is_valid_image_url' do
subject { described_class.new }