From 678e28794ae43919cc1326ffb4ad5b7e853af267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Thu, 16 Nov 2017 15:45:07 +0100 Subject: [PATCH] FIX: properly handle too large & broken images in posts --- app/jobs/regular/pull_hotlinked_images.rb | 69 ++++++--------- app/models/post.rb | 6 +- lib/cooked_post_processor.rb | 86 ++++++++++--------- spec/components/cooked_post_processor_spec.rb | 4 +- spec/jobs/pull_hotlinked_images_spec.rb | 20 ----- 5 files changed, 78 insertions(+), 107 deletions(-) diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb index b3d2bbdb009..b920c60590c 100644 --- a/app/jobs/regular/pull_hotlinked_images.rb +++ b/app/jobs/regular/pull_hotlinked_images.rb @@ -5,11 +5,8 @@ require_dependency 'upload_creator' module Jobs class PullHotlinkedImages < Jobs::Base - sidekiq_options queue: 'low' - LARGE_IMAGES = "large_images".freeze - def initialize @max_size = SiteSetting.max_image_size_kb.kilobytes end @@ -47,26 +44,25 @@ module Jobs raw = post.raw.dup start_raw = raw.dup + downloaded_urls = {} - large_images = post.custom_fields[LARGE_IMAGES].presence || [] - # recover from bad custom field silently - unless Array === large_images - large_images = [] - end + large_images = JSON.parse(post.custom_fields[Post::LARGE_IMAGES].presence || "[]") rescue [] + broken_images = JSON.parse(post.custom_fields[Post::BROKEN_IMAGES].presence || "[]") rescue [] + downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}") rescue {} - broken_images, new_large_images = [], [] + has_new_large_image = false + has_new_broken_image = false + has_downloaded_image = false extract_images_from(post.cooked).each do |image| src = original_src = image['src'] - if src.start_with?("//") - src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" - end + src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//") if is_valid_image_url(src) begin # have we already downloaded that file? - unless downloaded_urls.include?(src) || large_images.include?(src) || broken_images.include?(src) + unless downloaded_images.include?(src) || large_images.include?(src) || broken_images.include?(src) if hotlinked = download(src) if File.size(hotlinked.path) <= @max_size filename = File.basename(URI.parse(src).path) @@ -74,15 +70,18 @@ module Jobs upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(post.user_id) if upload.persisted? downloaded_urls[src] = upload.url + downloaded_images[src.sub(/^https?:/i, "")] = upload.id + has_downloaded_image = true else log(:info, "Failed to pull hotlinked image for post: #{post_id}: #{src} - #{upload.errors.full_messages.join("\n")}") end else - large_images << original_src - new_large_images << original_src + large_images << original_src.sub(/^https?:/i, "") + has_new_large_image = true end else - broken_images << original_src + broken_images << original_src.sub(/^https?:/i, "") + has_new_broken_image = true end end # have we successfully downloaded that file? @@ -111,42 +110,24 @@ module Jobs log(:error, "Failed to pull hotlinked image (#{src}) post: #{post_id}\n" + e.message + "\n" + e.backtrace.join("\n")) end end - end - if new_large_images.length > 0 - post.custom_fields[LARGE_IMAGES] = large_images - post.save_custom_fields - end + large_images.uniq! + broken_images.uniq! + + post.custom_fields[Post::LARGE_IMAGES] = large_images.to_json if large_images.present? + post.custom_fields[Post::BROKEN_IMAGES] = broken_images.to_json if broken_images.present? + post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present? + # only save custom fields if there are any + post.save_custom_fields if large_images.present? || broken_images.present? || downloaded_images.present? post.reload if start_raw == post.raw && raw != post.raw changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") } - # we never want that job to bump the topic - options = { bypass_bump: true } - post.revise(Discourse.system_user, changes, options) - elsif downloaded_urls.present? || new_large_images.present? + post.revise(Discourse.system_user, changes, bypass_bump: true) + elsif has_downloaded_image || has_new_large_image || has_new_broken_image post.trigger_post_process(true) - elsif broken_images.present? - start_html = post.cooked - doc = Nokogiri::HTML::fragment(start_html) - images = doc.css("img[src]") - doc.css("img.avatar") - images.each do |tag| - src = tag['src'] - if broken_images.include?(src) - tag.name = 'span' - tag.set_attribute('class', 'broken-image fa fa-chain-broken') - tag.set_attribute('title', I18n.t('post.image_placeholder.broken')) - tag.remove_attribute('src') - tag.remove_attribute('width') - tag.remove_attribute('height') - end - end - if start_html == post.cooked && doc.to_html != post.cooked - post.update_column(:cooked, doc.to_html) - post.publish_change_to_clients! :revised - end end end diff --git a/app/models/post.rb b/app/models/post.rb index 4dc6999c2cd..87bd0faf0f4 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -58,7 +58,11 @@ class Post < ActiveRecord::Base # We can pass several creating options to a post via attributes attr_accessor :image_sizes, :quoted_post_numbers, :no_bump, :invalidate_oneboxes, :cooking_options, :skip_unique_check - SHORT_POST_CHARS = 1200 + LARGE_IMAGES ||= "large_images".freeze + BROKEN_IMAGES ||= "broken_images".freeze + DOWNLOADED_IMAGES ||= "downloaded_images".freeze + + SHORT_POST_CHARS ||= 1200 scope :private_posts_for_user, ->(user) { where("posts.topic_id IN (SELECT topic_id diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb index 31ee7d0266a..997b7e895d7 100644 --- a/lib/cooked_post_processor.rb +++ b/lib/cooked_post_processor.rb @@ -31,9 +31,9 @@ class CookedPostProcessor def post_process(bypass_bump = false) DistributedMutex.synchronize("post_process_#{@post.id}") do DiscourseEvent.trigger(:before_post_process_cooked, @doc, @post) - keep_reverse_index_up_to_date - post_process_images post_process_oneboxes + post_process_images + keep_reverse_index_up_to_date optimize_urls update_post_image enforce_nofollow @@ -65,26 +65,28 @@ class CookedPostProcessor end end - upload_ids |= oneboxed_image_uploads.pluck(:id) + upload_ids |= downloaded_images.values.select { |id| Upload.exists?(id) } values = upload_ids.map { |u| "(#{@post.id},#{u})" }.join(",") PostUpload.transaction do PostUpload.where(post_id: @post.id).delete_all - if upload_ids.length > 0 + if upload_ids.size > 0 PostUpload.exec_sql("INSERT INTO post_uploads (post_id, upload_id) VALUES #{values}") end end end def post_process_images - images = extract_images - return if images.blank? - - images.each do |img| - next if large_images.include?(img["src"]) && add_large_image_placeholder!(img) - - limit_size!(img) - convert_to_link!(img) + extract_images.each do |img| + src = img["src"].sub(/^https?:/i, "") + if large_images.include?(src) + add_large_image_placeholder!(img) + elsif broken_images.include?(src) + add_broken_image_placeholder!(img) + else + limit_size!(img) + convert_to_link!(img) + end end end @@ -125,32 +127,48 @@ class CookedPostProcessor end img.remove - true + end + + def add_broken_image_placeholder!(img) + img.name = "span" + img.set_attribute("class", "broken-image fa fa-chain-broken") + img.set_attribute("title", I18n.t("post.image_placeholder.broken")) + img.remove_attribute("src") + img.remove_attribute("width") + img.remove_attribute("height") end def large_images - @large_images ||= @post.custom_fields[Jobs::PullHotlinkedImages::LARGE_IMAGES].presence || [] + @large_images ||= JSON.parse(@post.custom_fields[Post::LARGE_IMAGES].presence || "[]") rescue [] + end + + def broken_images + @broken_images ||= JSON.parse(@post.custom_fields[Post::BROKEN_IMAGES].presence || "[]") rescue [] + end + + def downloaded_images + @downloaded_images ||= JSON.parse(@post.custom_fields[Post::DOWNLOADED_IMAGES].presence || "{}") rescue {} end def extract_images - # all image with a src attribute + # all images with a src attribute @doc.css("img[src]") - - # minus, data images + # minus data images @doc.css("img[src^='data']") - - # minus, emojis + # minus emojis @doc.css("img.emoji") - - # minus, image inside oneboxes + # minus oneboxed images oneboxed_images - - # minus, images inside quotes + # minus images inside quotes @doc.css(".quote img") end def extract_images_for_post - # all image with a src attribute + # all images with a src attribute @doc.css("img[src]") - - # minus, emojis + # minus emojis @doc.css("img.emoji") - - # minus, images inside quotes + # minus images inside quotes @doc.css(".quote img") end @@ -158,19 +176,6 @@ class CookedPostProcessor @doc.css(".onebox-body img, .onebox img") end - def oneboxed_image_uploads - urls = Set.new - - oneboxed_images.each do |img| - url = img["src"].sub(/^https?:/i, "") - urls << url - urls << "http:#{url}" - urls << "https:#{url}" - end - - Upload.where(origin: urls.to_a) - end - def limit_size!(img) # retrieve the size from # 1) the width/height attributes @@ -377,15 +382,16 @@ class CookedPostProcessor Oneboxer.onebox(url, args) end - uploads = oneboxed_image_uploads.select(:url, :origin) oneboxed_images.each do |img| - if large_images.include?(img["src"]) + src = img["src"].sub(/^https?:/i, "") + + if large_images.include?(src) || broken_images.include?(src) img.remove next end - url = img["src"].sub(/^https?:/i, "") - upload = uploads.find { |u| u.origin.sub(/^https?:/i, "") == url } + upload_id = downloaded_images[src] + upload = Upload.find(upload_id) if upload_id img["src"] = upload.url if upload.present? # make sure we grab dimensions for oneboxed images @@ -462,7 +468,7 @@ class CookedPostProcessor # don't download remote images for posts that are more than n days old return unless @post.created_at > (Date.today - SiteSetting.download_remote_images_max_days_old) # we only want to run the job whenever it's changed by a user - return if @post.last_editor_id == Discourse.system_user.id + return if @post.last_editor_id && @post.last_editor_id <= 0 # make sure no other job is scheduled Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id) # schedule the job diff --git a/spec/components/cooked_post_processor_spec.rb b/spec/components/cooked_post_processor_spec.rb index b30137d5b75..b6bf69b1b0e 100644 --- a/spec/components/cooked_post_processor_spec.rb +++ b/spec/components/cooked_post_processor_spec.rb @@ -10,9 +10,9 @@ describe CookedPostProcessor do let(:post_process) { sequence("post_process") } it "post process in sequence" do - cpp.expects(:keep_reverse_index_up_to_date).in_sequence(post_process) - cpp.expects(:post_process_images).in_sequence(post_process) cpp.expects(:post_process_oneboxes).in_sequence(post_process) + cpp.expects(:post_process_images).in_sequence(post_process) + cpp.expects(:keep_reverse_index_up_to_date).in_sequence(post_process) cpp.expects(:optimize_urls).in_sequence(post_process) cpp.expects(:pull_hotlinked_images).in_sequence(post_process) cpp.post_process diff --git a/spec/jobs/pull_hotlinked_images_spec.rb b/spec/jobs/pull_hotlinked_images_spec.rb index 3447fd00796..fd0b65df612 100644 --- a/spec/jobs/pull_hotlinked_images_spec.rb +++ b/spec/jobs/pull_hotlinked_images_spec.rb @@ -125,26 +125,6 @@ describe Jobs::PullHotlinkedImages do end end - describe 'replace' do - it 'broken image with placeholder' do - post = Fabricate(:post, raw: "") - - Jobs::PullHotlinkedImages.new.execute(post_id: post.id) - post.reload - - expect(post.cooked).to match(/") - - Jobs::PullHotlinkedImages.new.execute(post_id: post.id) - post.reload - - expect(post.cooked).to match(/