From c1db968740d6177d28f80599afb2f7961aca9638 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Tue, 3 May 2022 13:53:32 +0100 Subject: [PATCH] DEV: Move hotlinked image information into a dedicated table (#16585) This will make future changes to the 'pull hotlinked images' system easier. This commit should not introduce any functional change. For now, the old post_custom_field data is kept in the database. This will be dropped in a future commit. --- app/jobs/regular/pull_hotlinked_images.rb | 66 +++++++--------- app/models/post.rb | 8 +- app/models/post_hotlinked_media.rb | 29 +++++++ app/models/upload.rb | 2 + ...20428094026_create_post_hotlinked_media.rb | 75 +++++++++++++++++++ lib/cooked_post_processor.rb | 6 +- spec/lib/cooked_post_processor_spec.rb | 26 +++++-- spec/models/post_spec.rb | 5 -- spec/requests/posts_controller_spec.rb | 5 +- 9 files changed, 164 insertions(+), 58 deletions(-) create mode 100644 app/models/post_hotlinked_media.rb create mode 100644 db/migrate/20220428094026_create_post_hotlinked_media.rb diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb index eeac9ff89ea..bf0aa6014c7 100644 --- a/app/jobs/regular/pull_hotlinked_images.rb +++ b/app/jobs/regular/pull_hotlinked_images.rb @@ -21,15 +21,9 @@ module Jobs raw = post.raw.dup start_raw = raw.dup - large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || [] - broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || [] - downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {} + hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h - upload_records = Upload.where(id: downloaded_image_ids.values) - upload_records = Hash[upload_records.map { |u| [u.id, u] }] - - downloaded_images = {} - downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] } + changed_hotlink_records = false extract_images_from(post.cooked).each do |node| download_src = original_src = node['src'] || node['href'] @@ -38,19 +32,32 @@ module Jobs next if !should_download_image?(download_src, post) - begin - already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src) - if !already_attempted_download - downloaded_images[normalized_src] = attempt_download(download_src, post.user_id) + hotlink_record = hotlinked_map[normalized_src] + + if hotlink_record.nil? + hotlinked_map[normalized_src] = hotlink_record = PostHotlinkedMedia.new( + post: post, + url: normalized_src + ) + begin + hotlink_record.upload = attempt_download(download_src, post.user_id) + hotlink_record.status = :downloaded + rescue ImageTooLargeError + hotlink_record.status = :too_large + rescue ImageBrokenError + hotlink_record.status = :download_failed + rescue UploadCreateError + hotlink_record.status = :upload_create_failed end - rescue ImageTooLargeError - large_image_urls << normalized_src - rescue ImageBrokenError - broken_image_urls << normalized_src + end + + if hotlink_record.changed? + changed_hotlink_records = true + hotlink_record.save! end # have we successfully downloaded that file? - if upload = downloaded_images[normalized_src] + if upload = hotlink_record&.upload raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw) end rescue => e @@ -58,26 +65,6 @@ module Jobs log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n")) end - large_image_urls.uniq! - broken_image_urls.uniq! - downloaded_images.compact! - - post.custom_fields[Post::LARGE_IMAGES] = large_image_urls - post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls - - downloaded_image_ids = {} - downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id } - post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids - - [Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key| - post.custom_fields.delete(key) if !post.custom_fields[key].present? - end - - custom_fields_updated = !post.custom_fields_clean? - - # only save custom fields if they changed - post.save_custom_fields if custom_fields_updated - # If post changed while we were downloading images, never apply edits post.reload post_changed_elsewhere = (start_raw != post.raw) @@ -86,7 +73,7 @@ module Jobs if !post_changed_elsewhere && raw_changed_here changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") } post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true) - elsif custom_fields_updated + elsif changed_hotlink_records post.trigger_post_process( bypass_bump: true, skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling @@ -127,6 +114,7 @@ module Jobs class ImageTooLargeError < StandardError; end class ImageBrokenError < StandardError; end + class UploadCreateError < StandardError; end def attempt_download(src, user_id) # secure-media-uploads endpoint prevents anonymous downloads, so we @@ -145,7 +133,7 @@ module Jobs upload else log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}") - nil + raise UploadCreateError end end diff --git a/app/models/post.rb b/app/models/post.rb index e3b74078a00..371ea3172a6 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -62,6 +62,8 @@ class Post < ActiveRecord::Base belongs_to :image_upload, class_name: "Upload" + has_many :post_hotlinked_media, dependent: :destroy, class_name: "PostHotlinkedMedia" + validates_with PostValidator, unless: :skip_validation after_commit :index_search @@ -78,6 +80,8 @@ class Post < ActiveRecord::Base SHORT_POST_CHARS ||= 1200 + # TODO: Drop the data from these three custom fields, + # drop the indexes, and remove the relavent constants register_custom_field_type(LARGE_IMAGES, :json) register_custom_field_type(BROKEN_IMAGES, :json) register_custom_field_type(DOWNLOADED_IMAGES, :json) @@ -714,8 +718,8 @@ class Post < ActiveRecord::Base end if invalidate_broken_images - custom_fields.delete(BROKEN_IMAGES) - save_custom_fields + post_hotlinked_media.download_failed.destroy_all + post_hotlinked_media.upload_create_failed.destroy_all end # Extracts urls from the body diff --git a/app/models/post_hotlinked_media.rb b/app/models/post_hotlinked_media.rb new file mode 100644 index 00000000000..6ff1f921736 --- /dev/null +++ b/app/models/post_hotlinked_media.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +class PostHotlinkedMedia < ActiveRecord::Base + belongs_to :post + belongs_to :upload + enum status: { + downloaded: "downloaded", + too_large: "too_large", + download_failed: "download_failed", + upload_create_failed: "upload_create_failed" + } +end + +# == Schema Information +# +# Table name: post_hotlinked_media +# +# id :bigint not null, primary key +# post_id :bigint not null +# url :string not null +# status :enum not null +# upload_id :bigint +# created_at :datetime not null +# updated_at :datetime not null +# +# Indexes +# +# index_post_hotlinked_media_on_post_id_and_url (post_id,url) UNIQUE +# diff --git a/app/models/upload.rb b/app/models/upload.rb index b24044b7a68..ff2aafa16e7 100644 --- a/app/models/upload.rb +++ b/app/models/upload.rb @@ -27,6 +27,8 @@ class Upload < ActiveRecord::Base has_many :post_uploads, dependent: :destroy has_many :posts, through: :post_uploads + has_many :post_hotlinked_media, dependent: :destroy, class_name: "PostHotlinkedMedia" + has_many :optimized_images, dependent: :destroy has_many :user_uploads, dependent: :destroy has_many :topic_thumbnails diff --git a/db/migrate/20220428094026_create_post_hotlinked_media.rb b/db/migrate/20220428094026_create_post_hotlinked_media.rb new file mode 100644 index 00000000000..847f9990173 --- /dev/null +++ b/db/migrate/20220428094026_create_post_hotlinked_media.rb @@ -0,0 +1,75 @@ +# frozen_string_literal: true + +class CreatePostHotlinkedMedia < ActiveRecord::Migration[6.1] + def change + reversible do |dir| + dir.up do + execute <<~SQL + CREATE TYPE hotlinked_media_status AS ENUM('downloaded', 'too_large', 'download_failed', 'upload_create_failed') + SQL + end + dir.down do + execute <<~SQL + DROP TYPE hotlinked_media_status + SQL + end + end + + create_table :post_hotlinked_media do |t| + t.bigint :post_id, null: false + t.string :url, null: false + t.column :status, :hotlinked_media_status, null: false + t.bigint :upload_id + t.timestamps + + t.index [:post_id, :url], unique: true + end + + reversible do |dir| + dir.up do + execute <<~SQL + INSERT INTO post_hotlinked_media (post_id, url, status, upload_id, created_at, updated_at) + SELECT + post_id, + obj.key AS url, + 'downloaded', + obj.value::bigint AS upload_id, + pcf.created_at, + pcf.updated_at + FROM post_custom_fields pcf + JOIN json_each_text(pcf.value::json) obj ON true + JOIN uploads ON obj.value::bigint = uploads.id + WHERE name='downloaded_images' + SQL + + execute <<~SQL + INSERT INTO post_hotlinked_media (post_id, url, status, upload_id, created_at, updated_at) + SELECT + post_id, + url.value, + 'download_failed', + NULL, + pcf.created_at, + pcf.updated_at + FROM post_custom_fields pcf + JOIN json_array_elements_text(pcf.value::json) url ON true + WHERE name='broken_images' + SQL + + execute <<~SQL + INSERT INTO post_hotlinked_media (post_id, url, status, upload_id, created_at, updated_at) + SELECT + post_id, + url.value, + 'too_large', + NULL, + pcf.created_at, + pcf.updated_at + FROM post_custom_fields pcf + JOIN json_array_elements_text(pcf.value::json) url ON true + WHERE name='large_images' + SQL + end + end + end +end diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb index 30092ed98a1..1160597ff07 100644 --- a/lib/cooked_post_processor.rb +++ b/lib/cooked_post_processor.rb @@ -146,15 +146,15 @@ class CookedPostProcessor end def large_images - @large_images ||= @post&.custom_fields[Post::LARGE_IMAGES].presence || [] + @large_images ||= @post&.post_hotlinked_media.select { |r| r.too_large? }.map(&:url) end def broken_images - @broken_images ||= @post&.custom_fields[Post::BROKEN_IMAGES].presence || [] + @broken_images ||= @post&.post_hotlinked_media.select { |r| r.download_failed? }.map(&:url) end def downloaded_images - @downloaded_images ||= @post&.downloaded_images || [] + @downloaded_images ||= @post&.post_hotlinked_media.select { |r| r.downloaded? }.map { |r| [r.url, r.upload_id] }.to_h end def convert_to_link!(img) diff --git a/spec/lib/cooked_post_processor_spec.rb b/spec/lib/cooked_post_processor_spec.rb index 5f0718325b2..e27261e1ea6 100644 --- a/spec/lib/cooked_post_processor_spec.rb +++ b/spec/lib/cooked_post_processor_spec.rb @@ -1080,8 +1080,7 @@ describe CookedPostProcessor do post = Fabricate(:post, raw: url) upload.update!(url: "https://test.s3.amazonaws.com/something.png") - post.custom_fields[Post::DOWNLOADED_IMAGES] = { "//image.com/avatar.png": upload.id } - post.save_custom_fields + PostHotlinkedMedia.create!(url: "//image.com/avatar.png", post: post, status: 'downloaded', upload: upload) cpp = CookedPostProcessor.new(post, invalidate_oneboxes: true) stub_image_size(width: 100, height: 200) @@ -1113,8 +1112,7 @@ describe CookedPostProcessor do post = Fabricate(:post, raw: url) upload.update!(url: "https://test.s3.amazonaws.com/something.png") - post.custom_fields[Post::DOWNLOADED_IMAGES] = { "//image.com/avatar.png": upload.id } - post.save_custom_fields + PostHotlinkedMedia.create!(url: "//image.com/avatar.png", post: post, status: 'downloaded', upload: upload) cooked_url = "https://localhost/secure-media-uploads/test.png" UrlHelper.expects(:cook_url).with(upload.url, secure: true).returns(cooked_url) @@ -1137,8 +1135,7 @@ describe CookedPostProcessor do post = Fabricate(:post, raw: url) - post.custom_fields[Post::LARGE_IMAGES] = ["//image.com/avatar.png"] - post.save_custom_fields + PostHotlinkedMedia.create!(url: "//image.com/avatar.png", post: post, status: 'too_large') cpp = CookedPostProcessor.new(post, invalidate_oneboxes: true) cpp.post_process @@ -1146,6 +1143,23 @@ describe CookedPostProcessor do expect(cpp.doc.to_s).to match(/
/) expect(cpp.doc.to_s).to include(I18n.t("upload.placeholders.too_large_humanized", max_size: "4 MB")) end + + it "replaces broken image placeholder" do + url = 'https://image.com/my-avatar' + image_url = 'https://image.com/avatar.png' + + Oneboxer.stubs(:onebox).with(url, anything).returns("") + + post = Fabricate(:post, raw: url) + + PostHotlinkedMedia.create!(url: "//image.com/avatar.png", post: post, status: 'download_failed') + + cpp = CookedPostProcessor.new(post, invalidate_oneboxes: true) + cpp.post_process + + expect(cpp.doc.to_s).to have_tag("span.broken-image") + expect(cpp.doc.to_s).to include(I18n.t("post.image_placeholder.broken")) + end end context "#post_process_oneboxes removes nofollow if add_rel_nofollow_to_user_content is disabled" do diff --git a/spec/models/post_spec.rb b/spec/models/post_spec.rb index dd0949e91e3..f338b776caf 100644 --- a/spec/models/post_spec.rb +++ b/spec/models/post_spec.rb @@ -1447,11 +1447,6 @@ describe Post do context "#link_post_uploads" do it "finds all the uploads in the post" do - post.custom_fields[Post::DOWNLOADED_IMAGES] = { - "/#{upload_path}/original/1X/1/1234567890123456.csv": attachment_upload.id - } - - post.save_custom_fields post.link_post_uploads expect(PostUpload.where(post: post).pluck(:upload_id)).to contain_exactly( diff --git a/spec/requests/posts_controller_spec.rb b/spec/requests/posts_controller_spec.rb index 7e39725e48e..c5b50cbe0f8 100644 --- a/spec/requests/posts_controller_spec.rb +++ b/spec/requests/posts_controller_spec.rb @@ -793,11 +793,10 @@ describe PostsController do it "will invalidate broken images cache" do sign_in(moderator) - post.custom_fields[Post::BROKEN_IMAGES] = ["https://example.com/image.jpg"] - post.save_custom_fields + PostHotlinkedMedia.create!(url: "https://example.com/image.jpg", post: post, status: 'download_failed') put "/posts/#{post.id}/rebake.json" post.reload - expect(post.custom_fields[Post::BROKEN_IMAGES]).to be_nil + expect(post.post_hotlinked_media).to eq([]) end end end