From dc55b645b288d1ff80ce4f6f6b3cc1381c539fdc Mon Sep 17 00:00:00 2001 From: Alan Guo Xiang Tan Date: Thu, 30 May 2024 08:37:38 +0800 Subject: [PATCH] DEV: Allow site administrators to mark S3 uploads with a missing status (#27222) This commit introduces the following changes which allows a site administrator to mark `Upload` records with the `s3_file_missing` verification status which will result in the `Upload` record being ignored when `Discourse.store.list_missing_uploads` is ran on a site where S3 uploads are enabled and `SiteSetting.enable_s3_inventory` is set to `true`. 1. Introduce `s3_file_missing` to `Upload.verification_statuses` 2. Introduce `Upload.mark_invalid_s3_uploads_as_missing` which updates `Upload#verification_status` of all `Upload` records from `invalid_etag` to `s3_file_missing`. 3. Introduce `rake uploads:mark_invalid_s3_uploads_as_missing` Rake task which allows a site administrator to change `Upload` records with `invalid_etag` verification status to the `s3_file_missing` verificaton_status. 4. Update `S3Inventory` to ignore `Upload` records with the `s3_file_missing` verification status. --- app/models/upload.rb | 22 ++++++++++++- lib/s3_inventory.rb | 59 ++++++++++++++++++++--------------- lib/tasks/uploads.rake | 34 ++++++++++++++++---- spec/lib/s3_inventory_spec.rb | 30 +++++++++++------- spec/models/upload_spec.rb | 24 ++++++++++++++ 5 files changed, 125 insertions(+), 44 deletions(-) diff --git a/app/models/upload.rb b/app/models/upload.rb index 665ac847ed3..ec60031cf0d 100644 --- a/app/models/upload.rb +++ b/app/models/upload.rb @@ -57,8 +57,28 @@ class Upload < ActiveRecord::Base scope :by_users, -> { where("uploads.id > ?", SEEDED_ID_THRESHOLD) } + scope :without_s3_file_missing_confirmed_verification_status, + -> do + where.not(verification_status: Upload.verification_statuses[:s3_file_missing_confirmed]) + end + + scope :with_invalid_etag_verification_status, + -> { where(verification_status: Upload.verification_statuses[:invalid_etag]) } + def self.verification_statuses - @verification_statuses ||= Enum.new(unchecked: 1, verified: 2, invalid_etag: 3) + @verification_statuses ||= + Enum.new( + unchecked: 1, + verified: 2, + invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file + s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store + ) + end + + def self.mark_invalid_s3_uploads_as_missing + Upload.with_invalid_etag_verification_status.update_all( + verification_status: Upload.verification_statuses[:s3_file_missing_confirmed], + ) end def self.add_unused_callback(&block) diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 973dd75e02f..73fb7ab46a4 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -4,7 +4,7 @@ require "aws-sdk-s3" require "csv" class S3Inventory - attr_reader :type, :model, :inventory_date + attr_reader :type, :inventory_date CSV_KEY_INDEX = 1 CSV_ETAG_INDEX = 2 @@ -25,9 +25,12 @@ class S3Inventory if type == :upload @type = "original" @model = Upload + @scope = @model.by_users.without_s3_file_missing_confirmed_verification_status elsif type == :optimized @type = "optimized" - @model = OptimizedImage + @scope = @model = OptimizedImage + else + raise "Invalid type: #{type}" end end @@ -46,10 +49,10 @@ class S3Inventory ActiveRecord::Base.transaction do begin connection.exec( - "CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))", + "CREATE TEMP TABLE #{tmp_table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))", ) - connection.copy_data("COPY #{table_name} FROM STDIN CSV") do + connection.copy_data("COPY #{tmp_table_name} FROM STDIN CSV") do for_each_inventory_row do |row| key = row[CSV_KEY_INDEX] @@ -61,66 +64,70 @@ class S3Inventory end end + table_name = @model.table_name + # backfilling etags connection.async_exec( - "UPDATE #{model.table_name} - SET etag = #{table_name}.etag - FROM #{table_name} - WHERE #{model.table_name}.etag IS NULL AND - #{model.table_name}.url = #{table_name}.url", + "UPDATE #{table_name} + SET etag = #{tmp_table_name}.etag + FROM #{tmp_table_name} + WHERE #{table_name}.etag IS NULL AND + #{table_name}.url = #{tmp_table_name}.url", ) - uploads = model.where("updated_at < ?", inventory_date) - uploads = uploads.by_users if model == Upload + uploads = @scope.where("updated_at < ?", inventory_date) missing_uploads = uploads.joins( - "LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag", - ).where("#{table_name}.etag IS NULL") + "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag", + ).where("#{tmp_table_name}.etag IS NULL") exists_with_different_etag = missing_uploads .joins( - "LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url", + "LEFT JOIN #{tmp_table_name} inventory2 ON inventory2.url = #{table_name}.url", ) .where("inventory2.etag IS NOT NULL") .pluck(:id) # marking as verified/not verified - if model == Upload + if @model == Upload sql_params = { inventory_date: inventory_date, invalid_etag: Upload.verification_statuses[:invalid_etag], + s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed], verified: Upload.verification_statuses[:verified], - seeded_id_threshold: model::SEEDED_ID_THRESHOLD, + seeded_id_threshold: @model::SEEDED_ID_THRESHOLD, } DB.exec(<<~SQL, sql_params) - UPDATE #{model.table_name} + UPDATE #{table_name} SET verification_status = :verified WHERE etag IS NOT NULL AND verification_status <> :verified + AND verification_status <> :s3_file_missing_confirmed AND updated_at < :inventory_date AND id > :seeded_id_threshold AND EXISTS ( SELECT 1 - FROM #{table_name} - WHERE #{table_name}.etag = #{model.table_name}.etag + FROM #{tmp_table_name} + WHERE #{tmp_table_name}.etag = #{table_name}.etag ) SQL DB.exec(<<~SQL, sql_params) - UPDATE #{model.table_name} + UPDATE #{table_name} SET verification_status = :invalid_etag WHERE verification_status <> :invalid_etag + AND verification_status <> :s3_file_missing_confirmed AND updated_at < :inventory_date AND id > :seeded_id_threshold AND NOT EXISTS ( SELECT 1 - FROM #{table_name} - WHERE #{table_name}.etag = #{model.table_name}.etag + FROM #{tmp_table_name} + WHERE #{tmp_table_name}.etag = #{table_name}.etag ) SQL end @@ -136,16 +143,16 @@ class S3Inventory end end - log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing" + log "#{missing_count} of #{uploads.count} #{@scope.name.underscore.pluralize} are missing" if exists_with_different_etag.present? log "#{exists_with_different_etag.count} of these are caused by differing etags" log "Null the etag column and re-run for automatic backfill" end end - Discourse.stats.set("missing_s3_#{model.table_name}", missing_count) + Discourse.stats.set("missing_s3_#{table_name}", missing_count) ensure - connection.exec("DROP TABLE #{table_name}") unless connection.nil? + connection.exec("DROP TABLE #{tmp_table_name}") unless connection.nil? end end ensure @@ -255,7 +262,7 @@ class S3Inventory @connection ||= ActiveRecord::Base.connection.raw_connection end - def table_name + def tmp_table_name "#{type}_inventory" end diff --git a/lib/tasks/uploads.rake b/lib/tasks/uploads.rake index 1b3a0253545..8e01c7215a0 100644 --- a/lib/tasks/uploads.rake +++ b/lib/tasks/uploads.rake @@ -949,7 +949,7 @@ def analyze_missing_s3 puts end - missing_uploads = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]) + missing_uploads = Upload.with_invalid_etag_verification_status puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}" puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads" puts "Other missing uploads count: #{other.count}" @@ -991,16 +991,15 @@ def analyze_missing_s3 end def delete_missing_s3 - missing = - Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).order( - :created_at, - ) + missing = Upload.with_invalid_etag_verification_status.order(:created_at) count = missing.count + if count > 0 puts "The following uploads will be deleted from the database" missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" } puts "Please confirm you wish to delete #{count} upload records by typing YES" confirm = STDIN.gets.strip + if confirm == "YES" missing.destroy_all puts "#{count} records were deleted" @@ -1011,6 +1010,29 @@ def delete_missing_s3 end end +task "uploads:mark_invalid_s3_uploads_as_missing" => :environment do + puts "Marking invalid S3 uploads as missing for '#{RailsMultisite::ConnectionManagement.current_db}'..." + invalid_s3_uploads = Upload.with_invalid_etag_verification_status.order(:created_at) + count = invalid_s3_uploads.count + + if count > 0 + puts "The following uploads will be marked as missing on S3" + invalid_s3_uploads.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" } + puts "Please confirm you wish to mark #{count} upload records as missing by typing YES" + confirm = STDIN.gets.strip + + if confirm == "YES" + changed_count = Upload.mark_invalid_s3_uploads_as_missing + puts "#{changed_count} records were marked as missing" + else + STDERR.puts "Aborting" + exit 1 + end + else + puts "No uploads found with invalid S3 etag verification status" + end +end + task "uploads:delete_missing_s3" => :environment do if RailsMultisite::ConnectionManagement.current_db != "default" delete_missing_s3 @@ -1031,7 +1053,7 @@ def fix_missing_s3 Jobs.run_immediately! puts "Attempting to download missing uploads and recreate" - ids = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).pluck(:id) + ids = Upload.with_invalid_etag_verification_status.pluck(:id) ids.each do |id| upload = Upload.find_by(id: id) next if !upload diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb index 3070fb99538..c1b07b743eb 100644 --- a/spec/lib/s3_inventory_spec.rb +++ b/spec/lib/s3_inventory_spec.rb @@ -38,10 +38,18 @@ RSpec.describe S3Inventory do ) end - @upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) - @upload2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now) + @upload_1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) + @upload_2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now) @no_etag = Fabricate(:upload, updated_at: 2.days.ago) + @upload_3 = + Fabricate( + :upload, + etag: "ETag3", + updated_at: 2.days.ago, + verification_status: Upload.verification_statuses[:s3_file_missing_confirmed], + ) + inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3) inventory.expects(:inventory_date).times(2).returns(Time.now) end @@ -49,7 +57,7 @@ RSpec.describe S3Inventory do it "should display missing uploads correctly" do output = capture_stdout { inventory.backfill_etags_and_list_missing } - expect(output).to eq("#{@upload1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n") + expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n") expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) end @@ -61,7 +69,7 @@ RSpec.describe S3Inventory do expect(output).to eq(<<~TEXT) #{differing_etag.url} has different etag - #{@upload1.url} + #{@upload_1.url} #{@no_etag.url} 3 of 5 uploads are missing 1 of these are caused by differing etags @@ -80,23 +88,23 @@ RSpec.describe S3Inventory do expect( Upload.where(verification_status: Upload.verification_statuses[:verified]).count, ).to eq(3) - expect( - Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).count, - ).to eq(2) + + expect(Upload.with_invalid_etag_verification_status.count).to eq(2) + expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, ).to eq(7) end it "does not affect the updated_at date of uploads" do - upload_1_updated = @upload1.updated_at - upload_2_updated = @upload2.updated_at + upload_1_updated = @upload_1.updated_at + upload_2_updated = @upload_2.updated_at no_etag_updated = @no_etag.updated_at output = capture_stdout { inventory.backfill_etags_and_list_missing } - expect(@upload1.reload.updated_at).to eq_time(upload_1_updated) - expect(@upload2.reload.updated_at).to eq_time(upload_2_updated) + expect(@upload_1.reload.updated_at).to eq_time(upload_1_updated) + expect(@upload_2.reload.updated_at).to eq_time(upload_2_updated) expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated) end end diff --git a/spec/models/upload_spec.rb b/spec/models/upload_spec.rb index 0fbd19ee48b..af5cd175b7b 100644 --- a/spec/models/upload_spec.rb +++ b/spec/models/upload_spec.rb @@ -809,4 +809,28 @@ RSpec.describe Upload do expect { u.update!(dominant_color: "abcd") }.to raise_error(ActiveRecord::RecordInvalid) end end + + describe ".mark_invalid_s3_uploads_as_missing" do + it "should update all upload records with a `verification_status` of `invalid_etag` to `s3_file_missing`" do + upload_1 = + Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag]) + + upload_2 = + Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag]) + + upload_3 = Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:verified]) + + Upload.mark_invalid_s3_uploads_as_missing + + expect(upload_1.reload.verification_status).to eq( + Upload.verification_statuses[:s3_file_missing_confirmed], + ) + + expect(upload_2.reload.verification_status).to eq( + Upload.verification_statuses[:s3_file_missing_confirmed], + ) + + expect(upload_3.reload.verification_status).to eq(Upload.verification_statuses[:verified]) + end + end end