DEV: Add verified to uploads and fill in S3 inventory (#10406)

When the S3 inventory runs, uploads that are found in the inventory are marked as `verified = true`, uploads that are missing from it are marked as `verified = false`, and uploads that were not included in the check (or have not been checked yet) keep `verified = nil`.
This commit is contained in:
Martin Brennan 2020-08-11 14:43:51 +10:00 committed by GitHub
parent cfa37d1670
commit b950b3fb3f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 69 additions and 15 deletions

View File

@ -0,0 +1,7 @@
# frozen_string_literal: true
# Adds a nullable boolean `verified` column to the `uploads` table.
#
# The column is deliberately tri-state: true / false record the outcome of an
# S3 inventory check, while NULL means the upload has not been checked.
# No default is given, so existing rows start out as NULL.
class AddVerifiedColumnToUploads < ActiveRecord::Migration[6.0]
  def change
    add_column(:uploads, :verified, :boolean, null: true)
  end
end

View File

@ -67,6 +67,23 @@ class S3Inventory
.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
.where("#{table_name}.etag IS NULL")
# marking as verified/not verified
id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : ""
DB.exec(<<~SQL, inventory_date
UPDATE #{model.table_name}
SET verified = CASE when table_name_alias.etag IS NULL THEN false ELSE true END
FROM #{model.table_name} AS model_table
LEFT JOIN #{table_name} AS table_name_alias ON model_table.etag = table_name_alias.etag
WHERE model_table.id = #{model.table_name}.id
AND model_table.updated_at < ?
AND (
model_table.verified IS NULL OR
model_table.verified <> CASE when table_name_alias.etag IS NULL THEN false ELSE true END
)
#{id_threshold_clause}
SQL
)
if (missing_count = missing_uploads.count) > 0
missing_uploads.select(:id, :url).find_each do |upload|
log upload.url

View File

@ -58,27 +58,57 @@ describe "S3Inventory" do
expect(output).to eq("Failed to list inventory from S3\n")
end
it "should display missing uploads correctly" do
freeze_time
describe "verifying uploads" do
before do
freeze_time
CSV.foreach(csv_filename, headers: false) do |row|
next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
CSV.foreach(csv_filename, headers: false) do |row|
next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
end
@upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
@upload2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
@no_etag = Fabricate(:upload, updated_at: 2.days.ago)
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
inventory.expects(:inventory_date).times(2).returns(Time.now)
end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
no_etag = Fabricate(:upload, updated_at: 2.days.ago)
it "should display missing uploads correctly" do
output = capture_stdout do
inventory.backfill_etags_and_list_missing
end
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
inventory.expects(:inventory_date).returns(Time.now)
output = capture_stdout do
inventory.backfill_etags_and_list_missing
expect(output).to eq("#{@upload1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end
expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
expect(Upload.where(verified: nil).count).to eq(12)
output = capture_stdout do
inventory.backfill_etags_and_list_missing
end
verified = Upload.pluck(:verified)
expect(Upload.where(verified: true).count).to eq(3)
expect(Upload.where(verified: false).count).to eq(2)
expect(Upload.where(verified: nil).count).to eq(7)
end
it "does not affect the updated_at date of uploads" do
upload_1_updated = @upload1.updated_at
upload_2_updated = @upload2.updated_at
no_etag_updated = @no_etag.updated_at
output = capture_stdout do
inventory.backfill_etags_and_list_missing
end
expect(@upload1.reload.updated_at).to eq_time(upload_1_updated)
expect(@upload2.reload.updated_at).to eq_time(upload_2_updated)
expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
end
end
it "should backfill etags to uploads table correctly" do