FEATURE: Use amazon s3 inventory to manage upload stats (#6867)
This commit is contained in:
parent
a1b4d9b061
commit
b4f713ca52
|
@ -0,0 +1,17 @@
|
|||
require "s3_inventory"
|
||||
|
||||
module Jobs
|
||||
# if upload bucket changes or inventory bucket changes we want to update s3 bucket policy and inventory configuration
|
||||
class UpdateS3Inventory < Jobs::Base
|
||||
|
||||
def execute(args)
|
||||
return unless SiteSetting.enable_s3_inventory? && SiteSetting.enable_s3_uploads?
|
||||
|
||||
[:upload, :optimized].each do |type|
|
||||
s3_inventory = S3Inventory.new(Discourse.store.s3_helper, type)
|
||||
s3_inventory.update_bucket_policy if type == :upload
|
||||
s3_inventory.update_bucket_inventory_configuration
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -191,6 +191,7 @@ en:
|
|||
other: "You specified the invalid choices %{name}"
|
||||
default_categories_already_selected: "You cannot select a category used in another list."
|
||||
s3_upload_bucket_is_required: "You cannot enable uploads to S3 unless you've provided the 's3_upload_bucket'."
|
||||
enable_s3_uploads_is_required: "You cannot enable inventory to S3 unless you've enabled the S3 uploads."
|
||||
s3_backup_requires_s3_settings: "You cannot use S3 as backup location unless you've provided the '%{setting_name}'."
|
||||
s3_bucket_reused: "You cannot use the same bucket for 's3_upload_bucket' and 's3_backup_bucket'. Choose a different bucket or use a different path for each bucket."
|
||||
conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to <br><a href="https://meta.discourse.org/t/76575">https://meta.discourse.org/t/76575</a>'
|
||||
|
@ -1488,6 +1489,7 @@ en:
|
|||
s3_force_path_style: "Enforce path-style addressing for your custom endpoint. IMPORTANT: Required for using Minio uploads and backups."
|
||||
s3_configure_tombstone_policy: "Enable automatic deletion policy for tombstone uploads. IMPORTANT: If disabled, no space will be reclaimed after uploads are deleted."
|
||||
s3_disable_cleanup: "Disable the removal of backups from S3 when removed locally."
|
||||
enable_s3_inventory: "Generate reports and verify uploads using Amazon S3 inventory. IMPORTANT: requires valid S3 credentials (both access key id & secret access key)."
|
||||
backup_time_of_day: "Time of day UTC when the backup should occur."
|
||||
backup_with_uploads: "Include uploads in scheduled backups. Disabling this will only backup the database."
|
||||
backup_location: "Location where backups are stored. IMPORTANT: S3 requires valid S3 credentials entered in Files settings."
|
||||
|
|
|
@ -1050,6 +1050,8 @@ files:
|
|||
s3_configure_tombstone_policy:
|
||||
default: true
|
||||
shadowed_by_global: true
|
||||
enable_s3_inventory:
|
||||
default: false
|
||||
allow_profile_backgrounds:
|
||||
client: true
|
||||
default: true
|
||||
|
|
|
@ -32,9 +32,7 @@ module BackupRestore
|
|||
end
|
||||
|
||||
def download_file(filename, destination_path, failure_message = nil)
|
||||
unless @s3_helper.object(filename).download_file(destination_path)
|
||||
raise failure_message&.to_s || "Failed to download file"
|
||||
end
|
||||
@s3_helper.download_file(filename, destination_path, failure_message)
|
||||
end
|
||||
|
||||
def upload_file(filename, source_path, content_type)
|
||||
|
|
|
@ -463,6 +463,11 @@ module Discourse
|
|||
end
|
||||
end
|
||||
|
||||
DiscourseEvent.on(:site_setting_saved) do |site_setting|
|
||||
name = site_setting.name.to_s
|
||||
Jobs.enqueue(:update_s3_inventory) if name.include?("s3_inventory") || name == "s3_upload_bucket"
|
||||
end
|
||||
|
||||
def self.current_user_provider
|
||||
@current_user_provider || Auth::DefaultCurrentUserProvider
|
||||
end
|
||||
|
|
|
@ -124,9 +124,15 @@ module FileStore
|
|||
end
|
||||
|
||||
def list_missing_uploads(skip_optimized: false)
|
||||
if SiteSetting.enable_s3_inventory
|
||||
require 's3_inventory'
|
||||
S3Inventory.new(s3_helper, :upload).list_missing
|
||||
S3Inventory.new(s3_helper, :optimized).list_missing unless skip_optimized
|
||||
else
|
||||
list_missing(Upload, "original/")
|
||||
list_missing(OptimizedImage, "optimized/") unless skip_optimized
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
|
@ -140,7 +146,7 @@ module FileStore
|
|||
verified_ids = []
|
||||
|
||||
files.each do |f|
|
||||
id = model.where("url LIKE '%#{f.key}'").pluck(:id).first if f.size > 0
|
||||
id = model.where("url LIKE '%#{f.key}' AND etag = '#{f.etag}'").pluck(:id).first
|
||||
verified_ids << id if id.present?
|
||||
marker = f.key
|
||||
end
|
||||
|
@ -150,7 +156,7 @@ module FileStore
|
|||
files = @s3_helper.list(prefix, marker)
|
||||
end
|
||||
|
||||
missing_uploads = model.where("id NOT IN (SELECT val FROM verified_ids)")
|
||||
missing_uploads = model.joins('LEFT JOIN verified_ids ON verified_ids.val = id').where("verified_ids.val IS NULL")
|
||||
missing_count = missing_uploads.count
|
||||
|
||||
if missing_count > 0
|
||||
|
|
|
@ -205,6 +205,20 @@ class S3Helper
|
|||
opts
|
||||
end
|
||||
|
||||
def download_file(filename, destination_path, failure_message = nil)
|
||||
unless object(filename).download_file(destination_path)
|
||||
raise failure_message&.to_s || "Failed to download file"
|
||||
end
|
||||
end
|
||||
|
||||
def s3_client
|
||||
@s3_client ||= Aws::S3::Client.new(@s3_options)
|
||||
end
|
||||
|
||||
def s3_inventory_path(path = 'inventory')
|
||||
get_path_for_s3_upload(path)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def default_s3_options
|
||||
|
@ -228,10 +242,6 @@ class S3Helper
|
|||
File.join("uploads", RailsMultisite::ConnectionManagement.current_db, "/")
|
||||
end
|
||||
|
||||
def s3_client
|
||||
@s3_client ||= Aws::S3::Client.new(@s3_options)
|
||||
end
|
||||
|
||||
def s3_resource
|
||||
Aws::S3::Resource.new(client: s3_client)
|
||||
end
|
||||
|
|
|
@ -0,0 +1,193 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
require "aws-sdk-s3"
|
||||
require "csv"
|
||||
|
||||
class S3Inventory
|
||||
|
||||
attr_reader :inventory_id, :csv_filename, :model
|
||||
|
||||
CSV_KEY_INDEX ||= 1
|
||||
CSV_ETAG_INDEX ||= 2
|
||||
INVENTORY_PREFIX ||= "inventory"
|
||||
|
||||
def initialize(s3_helper, type)
|
||||
@s3_helper = s3_helper
|
||||
|
||||
if type == :upload
|
||||
@inventory_id = "original"
|
||||
@model = Upload
|
||||
elsif type == :optimized
|
||||
@inventory_id = "optimized"
|
||||
@model = OptimizedImage
|
||||
end
|
||||
end
|
||||
|
||||
def file
|
||||
@file ||= unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
|
||||
end
|
||||
|
||||
def list_missing
|
||||
if file.blank?
|
||||
error("Failed to list inventory from S3")
|
||||
return
|
||||
end
|
||||
|
||||
DistributedMutex.synchronize("s3_inventory_list_missing_#{inventory_id}") do
|
||||
current_db = RailsMultisite::ConnectionManagement.current_db
|
||||
timestamp = Time.now.strftime("%Y-%m-%d-%H%M%S")
|
||||
@tmp_directory = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db, timestamp)
|
||||
@archive_filename = File.join(@tmp_directory, File.basename(file.key))
|
||||
@csv_filename = @archive_filename[0...-3]
|
||||
|
||||
FileUtils.mkdir_p(@tmp_directory)
|
||||
download_inventory_file_to_tmp_directory
|
||||
decompress_inventory_file
|
||||
|
||||
begin
|
||||
table_name = "#{inventory_id}_inventory"
|
||||
connection = ActiveRecord::Base.connection.raw_connection
|
||||
connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text PRIMARY KEY)")
|
||||
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
|
||||
CSV.foreach(csv_filename, headers: false) do |row|
|
||||
connection.put_copy_data("#{row[CSV_KEY_INDEX]},#{row[CSV_ETAG_INDEX]}\n")
|
||||
end
|
||||
end
|
||||
|
||||
uploads = model.where("created_at < ?", file.last_modified)
|
||||
missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
|
||||
|
||||
if (missing_count = missing_uploads.count) > 0
|
||||
missing_uploads.select(:id, :url).find_each do |upload|
|
||||
log upload.url
|
||||
end
|
||||
|
||||
log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
|
||||
end
|
||||
ensure
|
||||
connection.exec("DROP TABLE #{table_name}") unless connection.nil?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def download_inventory_file_to_tmp_directory
|
||||
log "Downloading inventory file to tmp directory..."
|
||||
failure_message = "Failed to inventory file to tmp directory."
|
||||
|
||||
@s3_helper.download_file(file.key, @archive_filename, failure_message)
|
||||
end
|
||||
|
||||
def decompress_inventory_file
|
||||
log "Decompressing inventory file, this may take a while..."
|
||||
|
||||
FileUtils.cd(@tmp_directory) do
|
||||
Discourse::Utils.execute_command('gzip', '--decompress', @archive_filename, failure_message: "Failed to decompress inventory file.")
|
||||
end
|
||||
end
|
||||
|
||||
def update_bucket_policy
|
||||
@s3_helper.s3_client.put_bucket_policy(
|
||||
bucket: bucket_name,
|
||||
policy: {
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Sid": "InventoryAndAnalyticsPolicy",
|
||||
"Effect": "Allow",
|
||||
"Principal": { "Service": "s3.amazonaws.com" },
|
||||
"Action": ["s3:PutObject"],
|
||||
"Resource": ["arn:aws:s3:::#{inventory_root_path}/*"],
|
||||
"Condition": {
|
||||
"ArnLike": {
|
||||
"aws:SourceArn": "arn:aws:s3:::#{bucket_name}"
|
||||
},
|
||||
"StringEquals": {
|
||||
"s3:x-amz-acl": "bucket-owner-full-control"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}.to_json
|
||||
)
|
||||
end
|
||||
|
||||
def update_bucket_inventory_configuration
|
||||
@s3_helper.s3_client.put_bucket_inventory_configuration(
|
||||
bucket: bucket_name,
|
||||
id: inventory_id,
|
||||
inventory_configuration: inventory_configuration,
|
||||
use_accelerate_endpoint: false
|
||||
)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def inventory_configuration
|
||||
filter_prefix = inventory_id
|
||||
destination_prefix = File.join(INVENTORY_PREFIX, inventory_id)
|
||||
|
||||
if bucket_folder_path.present?
|
||||
filter_prefix = File.join(bucket_folder_path, filter_prefix)
|
||||
destination_prefix = File.join(bucket_folder_path, destination_prefix)
|
||||
end
|
||||
|
||||
{
|
||||
destination: {
|
||||
s3_bucket_destination: {
|
||||
bucket: "arn:aws:s3:::#{bucket_name}",
|
||||
prefix: destination_prefix,
|
||||
format: "CSV"
|
||||
}
|
||||
},
|
||||
filter: {
|
||||
prefix: filter_prefix
|
||||
},
|
||||
is_enabled: SiteSetting.enable_s3_inventory,
|
||||
id: inventory_id,
|
||||
included_object_versions: "Current",
|
||||
optional_fields: ["ETag"],
|
||||
schedule: {
|
||||
frequency: "Daily"
|
||||
}
|
||||
}
|
||||
end
|
||||
|
||||
def bucket_name
|
||||
@s3_helper.s3_bucket_name
|
||||
end
|
||||
|
||||
def bucket_folder_path
|
||||
@s3_helper.s3_bucket_folder_path
|
||||
end
|
||||
|
||||
def unsorted_files
|
||||
objects = []
|
||||
|
||||
@s3_helper.list(File.join(inventory_path, "data")).each do |obj|
|
||||
if obj.key.match?(/\.csv\.gz$/i)
|
||||
objects << obj
|
||||
end
|
||||
end
|
||||
|
||||
objects
|
||||
rescue Aws::Errors::ServiceError => e
|
||||
log("Failed to list inventory from S3", e)
|
||||
end
|
||||
|
||||
def inventory_path
|
||||
File.join(inventory_root_path, inventory_id)
|
||||
end
|
||||
|
||||
def inventory_root_path
|
||||
File.join(bucket_name, bucket_folder_path || "", INVENTORY_PREFIX)
|
||||
end
|
||||
|
||||
def log(message, ex = nil)
|
||||
puts(message)
|
||||
Rails.logger.error("#{ex}\n" + (ex.backtrace || []).join("\n")) if ex
|
||||
end
|
||||
|
||||
def error(message)
|
||||
log(message, StandardError.new(message))
|
||||
end
|
||||
end
|
|
@ -53,6 +53,10 @@ module SiteSettings::Validations
|
|||
validate_error :s3_upload_bucket_is_required if new_val == "t" && SiteSetting.s3_upload_bucket.blank?
|
||||
end
|
||||
|
||||
def validate_enable_s3_inventory(new_val)
|
||||
validate_error :enable_s3_uploads_is_required if new_val == "t" && !SiteSetting.enable_s3_uploads?
|
||||
end
|
||||
|
||||
def validate_backup_location(new_val)
|
||||
return unless new_val == BackupLocationSiteSetting::S3
|
||||
validate_error(:s3_backup_requires_s3_settings, setting_name: "s3_backup_bucket") if SiteSetting.s3_backup_bucket.blank?
|
||||
|
|
|
@ -11,16 +11,6 @@ describe Migration::SafeMigrate do
|
|||
Migration::SafeMigrate::SafeMigration.enable_safe!
|
||||
end
|
||||
|
||||
def capture_stdout
|
||||
old_stdout = $stdout
|
||||
io = StringIO.new
|
||||
$stdout = io
|
||||
yield
|
||||
io.string
|
||||
ensure
|
||||
$stdout = old_stdout
|
||||
end
|
||||
|
||||
def migrate_up(path)
|
||||
migrations = ActiveRecord::MigrationContext.new(path).migrations
|
||||
ActiveRecord::Migrator.new(:up, migrations, migrations.first.version).run
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
require "rails_helper"
|
||||
require "s3_helper"
|
||||
require "s3_inventory"
|
||||
require "file_store/s3_store"
|
||||
|
||||
describe "S3Inventory" do
|
||||
let(:client) { Aws::S3::Client.new(stub_responses: true) }
|
||||
let(:helper) { S3Helper.new(SiteSetting.Upload.s3_upload_bucket.downcase, "", client: client) }
|
||||
let(:store) { FileStore::S3Store.new(helper) }
|
||||
let(:inventory) { S3Inventory.new(helper, :upload) }
|
||||
let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" }
|
||||
|
||||
before do
|
||||
SiteSetting.enable_s3_uploads = true
|
||||
SiteSetting.s3_access_key_id = "abc"
|
||||
SiteSetting.s3_secret_access_key = "def"
|
||||
SiteSetting.enable_s3_inventory = true
|
||||
|
||||
client.stub_responses(:list_objects,
|
||||
contents: [
|
||||
{
|
||||
etag: "\"70ee1738b6b21e2c8a43f3a5ab0eee71\"",
|
||||
key: "example1.csv.gz",
|
||||
last_modified: Time.parse("2014-11-21T19:40:05.000Z"),
|
||||
owner: {
|
||||
display_name: "myname",
|
||||
id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
|
||||
},
|
||||
size: 11,
|
||||
storage_class: "STANDARD",
|
||||
},
|
||||
{
|
||||
etag: "\"9c8af9a76df052144598c115ef33e511\"",
|
||||
key: "example2.csv.gz",
|
||||
last_modified: Time.parse("2013-11-15T01:10:49.000Z"),
|
||||
owner: {
|
||||
display_name: "myname",
|
||||
id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
|
||||
},
|
||||
size: 713193,
|
||||
storage_class: "STANDARD",
|
||||
}
|
||||
],
|
||||
next_marker: "eyJNYXJrZXIiOiBudWxsLCAiYm90b190cnVuY2F0ZV9hbW91bnQiOiAyfQ=="
|
||||
)
|
||||
end
|
||||
|
||||
it "should return the latest inventory file name" do
|
||||
expect(inventory.file.key).to eq("example1.csv.gz")
|
||||
end
|
||||
|
||||
it "should raise error if an inventory file is not found" do
|
||||
client.stub_responses(:list_objects, contents: [])
|
||||
output = capture_stdout { inventory.list_missing }
|
||||
expect(output).to eq("Failed to list inventory from S3\n")
|
||||
end
|
||||
|
||||
it "should display missing uploads correctly" do
|
||||
freeze_time
|
||||
|
||||
CSV.foreach(csv_filename, headers: false) do |row|
|
||||
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], created_at: 2.days.ago)
|
||||
end
|
||||
upload = Fabricate(:upload, etag: "ETag", created_at: 1.days.ago)
|
||||
Fabricate(:upload, etag: "ETag2", created_at: Time.now)
|
||||
|
||||
inventory.expects(:decompress_inventory_file)
|
||||
inventory.expects(:csv_filename).returns(csv_filename)
|
||||
inventory.file.expects(:last_modified).returns(Time.now)
|
||||
|
||||
output = capture_stdout do
|
||||
inventory.list_missing
|
||||
end
|
||||
|
||||
expect(output).to eq("Downloading inventory file to tmp directory...\n#{upload.url}\n1 of 4 uploads are missing\n")
|
||||
end
|
||||
end
|
|
@ -0,0 +1,3 @@
|
|||
"abc","original/0184537a4f419224404d013414e913a4f56018f2.jpg","defcaac0b4aca535c284e95f30d608d0"
|
||||
"abc","original/050afc0ab01debe8cf48fd2ce50fbbf5eb072815.jpg","0cdc623af39cde0adb382670a6dc702a"
|
||||
"abc","original/0789fbf5490babc68326b9cec90eeb0d6590db05.png","25c02eaceef4cb779fc17030d33f7f06"
|
|
|
@ -0,0 +1,50 @@
|
|||
require 'rails_helper'
|
||||
require "file_store/s3_store"
|
||||
|
||||
describe Jobs::UpdateS3Inventory do
|
||||
before do
|
||||
SiteSetting.enable_s3_uploads = true
|
||||
SiteSetting.s3_access_key_id = "abc"
|
||||
SiteSetting.s3_secret_access_key = "def"
|
||||
SiteSetting.enable_s3_inventory = true
|
||||
|
||||
store = FileStore::S3Store.new
|
||||
@client = Aws::S3::Client.new(stub_responses: true)
|
||||
store.s3_helper.stubs(:s3_client).returns(@client)
|
||||
Discourse.stubs(:store).returns(store)
|
||||
end
|
||||
|
||||
it "updates the bucket policy and inventory configuration in S3" do
|
||||
id = "original"
|
||||
@client.expects(:put_bucket_policy).with(
|
||||
bucket: "bucket",
|
||||
policy: "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"InventoryAndAnalyticsPolicy\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"s3.amazonaws.com\"},\"Action\":[\"s3:PutObject\"],\"Resource\":[\"arn:aws:s3:::bucket/inventory/*\"],\"Condition\":{\"ArnLike\":{\"aws:SourceArn\":\"arn:aws:s3:::bucket\"},\"StringEquals\":{\"s3:x-amz-acl\":\"bucket-owner-full-control\"}}}]}"
|
||||
)
|
||||
@client.expects(:put_bucket_inventory_configuration)
|
||||
@client.expects(:put_bucket_inventory_configuration).with(
|
||||
bucket: "bucket",
|
||||
id: id,
|
||||
inventory_configuration: {
|
||||
destination: {
|
||||
s3_bucket_destination: {
|
||||
bucket: "arn:aws:s3:::bucket",
|
||||
prefix: "inventory/#{id}",
|
||||
format: "CSV"
|
||||
}
|
||||
},
|
||||
filter: {
|
||||
prefix: id
|
||||
},
|
||||
is_enabled: true,
|
||||
id: id,
|
||||
included_object_versions: "Current",
|
||||
optional_fields: ["ETag"],
|
||||
schedule: { frequency: "Daily" }
|
||||
},
|
||||
use_accelerate_endpoint: false
|
||||
)
|
||||
|
||||
described_class.new.execute(nil)
|
||||
end
|
||||
|
||||
end
|
|
@ -102,4 +102,14 @@ module Helpers
|
|||
tag_group.tags << (Tag.where(name: name).first || Fabricate(:tag, name: name))
|
||||
end
|
||||
end
|
||||
|
||||
def capture_stdout
|
||||
old_stdout = $stdout
|
||||
io = StringIO.new
|
||||
$stdout = io
|
||||
yield
|
||||
io.string
|
||||
ensure
|
||||
$stdout = old_stdout
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue