2019-01-31 23:40:48 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
require "aws-sdk-s3"
|
|
|
|
require "csv"
|
|
|
|
|
|
|
|
# Compares the records of a model (Upload or OptimizedImage) against the
# objects listed in an S3 inventory report, backfills missing etags and logs
# the records whose etag is absent from the inventory.
#
# It also knows how to configure the bucket (policy + inventory configuration)
# so that S3 produces those inventory reports.
class S3Inventory
  attr_reader :type, :model, :inventory_date

  # Column positions inside each row of the inventory CSV files.
  CSV_KEY_INDEX ||= 1
  CSV_ETAG_INDEX ||= 2
  # Path components under which the inventory reports are stored in the bucket.
  INVENTORY_PREFIX ||= "inventory"
  INVENTORY_VERSION ||= "1"

  # @param s3_helper helper object used for every S3 call made by this class
  #   (it must respond to the `s3_*`, `list`, `object` and `download_file`
  #   calls used below)
  # @param type [Symbol] :upload or :optimized; any other value leaves
  #   `type` and `model` unset (nil)
  def initialize(s3_helper, type)
    @s3_helper = s3_helper

    case type
    when :upload
      @type = "original"
      @model = Upload
    when :optimized
      @type = "optimized"
      @model = OptimizedImage
    end
  end

  # Loads the latest inventory into a temporary table, copies etags onto
  # records that have none, then logs every record whose etag is not present
  # in the inventory and publishes the missing count to Discourse.stats.
  #
  # Logs an error and returns nil when no inventory files could be listed.
  def backfill_etags_and_list_missing
    if files.blank?
      error("Failed to list inventory from S3")
      return
    end

    DistributedMutex.synchronize("s3_inventory_list_missing_#{type}") do
      download_inventory_files_to_tmp_directory
      decompress_inventory_files

      multisite_prefix = "uploads/#{RailsMultisite::ConnectionManagement.current_db}/"

      ActiveRecord::Base.transaction do
        begin
          connection.exec("CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))")

          connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
            files.each do |file|
              # file[:filename] ends in ".gz"; dropping the last three
              # characters gives the path of the decompressed CSV.
              CSV.foreach(file[:filename][0...-3], headers: false) do |row|
                key = row[CSV_KEY_INDEX]
                # On multisite, only keep keys belonging to the current site.
                next if Rails.configuration.multisite && key.exclude?(multisite_prefix)

                url = File.join(Discourse.store.absolute_base_url, key)
                connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n")
              end
            end
          end

          # backfilling etags
          connection.async_exec(<<~SQL)
            UPDATE #{model.table_name}
            SET etag = #{table_name}.etag
            FROM #{table_name}
            WHERE #{model.table_name}.etag IS NULL
              AND #{model.table_name}.url = #{table_name}.url
          SQL

          list_missing_post_uploads if type == "original"

          uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
          missing_uploads = uploads
            .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
            .where("#{table_name}.etag IS NULL AND #{model.table_name}.etag IS NOT NULL")

          if (missing_count = missing_uploads.count) > 0
            missing_uploads.select(:id, :url).find_each do |upload|
              log upload.url
            end

            log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
          end

          Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
        ensure
          connection.exec("DROP TABLE #{table_name}") unless connection.nil?
        end
      end
    end
  end

  # Asks Post.find_missing_uploads for posts with missing uploads and, for
  # each reported sha1, tries to recreate the Upload record from the inventory
  # table. The resulting count is logged and published to Discourse.stats.
  #
  # Must run while the temporary inventory table exists.
  def list_missing_post_uploads
    log "Listing missing post uploads..."

    missing = Post.find_missing_uploads(include_local_upload: false) do |post, _, _, sha1|
      next if sha1.blank?

      restore_upload_from_inventory(post, sha1)
    end

    Discourse.stats.set("missing_post_uploads", missing[:count])
    log "#{missing[:count]} post uploads are missing."
  end

  # Downloads every inventory file listed by the symlink file into the tmp
  # directory, skipping files that are already present there.
  def download_inventory_files_to_tmp_directory
    files.each do |file|
      # File.exists? was removed in Ruby 3.2; File.exist? is the supported name.
      next if File.exist?(file[:filename])

      log "Downloading inventory file '#{file[:key]}' to tmp directory..."
      failure_message = "Failed to download inventory file '#{file[:key]}' to tmp directory."

      @s3_helper.download_file(file[:key], file[:filename], failure_message)
    end
  end

  # Runs `gzip --decompress` on every downloaded inventory file from within
  # the tmp directory.
  def decompress_inventory_files
    files.each do |file|
      log "Decompressing inventory file '#{file[:filename]}', this may take a while..."

      Discourse::Utils.execute_command(
        'gzip', '--decompress', file[:filename],
        failure_message: "Failed to decompress inventory file '#{file[:filename]}'.",
        chdir: tmp_directory
      )
    end
  end

  # Installs a bucket policy allowing the S3 service to put objects under the
  # inventory path of this bucket.
  def update_bucket_policy
    @s3_helper.s3_client.put_bucket_policy(
      bucket: bucket_name,
      policy: {
        "Version": "2012-10-17",
        "Statement": [
          {
            "Sid": "InventoryAndAnalyticsPolicy",
            "Effect": "Allow",
            "Principal": { "Service": "s3.amazonaws.com" },
            "Action": ["s3:PutObject"],
            "Resource": ["#{inventory_path_arn}/*"],
            "Condition": {
              "ArnLike": {
                "aws:SourceArn": bucket_arn
              },
              "StringEquals": {
                "s3:x-amz-acl": "bucket-owner-full-control"
              }
            }
          }
        ]
      }.to_json
    )
  end

  # Creates or updates the bucket's inventory configuration for this type.
  def update_bucket_inventory_configuration
    @s3_helper.s3_client.put_bucket_inventory_configuration(
      bucket: bucket_name,
      id: inventory_id,
      inventory_configuration: inventory_configuration,
      use_accelerate_endpoint: false
    )
  end

  private

  # Looks up `sha1` in the inventory table and, when a matching object exists,
  # creates an Upload record for it (owned by the system user) and relinks the
  # post's uploads.
  #
  # @return the id of the created upload, or nil when the inventory has no
  #   matching row or the S3 object can no longer be found
  def restore_upload_from_inventory(post, sha1)
    # sha1 originates from post content, so it is bound as a parameter
    # instead of being interpolated into the statement.
    result = connection.exec_params(
      "SELECT * FROM #{table_name} WHERE url LIKE $1",
      ["%original/%/#{sha1}%"]
    )
    return if result.count < 1

    row = result[0]
    url = row["url"]
    # Escape the base URL so its dots are not treated as regex wildcards.
    key = url.sub(/\A#{Regexp.escape(Discourse.store.absolute_base_url)}\//, "")
    data = @s3_helper.object(key).data
    filename = (data.content_disposition&.match(/filename=\"(.*)\"/) || [])[1]

    upload = Upload.new(
      user_id: Discourse.system_user.id,
      original_filename: filename || File.basename(key),
      filesize: data.content_length,
      url: url,
      sha1: sha1,
      etag: row["etag"]
    )
    upload.save!(validate: false)
    post.link_post_uploads

    upload.id
  rescue Aws::S3::Errors::NotFound
    nil
  end

  # Raw database connection underlying ActiveRecord, memoized.
  def connection
    @connection ||= ActiveRecord::Base.connection.raw_connection
  end

  # Name of the temporary table holding the inventory rows.
  def table_name
    "#{type}_inventory"
  end

  # Inventory data files referenced by the most recently modified symlink
  # file. Also sets `inventory_date` to one day before that file's
  # modification time.
  #
  # @return [Array<Hash>] entries of the form `{ key:, filename: }`; empty
  #   when no symlink file is available (that empty result is not memoized)
  def files
    @files ||= begin
      symlink_file = unsorted_files.max_by { |file| file.last_modified.to_i }
      return [] if symlink_file.blank?

      @inventory_date = symlink_file.last_modified - 1.day
      log "Downloading symlink file to tmp directory..."
      failure_message = "Failed to download symlink file to tmp directory."
      filename = File.join(tmp_directory, File.basename(symlink_file.key))

      @s3_helper.download_file(symlink_file.key, filename, failure_message)

      File.readlines(filename).map do |line|
        key = line.sub("s3://#{bucket_name}/", "").sub("\n", "")
        { key: key, filename: File.join(tmp_directory, File.basename(key)) }
      end
    end
  end

  # Per-database, timestamped working directory under Rails.root/tmp. It is
  # created on first use and memoized.
  def tmp_directory
    @tmp_directory ||= begin
      current_db = RailsMultisite::ConnectionManagement.current_db
      timestamp = Time.now.strftime("%Y-%m-%d-%H%M%S")
      directory = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db, timestamp)
      FileUtils.mkdir_p(directory)
      directory
    end
  end

  # Payload passed to `put_bucket_inventory_configuration`.
  def inventory_configuration
    filter_prefix = type
    filter_prefix = File.join(bucket_folder_path, filter_prefix) if bucket_folder_path.present?

    {
      destination: {
        s3_bucket_destination: {
          bucket: bucket_arn,
          prefix: inventory_path,
          format: "CSV"
        }
      },
      filter: {
        prefix: filter_prefix
      },
      is_enabled: SiteSetting.enable_s3_inventory,
      id: inventory_id,
      included_object_versions: "Current",
      optional_fields: ["ETag"],
      schedule: {
        frequency: "Daily"
      }
    }
  end

  def bucket_name
    @s3_helper.s3_bucket_name
  end

  def bucket_folder_path
    @s3_helper.s3_bucket_folder_path
  end

  # Objects under the inventory "hive" path whose key ends in "symlink.txt".
  #
  # @return [Array] matching objects; an empty array when the S3 listing
  #   fails, so callers can always treat the result as a collection
  def unsorted_files
    hive_path = File.join(inventory_path, bucket_name, inventory_id, "hive")

    @s3_helper.list(hive_path).select { |obj| obj.key.match?(/symlink\.txt$/i) }
  rescue Aws::Errors::ServiceError => e
    log("Failed to list inventory from S3", e)
    []
  end

  # Identifier of the inventory configuration, prefixed with the bucket folder
  # path when one is configured.
  def inventory_id
    @inventory_id ||= begin
      id = Rails.configuration.multisite ? "original" : type # TODO: rename multisite path to "uploads"
      bucket_folder_path.present? ? "#{bucket_folder_path}-#{id}" : id
    end
  end

  def inventory_path_arn
    File.join(bucket_arn, inventory_path)
  end

  # Path inside the bucket under which inventory reports are written.
  def inventory_path
    path = File.join(INVENTORY_PREFIX, INVENTORY_VERSION)
    path = File.join(bucket_folder_path, path) if bucket_folder_path.present?
    path
  end

  def bucket_arn
    "arn:aws:s3:::#{bucket_name}"
  end

  # Prints `message` to stdout and, when an exception is given, writes it and
  # its backtrace to the Rails error log.
  def log(message, ex = nil)
    puts(message)
    Rails.logger.error("#{ex}\n" + (ex.backtrace || []).join("\n")) if ex
  end

  def error(message)
    log(message, StandardError.new(message))
  end
end
|