diff --git a/app/models/post.rb b/app/models/post.rb
index 5ab73200d75..903400c0049 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -890,12 +890,12 @@ class Post < ActiveRecord::Base
def link_post_uploads(fragments: nil)
upload_ids = []
- fragments ||= Nokogiri::HTML::fragment(self.cooked)
- fragments.css("a/@href", "img/@src").each do |media|
- if upload = Upload.get_from_url(media.value)
- upload_ids << upload.id
- end
+ each_upload_url(fragments: fragments) do |src, _, sha1|
+ upload = nil
+ upload = Upload.find_by(sha1: sha1) if sha1.present?
+ upload ||= Upload.get_from_url(src)
+ upload_ids << upload.id if upload.present?
upload_ids |= Upload.where(id: downloaded_images.values).pluck(:id)
@@ -916,6 +916,84 @@ class Post < ActiveRecord::Base
+ # Walks the distinct `a[href]` / `img[src]` values of the cooked HTML and yields
+ # `(src, path, sha1)` for each one that matches an upload URL pattern.
+ #
+ # fragments            - optional pre-parsed fragment; `self.cooked` is parsed when nil.
+ # include_local_upload - when true, root-relative URLs ("/x...", but not "//x...") are
+ #                        accepted even if the store does not report them as uploaded.
+ def each_upload_url(fragments: nil, include_local_upload: true)
+   upload_patterns = [
+     /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+     /\/original\//,
+     /\/optimized\//
+   ]
+   fragments ||= Nokogiri::HTML::fragment(self.cooked)
+   fragments.css("a/@href", "img/@src").map(&:value).uniq.each do |src|
+     next if src.blank?
+     next unless upload_patterns.any? { |pattern| src =~ pattern }
+     # Protocol-relative URLs get an explicit scheme chosen by the force_https setting.
+     if src.start_with?("//")
+       scheme = SiteSetting.force_https ? "https" : "http"
+       src = "#{scheme}:#{src}"
+     end
+     unless Discourse.store.has_been_uploaded?(src)
+       next unless include_local_upload && src =~ /\A\/[^\/]/i
+     end
+     path =
+       begin
+         URI(URI.unescape(src))&.path
+       rescue URI::Error
+         nil # unparseable URL: skipped by the blank-path check below
+       end
+     next if path.blank?
+     # The SHA1 extractor is picked by whether the path mentions "optimized".
+     extractor = path.include?("optimized") ? OptimizedImage : Upload
+     yield src, path, extractor.extract_sha1(path)
+   end
+ end
+ # Rebuilds the Post::MISSING_UPLOADS custom fields by scanning the cooked HTML of every
+ # post returned by `Post.have_uploads`.
+ #
+ # An upload URL is skipped when its SHA1 already belongs to an upload linked to a post
+ # within the current batch's id range. Otherwise an Upload with that SHA1 is looked up,
+ # and when none is found the caller's block is asked for one. The block receives
+ # `(post, src, path, sha1)` and should return an upload id, or nil when the upload
+ # cannot be recovered.
+ #
+ # include_local_upload - forwarded to Post#each_upload_url.
+ #
+ # Returns a hash with :uploads (distinct missing URLs), :post_uploads (post id => missing
+ # URLs for that post) and :count (total number of missing URLs across all posts).
+ def self.find_missing_uploads(include_local_upload: true)
+   PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
+   missing_uploads = []
+   missing_post_uploads = {}
+   Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+     ids = posts.pluck(:id)
+     sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+     posts.each do |post|
+       # Forward the caller's choice; previously the keyword was accepted but ignored.
+       post.each_upload_url(include_local_upload: include_local_upload) do |src, path, sha1|
+         next if sha1.present? && sha1s.include?(sha1)
+         missing_post_uploads[post.id] ||= []
+         # URL already known to be missing: record it for this post without retrying recovery.
+         if missing_uploads.include?(src)
+           missing_post_uploads[post.id] << src
+           next
+         end
+         upload_id = nil
+         upload_id = Upload.where(sha1: sha1).pluck(:id).first if sha1.present?
+         upload_id ||= yield(post, src, path, sha1)
+         if upload_id.present?
+           attributes = { post_id: post.id, upload_id: upload_id }
+           PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
+         else
+           missing_uploads << src
+           missing_post_uploads[post.id] << src
+         end
+       end
+     end
+   end
+   count = 0
+   # Posts whose uploads were all resolved were left with an empty list; drop them.
+   missing_post_uploads = missing_post_uploads.reject { |_, uploads| uploads.empty? }
+   missing_post_uploads.each do |post_id, uploads|
+     PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
+     count += uploads.count
+   end
+   { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count }
+ end
def parse_quote_into_arguments(quote)
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 486d49244ea..1603136d57b 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -36,8 +36,6 @@ class S3Inventory
ActiveRecord::Base.transaction do
- table_name = "#{type}_inventory"
- connection = ActiveRecord::Base.connection.raw_connection
connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text, PRIMARY KEY(etag, key))")
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
files.each do |file|
@@ -54,6 +52,8 @@ class S3Inventory
WHERE #{model.table_name}.etag IS NULL
AND url ILIKE '%' || #{table_name}.key")
+ list_missing_post_uploads if type == "original"
uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
@@ -73,6 +73,35 @@ class S3Inventory
+ # Runs Post.find_missing_uploads (local uploads excluded) and, for each unresolved SHA1,
+ # tries to recreate the Upload record from a matching "original/" key in the inventory
+ # table. Stores the resulting count in the "missing_post_uploads" stat and logs it.
+ def list_missing_post_uploads
+   log "Listing missing post uploads..."
+   missing = Post.find_missing_uploads(include_local_upload: false) do |_, _, _, sha1|
+     next if sha1.blank?
+     upload_id = nil
+     result = connection.exec("SELECT * FROM #{table_name} WHERE key LIKE '%original/%/#{sha1}%'")
+     # Only read row 0 when the query matched something; `>= 0` was always true and
+     # made an empty result fail on the row access instead of reporting "not found".
+     if result.count > 0
+       row = result[0]
+       key = row["key"]
+       data = s3_helper.object(key).data
+       upload_id = Upload.create!(
+         user_id: Discourse.system_user.id,
+         original_filename: "",
+         filesize: data.content_length,
+         url: File.join(Discourse.store.absolute_base_url, key),
+         sha1: sha1,
+         etag: row["etag"]
+       ).id
+     end
+     # Block value: the recovered upload id, or nil when the inventory has no such file.
+     upload_id
+   end
+   Discourse.stats.set("missing_post_uploads", missing[:count])
+   log "#{missing[:count]} post uploads are missing."
+ end
def download_inventory_files_to_tmp_directory
files.each do |file|
log "Downloading inventory file '#{file[:key]}' to tmp directory..."
@@ -128,6 +157,14 @@ class S3Inventory
+ # Raw connection taken from ActiveRecord, cached in @connection after the first call.
+ def connection
+   return @connection if @connection
+   @connection = ActiveRecord::Base.connection.raw_connection
+ end
+ # Name of the inventory table for this inventory's type: "<type>_inventory".
+ def table_name
+   format("%s_inventory", type)
+ end
def files
@files ||= begin
symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index 3d1509c96fc..241dbbff9cf 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -390,123 +390,60 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
puts "", "Done.", ""
-def get_missing_uploads
- PostCustomField.where(name: Post::MISSING_UPLOADS)
desc 'Finds missing post upload records from cooked HTML content'
task 'posts:missing_uploads' => :environment do
- get_missing_uploads.delete_all
- upload_patterns = [
- /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
- /\/original\//,
- /\/optimized\//
- ]
- missing_uploads = []
old_scheme_upload_count = 0
- count = 0
- Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
- ids = posts.pluck(:id)
- sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+ missing = Post.find_missing_uploads do |post, src, path, sha1|
+ next if sha1.present?
- posts.each do |post|
- missing_post_uploads = []
- links = Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").map { |media| media.value }.uniq
+ upload_id = nil
- links.each do |src|
- next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+ # recovering old scheme upload.
+ local_store = FileStore::LocalStore.new
+ public_path = "#{local_store.public_dir}#{path}"
+ file_path = nil
- src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
- next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
- path = begin
- URI(URI.unescape(src))&.path
- rescue URI::Error
- end
- next if path.blank?
- sha1 =
- if path.include? "optimized"
- OptimizedImage.extract_sha1(path)
- else
- Upload.extract_sha1(path)
- end
- if sha1.blank? || sha1s.exclude?(sha1)
- upload_id = nil
- if missing_uploads.exclude?(src)
- if sha1.blank?
- # recovering old scheme upload.
- local_store = FileStore::LocalStore.new
- public_path = "#{local_store.public_dir}#{path}"
- file_path = nil
- if File.exists?(public_path)
- file_path = public_path
- else
- tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
- file_path = tombstone_path if File.exists?(tombstone_path)
- end
- if file_path.present?
- tmp = Tempfile.new
- tmp.write(File.read(file_path))
- tmp.rewind
- if upload = UploadCreator.new(tmp, File.basename(path)).create_for(Discourse.system_user.id)
- sha1s << upload.sha1
- upload_id = upload.id
- DbHelper.remap(UrlHelper.absolute(src), upload.url)
- post.reload
- post.raw.gsub!(src, upload.url)
- post.cooked.gsub!(src, upload.url)
- if post.changed?
- post.save!(validate: false)
- post.rebake!
- end
- end
- FileUtils.rm(tmp, force: true)
- else
- old_scheme_upload_count += 1
- end
- else
- upload_id = Upload.where(sha1: sha1).pluck(:id).first
- end
- if upload_id.present?
- attributes = { post_id: post.id, upload_id: upload_id }
- PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
- else
- missing_uploads << src
- end
- end
- missing_post_uploads << src if upload_id.blank?
- end
- end
- if missing_post_uploads.present?
- PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing_post_uploads.to_json)
- count += missing_post_uploads.count
- putc "x"
- else
- putc "."
- end
+ if File.exists?(public_path)
+ file_path = public_path
+ else
+ tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
+ file_path = tombstone_path if File.exists?(tombstone_path)
+ if file_path.present?
+ tmp = Tempfile.new
+ tmp.write(File.read(file_path))
+ tmp.rewind
+ if upload = UploadCreator.new(tmp, File.basename(path)).create_for(Discourse.system_user.id)
+ sha1s << upload.sha1
+ upload_id = upload.id
+ DbHelper.remap(UrlHelper.absolute(src), upload.url)
+ post.reload
+ post.raw.gsub!(src, upload.url)
+ post.cooked.gsub!(src, upload.url)
+ if post.changed?
+ post.save!(validate: false)
+ post.rebake!
+ end
+ end
+ FileUtils.rm(tmp, force: true)
+ else
+ old_scheme_upload_count += 1
+ end
+ upload_id
- puts "", "#{count} post uploads are missing.", ""
+ puts "", "#{missing[:count]} post uploads are missing.", ""
- if count > 0
- puts "#{missing_uploads.count} uploads are missing."
- puts "#{old_scheme_upload_count} of #{missing_uploads.count} are old scheme uploads." if old_scheme_upload_count > 0
- puts "#{get_missing_uploads.count} of #{Post.count} posts are affected.", ""
+ if missing[:count] > 0
+ puts "#{missing[:uploads].count} uploads are missing."
+ puts "#{old_scheme_upload_count} of #{missing[:uploads].count} are old scheme uploads." if old_scheme_upload_count > 0
+ puts "#{missing[:post_uploads].count} of #{Post.count} posts are affected.", ""
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index 3f4a016ec2a..6c26dcd4145 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -75,7 +75,7 @@ describe "S3Inventory" do
- expect(output).to eq("#{upload.url}\n1 of 4 uploads are missing\n")
+ expect(output).to eq("Listing missing post uploads...\n0 post uploads are missing.\n#{upload.url}\n1 of 4 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(1)
diff --git a/spec/models/post_spec.rb b/spec/models/post_spec.rb
index d993dc420cf..84629708bc8 100644
--- a/spec/models/post_spec.rb
+++ b/spec/models/post_spec.rb
@@ -1261,19 +1261,23 @@ describe Post do
+ let(:base_url) { "#{Discourse.base_url_no_prefix}#{Discourse.base_uri}" }
+ let(:video_url) { "#{base_url}#{video_upload.url}" }
+ let(:audio_url) { "#{base_url}#{audio_upload.url}" }
let(:raw) do