diff --git a/app/models/post.rb b/app/models/post.rb
index 5ab73200d75..903400c0049 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -890,12 +890,12 @@ class Post < ActiveRecord::Base
 
   def link_post_uploads(fragments: nil)
     upload_ids = []
-    fragments ||= Nokogiri::HTML::fragment(self.cooked)
 
-    fragments.css("a/@href", "img/@src").each do |media|
-      if upload = Upload.get_from_url(media.value)
-        upload_ids << upload.id
-      end
+    each_upload_url(fragments: fragments) do |src, _, sha1|
+      upload = nil
+      upload = Upload.find_by(sha1: sha1) if sha1.present?
+      upload ||= Upload.get_from_url(src)
+      upload_ids << upload.id if upload.present?
     end
 
     upload_ids |= Upload.where(id: downloaded_images.values).pluck(:id)
@@ -916,6 +916,100 @@ class Post < ActiveRecord::Base
     {}
   end
 
+  # Yields each distinct upload URL found in this post's cooked HTML.
+  #
+  # Collects `a[href]` and `img[src]` values, keeps those matching an upload path
+  # pattern, expands protocol-relative URLs, and yields `(src, path, sha1)`.
+  # `sha1` is whatever OptimizedImage/Upload.extract_sha1 returns for the path.
+  #
+  # fragments            - parsed fragment to scan; defaults to parsing `cooked`.
+  # include_local_upload - also accept root-relative URLs (a single leading "/").
+  def each_upload_url(fragments: nil, include_local_upload: true)
+    upload_patterns = [
+      /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
+      /\/original\//,
+      /\/optimized\//
+    ]
+    fragments ||= Nokogiri::HTML::fragment(self.cooked)
+    links = fragments.css("a/@href", "img/@src").map { |media| media.value }.uniq
+
+    links.each do |src|
+      next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+
+      src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
+      next unless Discourse.store.has_been_uploaded?(src) || (include_local_upload && src =~ /\A\/[^\/]/i)
+
+      path = begin
+        URI(URI.unescape(src))&.path
+      rescue URI::Error
+      end
+
+      next if path.blank?
+
+      sha1 =
+        if path.include? "optimized"
+          OptimizedImage.extract_sha1(path)
+        else
+          Upload.extract_sha1(path)
+        end
+
+      yield(src, path, sha1)
+    end
+  end
+
+  # Clears and rebuilds the Post::MISSING_UPLOADS custom fields.
+  #
+  # For every upload URL whose sha1 is not in the batch's known post upload sha1s,
+  # the upload is looked up by sha1; failing that, the block is called with
+  # `(post, src, path, sha1)` and must return an upload id or nil.
+  #
+  # Returns a hash: `uploads` (unresolved URLs), `post_uploads` (post id => URLs)
+  # and `count` (total unresolved post/URL pairs).
+  def self.find_missing_uploads(include_local_upload: true)
+    PostCustomField.where(name: Post::MISSING_UPLOADS).delete_all
+    missing_uploads = []
+    missing_post_uploads = {}
+
+    Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
+      ids = posts.pluck(:id)
+      sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+
+      posts.each do |post|
+        post.each_upload_url(include_local_upload: include_local_upload) do |src, path, sha1|
+          next if sha1.present? && sha1s.include?(sha1)
+
+          missing_post_uploads[post.id] ||= []
+
+          if missing_uploads.include?(src)
+            missing_post_uploads[post.id] << src
+            next
+          end
+
+          upload_id = nil
+          upload_id = Upload.where(sha1: sha1).pluck(:id).first if sha1.present?
+          upload_id ||= yield(post, src, path, sha1)
+
+          if upload_id.present?
+            attributes = { post_id: post.id, upload_id: upload_id }
+            PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
+          else
+            missing_uploads << src
+            missing_post_uploads[post.id] << src
+          end
+        end
+      end
+    end
+
+    count = 0
+    missing_post_uploads = missing_post_uploads.reject { |_, uploads| uploads.empty? }
+    missing_post_uploads.each do |post_id, uploads|
+      PostCustomField.create!(post_id: post_id, name: Post::MISSING_UPLOADS, value: uploads.to_json)
+      count += uploads.count
+    end
+
+    return { uploads: missing_uploads, post_uploads: missing_post_uploads, count: count }
+  end
+
   private
 
   def parse_quote_into_arguments(quote)
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 486d49244ea..1603136d57b 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -36,8 +36,6 @@ class S3Inventory
 
       ActiveRecord::Base.transaction do
         begin
-          table_name = "#{type}_inventory"
-          connection = ActiveRecord::Base.connection.raw_connection
           connection.exec("CREATE TEMP TABLE #{table_name}(key text UNIQUE, etag text, PRIMARY KEY(etag, key))")
           connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
             files.each do |file|
@@ -54,6 +52,8 @@ class S3Inventory
             WHERE #{model.table_name}.etag IS NULL
               AND url ILIKE '%' || #{table_name}.key")
 
+          list_missing_post_uploads if type == "original"
+
           uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
           missing_uploads = uploads.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag").where("#{table_name}.etag is NULL")
 
@@ -73,6 +73,38 @@ class S3Inventory
     end
   end
 
+  # Logs and records (in Discourse.stats) the number of missing post uploads.
+  # For posts referencing an unknown sha1, tries to find a matching key in the
+  # inventory table and, if found, creates the Upload record from the S3 object.
+  def list_missing_post_uploads
+    log "Listing missing post uploads..."
+
+    missing = Post.find_missing_uploads(include_local_upload: false) do |_, _, _, sha1|
+      next if sha1.blank?
+
+      upload_id = nil
+      result = connection.exec_params("SELECT * FROM #{table_name} WHERE key LIKE $1", ["%original/%/#{sha1}%"])
+
+      if result.count > 0
+        key = result[0]["key"]
+        data = s3_helper.object(key).data
+        upload_id = Upload.create!(
+          user_id: Discourse.system_user.id,
+          original_filename: "",
+          filesize: data.content_length,
+          url: File.join(Discourse.store.absolute_base_url, key),
+          sha1: sha1,
+          etag: result[0]["etag"]
+        ).id
+      end
+
+      upload_id
+    end
+
+    Discourse.stats.set("missing_post_uploads", missing[:count])
+    log "#{missing[:count]} post uploads are missing."
+  end
+
   def download_inventory_files_to_tmp_directory
     files.each do |file|
       log "Downloading inventory file '#{file[:key]}' to tmp directory..."
@@ -128,6 +160,16 @@ class S3Inventory
 
   private
 
+  # Raw database connection, memoized for the lifetime of this instance.
+  def connection
+    @connection ||= ActiveRecord::Base.connection.raw_connection
+  end
+
+  # Name of the temp table holding the inventory rows for this type.
+  def table_name
+    "#{type}_inventory"
+  end
+
   def files
     @files ||= begin
       symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
diff --git a/lib/tasks/posts.rake b/lib/tasks/posts.rake
index 3d1509c96fc..241dbbff9cf 100644
--- a/lib/tasks/posts.rake
+++ b/lib/tasks/posts.rake
@@ -390,123 +390,59 @@ task 'posts:reorder_posts', [:topic_id] => [:environment] do |_, args|
   puts "", "Done.", ""
 end
 
-def get_missing_uploads
-  PostCustomField.where(name: Post::MISSING_UPLOADS)
-end
-
 desc 'Finds missing post upload records from cooked HTML content'
 task 'posts:missing_uploads' => :environment do
-  get_missing_uploads.delete_all
-
-  upload_patterns = [
-    /\/uploads\/#{RailsMultisite::ConnectionManagement.current_db}\//,
-    /\/original\//,
-    /\/optimized\//
-  ]
-  missing_uploads = []
   old_scheme_upload_count = 0
-  count = 0
 
-  Post.have_uploads.select(:id, :cooked).find_in_batches do |posts|
-    ids = posts.pluck(:id)
-    sha1s = Upload.joins(:post_uploads).where("post_uploads.post_id >= ? AND post_uploads.post_id <= ?", ids.min, ids.max).pluck(:sha1)
+  missing = Post.find_missing_uploads do |post, src, path, sha1|
+    next if sha1.present?
 
-    posts.each do |post|
-      missing_post_uploads = []
-      links = Nokogiri::HTML::fragment(post.cooked).css("a/@href", "img/@src").map { |media| media.value }.uniq
+    upload_id = nil
 
-      links.each do |src|
-        next if src.blank? || upload_patterns.none? { |pattern| src =~ pattern }
+    # recovering old scheme upload.
+    local_store = FileStore::LocalStore.new
+    public_path = "#{local_store.public_dir}#{path}"
+    file_path = nil
 
-        src = "#{SiteSetting.force_https ? "https" : "http"}:#{src}" if src.start_with?("//")
-        next unless Discourse.store.has_been_uploaded?(src) || src =~ /\A\/[^\/]/i
-
-        path = begin
-          URI(URI.unescape(src))&.path
-        rescue URI::Error
-        end
-
-        next if path.blank?
-
-        sha1 =
-          if path.include? "optimized"
-            OptimizedImage.extract_sha1(path)
-          else
-            Upload.extract_sha1(path)
-          end
-
-        if sha1.blank? || sha1s.exclude?(sha1)
-          upload_id = nil
-
-          if missing_uploads.exclude?(src)
-            if sha1.blank?
-              # recovering old scheme upload.
-              local_store = FileStore::LocalStore.new
-              public_path = "#{local_store.public_dir}#{path}"
-              file_path = nil
-
-              if File.exists?(public_path)
-                file_path = public_path
-              else
-                tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
-                file_path = tombstone_path if File.exists?(tombstone_path)
-              end
-
-              if file_path.present?
-                tmp = Tempfile.new
-                tmp.write(File.read(file_path))
-                tmp.rewind
-
-                if upload = UploadCreator.new(tmp, File.basename(path)).create_for(Discourse.system_user.id)
-                  sha1s << upload.sha1
-                  upload_id = upload.id
-                  DbHelper.remap(UrlHelper.absolute(src), upload.url)
-
-                  post.reload
-                  post.raw.gsub!(src, upload.url)
-                  post.cooked.gsub!(src, upload.url)
-
-                  if post.changed?
-                    post.save!(validate: false)
-                    post.rebake!
-                  end
-                end
-
-                FileUtils.rm(tmp, force: true)
-              else
-                old_scheme_upload_count += 1
-              end
-            else
-              upload_id = Upload.where(sha1: sha1).pluck(:id).first
-            end
-
-            if upload_id.present?
-              attributes = { post_id: post.id, upload_id: upload_id }
-              PostUpload.create!(attributes) unless PostUpload.exists?(attributes)
-            else
-              missing_uploads << src
-            end
-          end
-
-          missing_post_uploads << src if upload_id.blank?
-        end
-      end
-
-      if missing_post_uploads.present?
-        PostCustomField.create!(post_id: post.id, name: Post::MISSING_UPLOADS, value: missing_post_uploads.to_json)
-        count += missing_post_uploads.count
-        putc "x"
-      else
-        putc "."
-      end
+    if File.exist?(public_path)
+      file_path = public_path
+    else
+      tombstone_path = public_path.sub("/uploads/", "/uploads/tombstone/")
+      file_path = tombstone_path if File.exist?(tombstone_path)
     end
+
+    if file_path.present?
+      tmp = Tempfile.new
+      tmp.write(File.read(file_path))
+      tmp.rewind
+
+      if upload = UploadCreator.new(tmp, File.basename(path)).create_for(Discourse.system_user.id)
+        upload_id = upload.id
+        DbHelper.remap(UrlHelper.absolute(src), upload.url)
+
+        post.reload
+        post.raw.gsub!(src, upload.url)
+        post.cooked.gsub!(src, upload.url)
+
+        if post.changed?
+          post.save!(validate: false)
+          post.rebake!
+        end
+      end
+
+      FileUtils.rm(tmp, force: true)
+    else
+      old_scheme_upload_count += 1
+    end
+
+    upload_id
   end
 
-  puts "", "#{count} post uploads are missing.", ""
+  puts "", "#{missing[:count]} post uploads are missing.", ""
 
-  if count > 0
-    puts "#{missing_uploads.count} uploads are missing."
-    puts "#{old_scheme_upload_count} of #{missing_uploads.count} are old scheme uploads." if old_scheme_upload_count > 0
-    puts "#{get_missing_uploads.count} of #{Post.count} posts are affected.", ""
+  if missing[:count] > 0
+    puts "#{missing[:uploads].count} uploads are missing."
+    puts "#{old_scheme_upload_count} of #{missing[:uploads].count} are old scheme uploads." if old_scheme_upload_count > 0
+    puts "#{missing[:post_uploads].count} of #{Post.count} posts are affected.", ""
   end
 end
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index 3f4a016ec2a..6c26dcd4145 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -75,7 +75,7 @@ describe "S3Inventory" do
       inventory.backfill_etags_and_list_missing
     end
 
-    expect(output).to eq("#{upload.url}\n1 of 4 uploads are missing\n")
+    expect(output).to eq("Listing missing post uploads...\n0 post uploads are missing.\n#{upload.url}\n1 of 4 uploads are missing\n")
     expect(Discourse.stats.get("missing_s3_uploads")).to eq(1)
   end
 
diff --git a/spec/models/post_spec.rb b/spec/models/post_spec.rb
index d993dc420cf..84629708bc8 100644
--- a/spec/models/post_spec.rb
+++ b/spec/models/post_spec.rb
@@ -1261,19 +1261,23 @@ describe Post do
       )
     end
 
+    let(:base_url) { "#{Discourse.base_url_no_prefix}#{Discourse.base_uri}" }
+    let(:video_url) { "#{base_url}#{video_upload.url}" }
+    let(:audio_url) { "#{base_url}#{audio_upload.url}" }
+
     let(:raw) do
       <<~RAW
       Link
       RAW
     end