discourse/lib/cooked_post_processor.rb

409 lines
13 KiB
Ruby
Raw Normal View History

# Post processing that we can do after a post has already been cooked.
2013-07-07 19:39:08 -04:00
# For example, inserting the onebox content, or image sizes/thumbnails.
2013-02-05 14:16:51 -05:00
require_dependency 'url_helper'
require_dependency 'pretty_text'
2013-02-05 14:16:51 -05:00
class CookedPostProcessor
2013-06-21 12:29:40 -04:00
include ActionView::Helpers::NumberHelper
2013-02-19 01:57:14 -05:00
attr_reader :cooking_options
2017-07-27 21:20:09 -04:00
# Builds a processor for +post+ by cooking its raw content again.
#
# post - the post to process.
# opts - optional settings; :cooking_options is read here, while
#        :image_sizes and :invalidate_oneboxes are read by later steps.
def initialize(post, opts = {})
  @post = post
  @opts = opts
  @dirty = false
  @size_cache = {}
  # snapshot of the cooked HTML before processing, compared against in #dirty?
  @previous_cooked = (@post.cooked || "").dup

  # NOTE: we re-cook the post here in order to prevent timing issues with edits
  # cf. https://meta.discourse.org/t/edit-of-rebaked-post-doesnt-show-in-html-only-in-raw/33815/6
  options = post.cooking_options || opts[:cooking_options] || {}
  options[:topic_id] = post.topic_id
  options = options.symbolize_keys
  options[:omit_nofollow] = true if post.omit_nofollow?
  options[:cook_method] = post.cook_method
  @cooking_options = options

  analyzer = post.post_analyzer
  cooked = analyzer.cook(post.raw, @cooking_options)
  @doc = Nokogiri::HTML::fragment(cooked)
  @has_oneboxes = analyzer.found_oneboxes?
end
2013-11-21 19:52:26 -05:00
# Runs every post-processing step on the cooked document while holding a
# distributed mutex named after the post id.
#
# bypass_bump - forwarded to #pull_hotlinked_images.
#
# The synchronized block itself evaluates to nil.
def post_process(bypass_bump = false)
  lock_name = "post_process_#{@post.id}"

  DistributedMutex.synchronize(lock_name) do
    DiscourseEvent.trigger(:before_post_process_cooked, @doc, @post)

    # the steps below run in this exact order
    keep_reverse_index_up_to_date
    post_process_images
    post_process_oneboxes
    optimize_urls
    update_post_image
    enforce_nofollow
    pull_hotlinked_images(bypass_bump)
    grant_badges

    DiscourseEvent.trigger(:post_process_cooked, @doc, @post)
    nil
  end
end
2016-04-06 12:02:18 -04:00
# True when the document contains at least one emoji image that is not
# located inside a quote.
def has_emoji?
  emojis = @doc.css("img.emoji")
  quoted_images = @doc.css(".quote img")
  !(emojis - quoted_images).empty?
end
# Grants the FirstEmoji, FirstOnebox and FirstReplyByEmail badges to the
# post's author when the post qualifies for them.
# Nothing is granted unless a Guardian built without a user can see the post.
def grant_badges
  return unless Guardian.new.can_see?(@post)

  award = lambda do |badge_id|
    BadgeGranter.grant(Badge.find(badge_id), @post.user, post_id: @post.id)
  end

  award.call(Badge::FirstEmoji) if has_emoji?
  award.call(Badge::FirstOnebox) if @has_oneboxes
  award.call(Badge::FirstReplyByEmail) if @post.is_reply_by_email?
end
2013-11-05 13:04:47 -05:00
# Rebuilds the post_uploads rows for this post from the uploads referenced
# by the document (link hrefs, image srcs and oneboxed images).
# Existing rows are deleted and the new ones inserted in one transaction.
def keep_reverse_index_up_to_date
  referenced = @doc.css("a/@href", "img/@src").each_with_object([]) do |attribute, ids|
    upload = Upload.get_from_url(attribute.value)
    ids << upload.id if upload
  end

  # Array#| also removes duplicate ids
  upload_ids = referenced | oneboxed_image_uploads.pluck(:id)

  PostUpload.transaction do
    PostUpload.where(post_id: @post.id).delete_all

    unless upload_ids.empty?
      rows = upload_ids.map { |upload_id| "(#{@post.id},#{upload_id})" }.join(",")
      PostUpload.exec_sql("INSERT INTO post_uploads (post_id, upload_id) VALUES #{rows}")
    end
  end
end
2013-02-25 11:42:20 -05:00
# Limits the size of every image returned by #extract_images and, where
# needed, turns it into a lightbox link.
def post_process_images
  images = extract_images
  return if images.blank?

  images.each { |image| limit_size!(image); convert_to_link!(image) }
end
2013-04-13 10:31:20 -04:00
2013-07-07 19:39:08 -04:00
# Images that should be post-processed: every image with a src attribute,
# except the groups listed below.
def extract_images
  excluded = [
    @doc.css("img[src^='data']"), # data images
    @doc.css("img.emoji"),        # emojis
    oneboxed_images,              # images inside oneboxes
    @doc.css(".quote img"),       # images inside quotes
  ]

  excluded.reduce(@doc.css("img[src]")) { |remaining, skipped| remaining - skipped }
end
# Candidate images for the post's image_url: every image with a src
# attribute, minus emojis and minus images inside quotes.
def extract_images_for_post
  with_src = @doc.css("img[src]")
  emojis = @doc.css("img.emoji")
  quoted = @doc.css(".quote img")

  with_src - emojis - quoted
end
# All <img> nodes located inside an element with the "onebox-body" or
# "onebox" class.
def oneboxed_images
  selector = ".onebox-body img, .onebox img"
  @doc.css(selector)
end
# Looks up the uploads whose origin matches the src of a oneboxed image.
#
# Every src is tried in three forms (schemaless, http: and https:).
# Images without a src attribute are skipped: the onebox selector does not
# require one, and calling `sub` on a nil src would raise NoMethodError.
#
# Returns the result of Upload.where(origin: [...]).
def oneboxed_image_uploads
  origins = oneboxed_images.each_with_object(Set.new) do |img, urls|
    src = img["src"]
    next if src.nil? || src.empty?

    schemaless = src.sub(/^https?:/i, "")
    urls << schemaless << "http:#{schemaless}" << "https:#{schemaless}"
  end

  Upload.where(origin: origins.to_a)
end
2013-11-05 13:04:47 -05:00
# Sets the width/height attributes of +img+ to the values returned by
# ImageSizer.resize.
#
# The size to resize comes from the first source that yields one:
#   1) the width/height attributes
#   2) the dimension from the preview (image_sizes)
#   3) the dimension of the original image (HTTP request)
def limit_size!(img)
  src = img["src"]

  size = get_size_from_attributes(img)
  size ||= get_size_from_image_sizes(src, @opts[:image_sizes])
  size ||= get_size(src)
  width, height = size

  # limit the size of the thumbnail
  img["width"], img["height"] = ImageSizer.resize(width, height)
end
2013-11-25 12:36:13 -05:00
# Derives the image size from the width/height attributes of +img+.
#
# Returns [width, height] when both attributes are positive.
# When only one of them is positive, the other is computed from the
# original image's aspect ratio (fetched with #get_size).
# Returns nil when neither attribute is usable, when the original size
# cannot be fetched, or when the original size has a non-positive
# dimension. That last case would otherwise produce an Infinity/NaN ratio
# and make Float#floor raise FloatDomainError.
def get_size_from_attributes(img)
  w, h = img["width"].to_i, img["height"].to_i
  return [w, h] if w > 0 && h > 0
  return if w <= 0 && h <= 0

  # only width or height is specified: attempt to scale the image
  original_image_size = get_size(img["src"])
  return unless original_image_size

  original_width, original_height = original_image_size.map(&:to_f)
  return unless original_width > 0 && original_height > 0

  if w > 0
    ratio = w.to_f / original_width
    [w, (original_height * ratio).floor]
  else
    ratio = h.to_f / original_height
    [(original_width * ratio).floor, h]
  end
end
2013-11-05 13:04:47 -05:00
# Looks up the size of +src+ in +image_sizes+.
#
# src         - the image source to look for.
# image_sizes - a collection of [url, size] pairs, where size is indexed
#               by "width" and "height".
#
# Returns [width, height] from the first entry whose url includes +src+
# and whose width and height are both positive, nil otherwise.
def get_size_from_image_sizes(src, image_sizes)
  return unless image_sizes.present?

  image_sizes.each do |image_size|
    url, size = image_size[0], image_size[1]
    next unless url && url.include?(src)
    next unless size && size["width"].to_i > 0 && size["height"].to_i > 0

    return [size["width"], size["height"]]
  end

  # `each` returns its receiver, so without this explicit nil a lookup that
  # matched nothing would return the (truthy) image_sizes collection and
  # callers chaining with `||` would never reach their fallback.
  nil
end
2013-02-20 20:07:36 -05:00
2013-11-05 13:04:47 -05:00
# Returns the size reported by FastImage for the image at +url+, caching
# the answer per url in @size_cache.
#
# Returns nil when the url is missing, is not a valid http(s) image url,
# may not be crawled, or when FastImage raises Zlib::BufError.
def get_size(url)
  return @size_cache[url] if @size_cache.key?(url)

  target = url
  if target =~ /^\/[^\/]/
    # single leading slash: prefix it with the site's base url
    target = Discourse.base_url_no_prefix + target
  end
  return unless target

  # FastImage fails when there's no scheme
  if target.start_with?("//")
    target = SiteSetting.scheme + ":" + target
  end
  return unless is_valid_image_url?(target)

  # we can *always* crawl our own images
  crawlable = SiteSetting.crawl_images? || Discourse.store.has_been_uploaded?(url)
  return unless crawlable

  @size_cache[url] = FastImage.size(target)
rescue Zlib::BufError # FastImage.size raises BufError for some gifs
  nil
end
2013-11-05 13:04:47 -05:00
# True when +url+ parses as a URI whose scheme is http or https, false for
# any other scheme, and nil when the url cannot be parsed.
def is_valid_image_url?(url)
  scheme = URI.parse(url).scheme
  ["http", "https"].include?(scheme)
rescue URI::InvalidURIError
  nil
end
# Width-to-height ratio below which convert_to_link! crops the thumbnail.
# only crop when the image is taller than 16:9 (i.e. width / height < 9 / 16)
# we only use 95% of that to allow for a small margin
# NOTE(review): `||=` presumably avoids re-assigning the constant when the
# file is loaded more than once - confirm.
MIN_RATIO_TO_CROP ||= (9.0 / 16.0) * 0.95
2013-11-05 13:04:47 -05:00
# Wraps +img+ in a lightbox when the original image is larger than both
# its displayed size and the site's maximum image dimensions.
#
# Nothing happens when the image has no src, already sits inside a link,
# or when its original dimensions cannot be determined.
# Very tall images (ratio below MIN_RATIO_TO_CROP) get cropped dimensions.
def convert_to_link!(img)
  src = img["src"]
  return if src.blank? || is_a_hyperlink?(img)

  width = img["width"].to_i
  height = img["height"].to_i

  # TODO: store original dimensions in db
  original_width, original_height = (get_size(src) || [0, 0]).map(&:to_i)

  # can't reach the image...
  if original_width == 0 || original_height == 0
    Rails.logger.info "Can't reach '#{src}' to get its dimension."
    return
  end

  fits_displayed_size = original_width <= width && original_height <= height
  return if fits_displayed_size

  fits_site_limits = original_width <= SiteSetting.max_image_width &&
                     original_height <= SiteSetting.max_image_height
  return if fits_site_limits

  crop = original_width.to_f / original_height.to_f < MIN_RATIO_TO_CROP
  if crop
    width, height = ImageSizer.crop(original_width, original_height)
    img["width"] = width
    img["height"] = height
  end

  upload = Upload.get_from_url(src)
  upload.create_thumbnail!(width, height, crop) if upload

  add_lightbox!(img, original_width, original_height, upload)
end
2013-11-05 13:04:47 -05:00
# True when +img+ has an ancestor named "a", i.e. it is already wrapped in
# a link.
#
# The walk up the tree stops at the first ancestor that is nil or does not
# respond to `parent`. Previously such an ancestor was never advanced past,
# which left the loop spinning forever.
def is_a_hyperlink?(img)
  node = img.parent

  while node
    return true if node.name == "a"
    node = node.respond_to?(:parent) ? node.parent : nil
  end

  false
end
2013-02-19 01:57:14 -05:00
2017-07-27 21:20:09 -04:00
# Wraps +img+ in the lightbox markup:
#
#   div.lightbox-wrapper > a.lightbox > img + div.meta
#
# img             - the image node to wrap.
# original_width  - width shown in the overlay information.
# original_height - height shown in the overlay information.
# upload          - optional upload backing the image; when given it
#                   provides the download link, thumbnail and file size.
def add_lightbox!(img, original_width, original_height, upload = nil)
  # first, create a div to hold our lightbox
  wrapper = Nokogiri::XML::Node.new("div", @doc)
  wrapper["class"] = "lightbox-wrapper"
  img.add_next_sibling(wrapper)
  wrapper.add_child(img)

  # then, the link to our larger image
  link = Nokogiri::XML::Node.new("a", @doc)
  img.add_next_sibling(link)
  if upload && Discourse.store.internal?
    link["data-download-href"] = Discourse.store.download_url(upload)
  end
  # the href is taken before the src is swapped for the thumbnail below
  link["href"] = img["src"]
  link["class"] = "lightbox"
  link.add_child(img)

  # replace the image by its thumbnail
  thumb_width, thumb_height = img["width"].to_i, img["height"].to_i
  if upload && upload.has_thumbnail?(thumb_width, thumb_height)
    img["src"] = upload.thumbnail(thumb_width, thumb_height).url
  end

  # then, some overlay information
  overlay = Nokogiri::XML::Node.new("div", @doc)
  overlay["class"] = "meta"
  img.add_next_sibling(overlay)

  filename = get_filename(upload, img["src"])
  details = "#{original_width}x#{original_height}"
  details << " #{number_to_human_size(upload.filesize)}" if upload

  link["title"] = CGI.escapeHTML(img["title"] || filename)

  overlay.add_child create_span_node("filename", link["title"])
  overlay.add_child create_span_node("informations", details)
  overlay.add_child create_span_node("expand")
end
2013-02-19 01:57:14 -05:00
# Picks the filename to display for an image.
#
# Without an upload, the basename of +src+ is used. With an upload, its
# original filename is used, unless that is "blob" or "blob.png"
# (case-insensitive), in which case a translated placeholder is returned.
def get_filename(upload, src)
  return File.basename(src) unless upload

  original = upload.original_filename
  original =~ /^blob(\.png)?$/i ? I18n.t("upload.pasted_image_filename") : original
end
2017-07-27 21:20:09 -04:00
# Builds a <span> node belonging to @doc.
#
# klass   - value of the span's class attribute.
# content - optional content of the span; left unset when nil/false.
def create_span_node(klass, content = nil)
  Nokogiri::XML::Node.new("span", @doc).tap do |span|
    span.content = content if content
    span["class"] = klass
  end
end
# Stores the src of the first image from #extract_images_for_post (cut to
# 255 characters) as the post's image_url, and as the topic's image_url
# when this is the first post.
def update_post_image
  img = extract_images_for_post.first
  return if img.blank?
  return unless img["src"].present?

  image_url = img["src"][0...255]
  @post.update_column(:image_url, image_url) # post
  @post.topic.update_column(:image_url, image_url) if @post.is_first_post? # topic
end
2013-11-05 13:04:47 -05:00
# Applies oneboxes to the document, points oneboxed images at their local
# upload when one exists, and limits the size of every oneboxed image.
#
# Images without a src attribute are skipped when matching uploads: the
# onebox selector does not require one, and calling `sub` on a nil src
# would raise NoMethodError.
def post_process_oneboxes
  onebox_args = {
    post_id: @post.id,
    invalidate_oneboxes: !!@opts[:invalidate_oneboxes],
  }

  # apply oneboxes
  Oneboxer.apply(@doc, topic_id: @post.topic_id) do |url|
    @has_oneboxes = true
    Oneboxer.onebox(url, onebox_args)
  end

  # swap remote onebox images for the matching upload (compared schemaless)
  uploads = oneboxed_image_uploads.select(:url, :origin)
  oneboxed_images.each do |img|
    src = img["src"]
    next if src.nil? || src.empty?

    schemaless = src.sub(/^https?:/i, "")
    upload = uploads.find { |u| u.origin.sub(/^https?:/i, "") == schemaless }
    img["src"] = upload.url if upload
  end

  # make sure we grab dimensions for oneboxed images
  oneboxed_images.each { |img| limit_size!(img) }
end
2013-11-05 13:04:47 -05:00
# Rewrites local urls found in links and images to schemaless absolute
# urls, and passes them through the store's CDN url when S3 uploads and an
# S3 CDN url are both configured.
def optimize_urls
  # attachments can't be on the CDN when either setting is enabled
  if SiteSetting.login_required || SiteSetting.prevent_anons_from_downloading_files
    @doc.css("a.attachment[href]").each do |link|
      href = link["href"].to_s
      next unless UrlHelper.is_local(href)
      link["href"] = UrlHelper.schemaless UrlHelper.absolute_without_cdn(href)
    end
  end

  use_s3_cdn = SiteSetting.Upload.enable_s3_uploads && SiteSetting.Upload.s3_cdn_url.present?

  # shared rewrite applied to one attribute of one node
  rewrite = lambda do |node, attribute|
    value = node[attribute].to_s
    node[attribute] = UrlHelper.schemaless UrlHelper.absolute(value) if UrlHelper.is_local(value)
    node[attribute] = Discourse.store.cdn_url(node[attribute]) if use_s3_cdn
  end

  %w{href data-download-href}.each do |attribute|
    @doc.css("a[#{attribute}]").each { |link| rewrite.call(link, attribute) }
  end

  @doc.css("img[src]").each { |img| rewrite.call(img, "src") }
end
2017-10-23 13:09:38 -04:00
# Hands the document to PrettyText.add_rel_nofollow_to_user_content,
# unless the cooking options ask to omit nofollow or the
# add_rel_nofollow_to_user_content site setting is off.
def enforce_nofollow
  return if @cooking_options[:omit_nofollow]
  return unless SiteSetting.add_rel_nofollow_to_user_content

  PrettyText.add_rel_nofollow_to_user_content(@doc)
end
2013-02-05 14:16:51 -05:00
2013-11-21 19:52:26 -05:00
# Schedules the :pull_hotlinked_images job for this post, replacing any
# job already scheduled for it.
#
# bypass_bump - forwarded to the job as :bypass_bump.
#
# The job runs one second after the editing grace period has elapsed.
def pull_hotlinked_images(bypass_bump = false)
  # is the job enabled?
  return unless SiteSetting.download_remote_images_to_local?
  # have we enough disk space?
  return if disable_if_low_on_disk_space

  # don't download remote images for posts that are more than n days old
  oldest_allowed = Date.today - SiteSetting.download_remote_images_max_days_old
  return unless @post.created_at > oldest_allowed

  # we only want to run the job whenever it's changed by a user
  return if @post.last_editor_id == Discourse.system_user.id

  # make sure no other job is scheduled
  Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id)

  # schedule the job
  delay = (SiteSetting.editing_grace_period + 1).seconds.to_i
  Jobs.enqueue_in(delay, :pull_hotlinked_images, post_id: @post.id, bypass_bump: bypass_bump)
end
2013-11-15 10:46:41 -05:00
# Turns the download_remote_images_to_local site setting off when the
# available disk space is below the configured threshold.
#
# Returns false when there is enough space. Otherwise the change is logged
# as a staff action, the site contact user is notified and true is returned.
def disable_if_low_on_disk_space
  return false if available_disk_space >= SiteSetting.download_remote_images_threshold

  SiteSetting.download_remote_images_to_local = false

  # log the site setting change
  StaffActionLogger.new(Discourse.system_user).log_site_setting_change(
    "download_remote_images_to_local",
    true,
    false,
    details: I18n.t("disable_remote_images_download_reason")
  )

  # also send a private message to the site contact user
  notify_about_low_disk_space

  true
end
# Sends the :download_remote_images_disabled system message to the site
# contact user.
def notify_about_low_disk_space
  recipient = Discourse.site_contact_user
  SystemMessage.create_from_system_user(recipient, :download_remote_images_disabled)
end
2013-11-15 10:46:41 -05:00
# Percentage of disk space still available for the uploads directory:
# 100 minus the integer read from the fifth column of the last line that
# `df -P` prints for it.
def available_disk_space
  used_percent = %x(df -P #{Rails.root}/public/uploads | tail -1 | tr -s ' ' | cut -d ' ' -f 5).to_i
  100 - used_percent
end
# True when the current HTML differs from the cooked HTML captured when
# the processor was created.
def dirty?
  current = html
  @previous_cooked != current
end
# HTML serialization of the processed document, or nil when there is no
# document.
def html
  document = @doc
  document.try(:to_html)
end
end