2019-05-02 18:17:27 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2013-05-30 14:34:44 -04:00
|
|
|
# Analyzes the raw text of a post by cooking it once and then answering
# questions about the resulting HTML: links, linked hosts, mentions,
# embedded media, attachments and oneboxes.
#
# The cooked/stripped fragment is memoized, so all of the counting helpers
# share a single cook of @raw.
class PostAnalyzer
  # raw      - the raw (uncooked) post text; may be nil or blank.
  # topic_id - passed through to the cooker as the :topic_id option.
  def initialize(raw, topic_id)
    @raw = raw
    @topic_id = topic_id
    # URLs of onebox elements seen while cooking; appended to by #cook.
    @onebox_urls = []
    @found_oneboxes = false
  end

  # True once #cook has seen at least one onebox URL with a cached onebox.
  def found_oneboxes?
    @found_oneboxes
  end

  # Whether the post contains oneboxes. Always false for blank raw.
  def has_oneboxes?
    return false unless @raw.present?

    # Called only for its side effect: cooking is what sets @found_oneboxes.
    cooked_stripped
    found_oneboxes?
  end

  # What we use to cook posts
  #
  # raw  - the text to cook.
  # opts - options hash. Keys read here: :cook_method and
  #        :invalidate_oneboxes. The whole hash is also forwarded to
  #        EmailCook#cook / PrettyText.cook.
  #
  # Returns `raw` untouched for the raw_html cook method, otherwise the cooked
  # HTML string. As a side effect, records onebox URLs in @onebox_urls and may
  # set @found_oneboxes.
  def cook(raw, opts = {})
    cook_method = opts[:cook_method]
    return raw if cook_method == Post.cook_methods[:raw_html]

    cooked =
      if cook_method == Post.cook_methods[:email]
        EmailCook.new(raw).cook(opts)
      else
        PrettyText.cook(raw, opts)
      end

    # Remaining number of oneboxes this post is still allowed to render.
    limit = SiteSetting.max_oneboxes_per_post

    result =
      Oneboxer.apply(cooked, extra_paths: ".inline-onebox-loading") do |url, element|
        if opts[:invalidate_oneboxes]
          Oneboxer.invalidate(url)
          InlineOneboxer.invalidate(url)
        end

        # Only elements carrying exactly the onebox CSS class count against the
        # limit and get a cached onebox looked up for them.
        next if element["class"] != Oneboxer::ONEBOX_CSS_CLASS
        next if limit <= 0

        limit -= 1
        @onebox_urls << url

        onebox = Oneboxer.cached_onebox(url)
        @found_oneboxes = true if onebox.present?
        onebox
      end

    if result.changed?
      # Oneboxing changed the document, so re-run the hotlinked media
      # sanitizer on it before serializing.
      PrettyText.sanitize_hotlinked_media(result.doc)
      cooked = result.to_html
    end

    cooked
  end

  # How many images are present in the post
  #
  # Counts img, video and audio elements, ignoring any element that carries
  # one of Post.allowed_image_classes. Returns 0 for blank raw.
  def embedded_media_count
    return 0 unless @raw.present?

    # TODO - do we need to look for tags other than img, video and audio?
    cooked_stripped
      .css("img", "video", "audio")
      .reject do |node|
        dom_class = node["class"]
        dom_class && (Post.allowed_image_classes & dom_class.split).count > 0
      end
      .count
  end

  # How many attachments are present in the post
  #
  # Matches `a.attachment` links whose href starts with the store's absolute
  # base URL and, when the store is internal, also its relative base URL.
  # Returns 0 for blank raw.
  def attachment_count
    return 0 unless @raw.present?

    attachments =
      cooked_stripped.css("a.attachment[href^=\"#{Discourse.store.absolute_base_url}\"]")

    if Discourse.store.internal?
      attachments +=
        cooked_stripped.css("a.attachment[href^=\"#{Discourse.store.relative_base_url}\"]")
    end

    attachments.count
  end

  # Mentions extracted from the cooked post by PrettyText.extract_mentions.
  # Returns [] for blank raw. The result is memoized, including when it is
  # empty.
  def raw_mentions
    return [] if @raw.blank?

    @raw_mentions ||= PrettyText.extract_mentions(cooked_stripped)
  end

  # from rack ... compat with ruby 2.2
  #
  # Parses `uri` with URI::RFC2396_Parser when that class is defined, falling
  # back to URI.parse otherwise. The parser is cached on the class.
  def self.parse_uri_rfc2396(uri)
    @parser ||= defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser.new : URI
    @parser.parse(uri)
  end

  # Count how many hosts are linked in the post
  #
  # Returns a Hash whose keys are the distinct hosts found in the post's links
  # and onebox URLs (every value is 1). Links without a host and links that
  # fail to parse are skipped. Returns {} when there are no links at all.
  def linked_hosts
    # raw_links must be evaluated first: it triggers the cook that fills
    # @onebox_urls.
    all_links = raw_links + @onebox_urls

    return {} if all_links.blank?
    return @linked_hosts unless @linked_hosts.nil?

    # Build into a local and assign at the end so that an unexpected error
    # part-way through never leaves a partially filled memo behind.
    hosts = {}

    all_links.each do |link|
      begin
        host = self.class.parse_uri_rfc2396(link).host
        hosts[host] ||= 1 unless host.nil?
      rescue URI::Error
        # An invalid URI does not count as a host
      end
    end

    @linked_hosts = hosts
  end

  # Returns an array of all links in a post excluding mentions
  #
  # Heading anchors and hashtag links are excluded as well. Each entry is the
  # link's href as a String. Returns [] for blank raw. The result is memoized,
  # including when it is empty.
  def raw_links
    return [] unless @raw.present?

    @raw_links ||=
      cooked_stripped
        .css("a")
        .reject do |link|
          # Don't include @mentions, heading anchors or hashtags in the link count
          link_is_a_mention?(link) || link_is_an_anchor?(link) || link_is_a_hashtag?(link)
        end
        .map { |link| link["href"].to_s }
  end

  # How many links are present in the post
  def link_count
    # raw_links is evaluated first so that cooking has populated @onebox_urls.
    raw_links.size + @onebox_urls.size
  end

  # The cooked post parsed into an HTML5 fragment and passed through
  # PostStripper.strip. Memoized, so @raw is cooked at most once via this path.
  def cooked_stripped
    @cooked_stripped ||=
      begin
        cooked = cook(@raw, topic_id: @topic_id)
        fragment = Nokogiri::HTML5.fragment(cooked)
        PostStripper.strip(fragment)
      end
  end

  private

  # Truthy when the link's class contains "mention" and its href points at a
  # user path (/u/ or /users/) under Discourse.base_path.
  def link_is_a_mention?(l)
    href = l["href"].to_s
    l["class"].to_s["mention"] &&
      (
        href.start_with?("#{Discourse.base_path}/u/") ||
          href.start_with?("#{Discourse.base_path}/users/")
      )
  end

  # Truthy when the link's class contains "anchor" and its href is a fragment.
  def link_is_an_anchor?(l)
    l["class"].to_s["anchor"] && l["href"].to_s.start_with?("#")
  end

  # Truthy when the link's class contains "hashtag" and its href points at a
  # category (/c/) or tag (/tag/) path under Discourse.base_path.
  def link_is_a_hashtag?(l)
    href = l["href"].to_s
    l["class"].to_s["hashtag"] &&
      (
        href.start_with?("#{Discourse.base_path}/c/") ||
          href.start_with?("#{Discourse.base_path}/tag/")
      )
  end
end
|