# frozen_string_literal: true

class TopicEmbed < ActiveRecord::Base
|
2017-04-24 14:29:04 -04:00
|
|
|
# Soft-deletion support (the table carries deleted_at / deleted_by_id columns).
include Trashable

# Maximum number of characters kept in embed_content_cache; longer content is
# truncated by the callers before it is stored.
EMBED_CONTENT_CACHE_MAX_LENGTH = 32_000

belongs_to :topic
belongs_to :post

# An embed must have a URL, and a given URL may only be embedded once.
validates :embed_url, presence: true, uniqueness: true
validates :embed_content_cache, length: { maximum: EMBED_CONTENT_CACHE_MAX_LENGTH }
|
2013-12-31 14:37:43 -05:00
|
|
|
|
2017-04-24 14:29:04 -04:00
|
|
|
# When a new embed is created, look for a soft-deleted embed with the same
# embed_url and destroy it for good (presumably so the new row does not
# collide with the trashed one on the unique embed_url index).
before_validation(on: :create) do
  trashed_embed =
    TopicEmbed.with_deleted.where("deleted_at IS NOT NULL AND embed_url = ?", embed_url).first

  trashed_embed&.destroy!
end
|
|
|
|
|
2016-08-30 12:01:04 -04:00
|
|
|
# Mutable holder for the pieces extracted from a fetched page: its title,
# the cleaned-up HTML body, the matched author (if any) and the page URL.
class FetchResponse
  attr_accessor :title
  attr_accessor :body
  attr_accessor :author
  attr_accessor :url
end
|
|
|
|
|
2014-03-19 16:33:21 -04:00
|
|
|
# Canonicalise a URL for storage and lookup.
#
# Steps, applied in this order:
#   1. downcase the whole string
#   2. drop a single trailing "/" (only if it is the very last character,
#      i.e. before surrounding whitespace has been stripped)
#   3. collapse the FIRST run of consecutive hyphens into one "-"
#      (later runs are left untouched)
#   4. strip leading and trailing whitespace
#
# @param url [String]
# @return [String] the normalised URL
def self.normalize_url(url)
  lowered = url.downcase
  without_trailing_slash = lowered.sub(%r{/\z}, "")
  hyphen_collapsed = without_trailing_slash.sub(/\-+/, "-")
  hyphen_collapsed.strip
end
|
|
|
|
|
2014-04-02 15:54:21 -04:00
|
|
|
# Build the HTML footer appended to imported content, linking back to the
# original article. The text is rendered in the site's default locale.
#
# NOTE(review): the encoded URL is interpolated into the href attribute and
# link text without HTML escaping - confirm UrlHelper.normalized_encode makes
# that safe for every input.
#
# @param url [String] URL of the source article
# @return [String] an "<hr>" followed by a "<small>" attribution line
def self.imported_from_html(url)
  encoded_url = UrlHelper.normalized_encode(url)

  I18n.with_locale(SiteSetting.default_locale) do
    source_link = "<a href='#{encoded_url}'>#{encoded_url}</a>"
    "\n<hr>\n<small>#{I18n.t("embed.imported_from", link: source_link)}</small>\n"
  end
end
|
|
|
|
|
2013-12-31 14:37:43 -05:00
|
|
|
# Import an article from a source (RSS/Atom/Other).
#
# When no embed exists for the URL, a topic, its first post and the embed
# record are created inside one transaction. Otherwise the existing post is
# re-owned by +user+ if needed and revised when the content hash or the title
# changed.
#
# @param user the user who should own the post
# @param url [String] source URL; must start with http:// or https://
# @param title [String, nil] topic title
# @param contents [String, nil] article HTML; nil is treated as empty
# @param category_id [Integer, nil] takes precedence over the embeddable
#   host's category
# @param cook_method [Integer, nil] when nil, picked from
#   SiteSetting.embed_support_markdown on creation; a nil value also allows
#   truncation to the first paragraph when SiteSetting.embed_truncate is on
# @param tags [Array, nil] only passed on when tagging is enabled
# @return the created or existing post; nil when +url+ is not http(s) or the
#   existing embed has no post
def self.import(user, url, title, contents, category_id: nil, cook_method: nil, tags: nil)
  return unless url =~ %r{\Ahttps?\://}

  # Normalise nil up front: the cache copy and first_paragraph_from below
  # both need a String.
  contents ||= ""
  original_contents = contents.truncate(EMBED_CONTENT_CACHE_MAX_LENGTH)
  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate && cook_method.nil?
  contents = contents.dup << imported_from_html(url)

  url = normalize_url(url)

  embed = topic_embed_by_url(url)
  # The hash covers the (possibly truncated) content plus the attribution footer.
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  # If there is no embed, create a topic, post and the embed.
  if embed.blank?
    Topic.transaction do
      eh = EmbeddableHost.record_for_url(url)

      cook_method ||=
        if SiteSetting.embed_support_markdown
          Post.cook_methods[:regular]
        else
          Post.cook_methods[:raw_html]
        end

      create_args = {
        title: title,
        raw: absolutize_urls(url, contents),
        skip_validations: true,
        cook_method: cook_method,
        category: category_id || eh.try(:category_id),
        tags: SiteSetting.tagging_enabled ? tags : nil,
        embed_url: url,
        embed_content_sha1: content_sha1,
      }

      post = PostCreator.create(user, create_args)
      post.topic.topic_embed.update!(embed_content_cache: original_contents)
    end
  else
    post = embed.post

    # Update the topic if it changed
    if post&.topic
      if post.user != user
        PostOwnerChanger.new(
          post_ids: [post.id],
          topic_id: post.topic_id,
          new_owner: user,
          acting_user: Discourse.system_user,
        ).change_owner!

        # make sure the post returned has the right author
        post.reload
      end

      if (content_sha1 != embed.content_sha1) || (title && title != post.topic.title)
        changes = { raw: absolutize_urls(url, contents) }
        changes[:title] = title if title.present?

        post.revise(user, changes, skip_validations: true, bypass_rate_limiter: true)
        embed.update!(content_sha1: content_sha1, embed_content_cache: original_contents)
      end
    end
  end

  post
end
|
|
|
|
|
2014-04-01 18:16:56 -04:00
|
|
|
# Resolve +url+ to its final destination, download the page and parse it.
#
# @param url [String] URL of the remote article
# @return the result of parse_html for the downloaded page, or nil when the
#   final destination cannot be resolved or the download fails with one of
#   the rescued HTTP / timeout / disallowed-IP errors
# @raise when the encoded URL cannot be parsed by URI.parse
def self.find_remote(url)
  encoded_url = UrlHelper.normalized_encode(url)
  URI.parse(encoded_url) # ensure url parses, will raise if not

  resolver =
    FinalDestination.new(encoded_url, validate_uri: true, max_redirects: 5, follow_canonical: true)
  final_uri = resolver.resolve
  return if final_uri.blank?

  page_html =
    begin
      FinalDestination::HTTP.get(final_uri)
    rescue OpenURI::HTTPError, Net::OpenTimeout, FinalDestination::SSRFDetector::DisallowedIpError
      return
    end

  parse_html(page_html, final_uri.to_s)
end
|
|
|
|
|
|
|
|
# Extract the readable article from a fetched HTML page.
#
# The author is looked up from a "discourse-username" (preferred) or
# "author" meta tag, the title comes from the document title (optionally
# scrubbed with SiteSetting.embed_title_scrubber), and the body is the
# Readability extraction with URLs made absolute and class attributes
# reduced to the configured allowlist.
#
# @param html [String] raw HTML of the page
# @param url [String] URL the page was fetched from; base for relative URLs
# @return [FetchResponse] with url, title, body and possibly author set
def self.parse_html(html, url)
  require "ruby-readability" # loaded lazily, only when a page is parsed

  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote figure figcaption],
    attributes: %w[href src class],
    remove_empty_nodes: false,
  }

  opts[:whitelist] = SiteSetting.allowed_embed_selectors if SiteSetting.allowed_embed_selectors.present?
  opts[:blacklist] = SiteSetting.blocked_embed_selectors if SiteSetting.blocked_embed_selectors.present?
  allowed_embed_classnames =
    SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?

  response = FetchResponse.new

  raw_doc = Nokogiri.HTML5(html)

  response.url = url

  auth_element =
    raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
  # The meta tag may have no content attribute, and the lookup column holds
  # lower-cased usernames, so normalise before querying.
  author_name = auth_element && auth_element[:content].to_s.strip.downcase
  response.author = User.where(username_lower: author_name).first if author_name.present?

  read_doc = Readability::Document.new(html, opts)

  title = +(raw_doc.title || "")
  title.strip!

  if SiteSetting.embed_title_scrubber.present?
    title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), "")
    title.strip!
  end
  response.title = title

  doc = Nokogiri.HTML5(read_doc.content)

  # Tag name => attribute that carries its URL.
  tags = { "img" => "src", "script" => "src", "a" => "href" }
  doc
    .search(tags.keys.join(","))
    .each do |node|
      url_param = tags[node.name]
      src = node[url_param]
      next if src.nil? || src.empty?

      begin
        # convert URL to absolute form
        node[url_param] = URI.join(url, UrlHelper.normalized_encode(src)).to_s
      rescue URI::Error, Addressable::URI::InvalidURIError
        # If there is a mistyped URL, just do nothing
      end
    end

  # Only allow classes in the allowlist. This runs once for the whole
  # document, independently of the URL pass above, so classes are filtered
  # even when the content has no img/script/a elements.
  allowed_classes =
    if allowed_embed_classnames.blank?
      []
    else
      allowed_embed_classnames.split(/[ ,]+/i)
    end

  doc
    .search('[class]:not([class=""])')
    .each do |classnode|
      classes = classnode[:class].split(" ").select { |classname| allowed_classes.include?(classname) }

      if classes.empty?
        classnode.delete("class")
      else
        classnode[:class] = classes.join(" ")
      end
    end

  response.body = doc.at("body").children.to_html
  response
end
|
2013-12-31 14:37:43 -05:00
|
|
|
|
2023-02-28 07:31:59 -05:00
|
|
|
# Fetch a remote page and import it as an embedded topic.
#
# @param url [String] URL of the article to fetch
# @param opts [Hash, nil] optional overrides:
#   :title - used instead of the fetched page title when present
#   :user  - post owner, used only when the page does not name a known author
# @return the result of TopicEmbed.import, or nil when the page could not
#   be fetched
def self.import_remote(url, opts = nil)
  options = opts || {}

  fetched = find_remote(url)
  return if fetched.nil?

  fetched.title = options[:title] if options[:title].present?

  # The author found on the page wins over the caller-supplied user.
  import_user = [fetched.author, options[:user]].find(&:present?)

  # Prefer the URL reported by the fetch over the one we were given.
  url = normalize_url(fetched.url) if fetched.url.present?

  TopicEmbed.import(import_user, url, fetched.title, fetched.body)
end
|
|
|
|
|
|
|
|
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# Link (a[href]) and image (img[src]) URLs in +contents+ are resolved against
# the scheme, host and port of +url+ (its path is not used as the base).
#
# @param url [String] URL of the page the content came from
# @param contents [String] HTML fragment
# @return [String] the rewritten HTML, or +contents+ unchanged when +url+
#   cannot be turned into a URI
def self.absolutize_urls(url, contents)
  url = normalize_url(url)
  begin
    uri = URI(UrlHelper.normalized_encode(url))
  rescue URI::Error, Addressable::URI::InvalidURIError
    # Same pair of errors that parse_html guards against around this helper.
    return contents
  end

  prefix = "#{uri.scheme}://#{uri.host}"
  # Keep the port whenever it is not the default for the scheme, instead of
  # assuming 80 and 443 are always implicit (e.g. https on port 80).
  prefix += ":#{uri.port}" if uri.port != uri.default_port

  # Wrap in a div so the rewritten markup can be read back as one inner_html.
  fragment = Nokogiri::HTML5.fragment("<div>#{contents}</div>")

  # Tag name => attribute holding the URL; links first, then images.
  { "a" => "href", "img" => "src" }.each do |tag, attribute|
    fragment
      .css(tag)
      .each do |node|
        next unless node[attribute].present?

        begin
          node[attribute] = URI.join(prefix, node[attribute]).to_s
        rescue URI::InvalidURIError
          # NOOP, URL is malformed
        end
      end
  end

  fragment.at("div").inner_html
end
|
|
|
|
|
2023-06-13 13:08:08 -04:00
|
|
|
# Find the embed stored for a URL, ignoring the http/https distinction and
# letter case.
#
# The URL is normalised, its scheme is removed, and the remainder is escaped
# and matched with a case-insensitive regex that accepts either scheme.
#
# @param embed_url [String]
# @return the first matching TopicEmbed, or nil
def self.topic_embed_by_url(embed_url)
  schemeless_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
  url_pattern = "^https?://#{Regexp.escape(schemeless_url)}$"

  TopicEmbed.where("embed_url ~* ?", url_pattern).first
end
|
|
|
|
|
|
|
|
# Look up the topic id associated with an embed URL.
#
# @param embed_url [String]
# @return the topic_id of the matching embed, or nil when none is found
def self.topic_id_for_embed(embed_url)
  topic_embed_by_url(embed_url)&.topic_id
end
|
|
|
|
|
2014-03-18 18:02:33 -04:00
|
|
|
# Produce a short excerpt of an HTML document.
#
# Non-empty <p> elements are appended (as markup) in document order until at
# least 100 characters of markup have been collected. If the document has no
# paragraph with text, the first <div> is returned instead.
#
# @param html [String]
# @return [String] excerpt markup; empty when neither a paragraph with text
#   nor a div exists
def self.first_paragraph_from(html)
  doc = Nokogiri.HTML5(html)
  excerpt = +""

  doc
    .css("p")
    .each do |paragraph|
      next unless paragraph.text.present?

      excerpt << paragraph.to_s
      return excerpt if excerpt.size >= 100
    end

  # If there is no first paragraph, return the first div (onebox)
  excerpt.blank? ? doc.css("div").first.to_s : excerpt
end
|
2014-04-03 11:30:43 -04:00
|
|
|
|
|
|
|
# Return the full remote content for an embedded post, with the attribution
# footer appended. The result is cached per topic for 10 minutes; on a cache
# miss the page is fetched again and, when the topic has an embed and the
# body is non-empty, the (truncated) body is saved to embed_content_cache.
#
# NOTE(review): TopicEmbed.find_remote can return nil, in which case the
# `.body` call below raises NoMethodError - confirm callers handle that.
#
# @param post the first post of an embedded topic
# @return [String] expanded HTML body
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"

  Discourse
    .cache
    .fetch(cache_key, expires_in: 10.minutes) do
      embed_url = TopicEmbed.where(topic_id: post.topic_id).pick(:embed_url)
      remote_body = TopicEmbed.find_remote(embed_url).body

      topic_embed = post&.topic&.topic_embed
      if topic_embed && remote_body.present?
        topic_embed.update!(
          embed_content_cache: remote_body.truncate(EMBED_CONTENT_CACHE_MAX_LENGTH),
        )
      end

      # `<<` appends in place and returns the same string, which is the
      # value that gets cached.
      remote_body << TopicEmbed.imported_from_html(embed_url)
    end
end
|
2013-12-31 14:37:43 -05:00
|
|
|
end
|
2014-02-06 19:07:36 -05:00
|
|
|
|
|
|
|
# == Schema Information
#
# Table name: topic_embeds
#
#  id                  :integer          not null, primary key
#  topic_id            :integer          not null
#  post_id             :integer          not null
#  embed_url           :string(1000)     not null
#  content_sha1        :string(40)
#  created_at          :datetime         not null
#  updated_at          :datetime         not null
#  deleted_at          :datetime
#  deleted_by_id       :integer
#  embed_content_cache :text
#
# Indexes
#
#  index_topic_embeds_on_embed_url  (embed_url) UNIQUE
#
|