SECURITY: Use canonical url for topic embeddings (#22088)
This prevents duplicate topics from being created when using embed_urls that only differ on query params.
This commit is contained in:
parent
8189ea6858
commit
5e3106387f
|
@ -21,11 +21,15 @@ class TopicEmbed < ActiveRecord::Base
|
|||
end
|
||||
|
||||
class FetchResponse
|
||||
attr_accessor :title, :body, :author
|
||||
attr_accessor :title, :body, :author, :url
|
||||
end
|
||||
|
||||
def self.normalize_url(url)
|
||||
url.downcase.sub(%r{/$}, "").sub(/\-+/, "-").strip
|
||||
# downcase
|
||||
# remove trailing forward slash/
|
||||
# remove consecutive hyphens
|
||||
# remove leading and trailing whitespace
|
||||
url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip
|
||||
end
|
||||
|
||||
def self.imported_from_html(url)
|
||||
|
@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base
|
|||
|
||||
url = normalize_url(url)
|
||||
|
||||
embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
|
||||
embed = topic_embed_by_url(url)
|
||||
content_sha1 = Digest::SHA1.hexdigest(contents)
|
||||
post = nil
|
||||
|
||||
|
@ -123,6 +127,18 @@ class TopicEmbed < ActiveRecord::Base
|
|||
uri = fd.resolve
|
||||
return if uri.blank?
|
||||
|
||||
begin
|
||||
html = uri.read
|
||||
rescue OpenURI::HTTPError, Net::OpenTimeout
|
||||
return
|
||||
end
|
||||
|
||||
parse_html(html, uri.to_s)
|
||||
end
|
||||
|
||||
def self.parse_html(html, url)
|
||||
require "ruby-readability"
|
||||
|
||||
opts = {
|
||||
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
|
||||
attributes: %w[href src class],
|
||||
|
@ -139,14 +155,13 @@ class TopicEmbed < ActiveRecord::Base
|
|||
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
|
||||
|
||||
response = FetchResponse.new
|
||||
begin
|
||||
html = uri.read
|
||||
rescue OpenURI::HTTPError, Net::OpenTimeout
|
||||
return
|
||||
end
|
||||
|
||||
raw_doc = Nokogiri.HTML5(html)
|
||||
auth_element = raw_doc.at('meta[@name="author"]')
|
||||
|
||||
response.url = url
|
||||
|
||||
auth_element =
|
||||
raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
|
||||
if auth_element.present?
|
||||
response.author = User.where(username_lower: auth_element[:content].strip).first
|
||||
end
|
||||
|
@ -203,13 +218,15 @@ class TopicEmbed < ActiveRecord::Base
|
|||
response
|
||||
end
|
||||
|
||||
def self.import_remote(import_user, url, opts = nil)
|
||||
def self.import_remote(url, opts = nil)
|
||||
opts = opts || {}
|
||||
response = find_remote(url)
|
||||
return if response.nil?
|
||||
|
||||
response.title = opts[:title] if opts[:title].present?
|
||||
import_user = opts[:user] if opts[:user].present?
|
||||
import_user = response.author if response.author.present?
|
||||
url = normalize_url(response.url) if response.url.present?
|
||||
|
||||
TopicEmbed.import(import_user, url, response.title, response.body)
|
||||
end
|
||||
|
@ -253,11 +270,14 @@ class TopicEmbed < ActiveRecord::Base
|
|||
fragment.at("div").inner_html
|
||||
end
|
||||
|
||||
def self.topic_embed_by_url(embed_url)
|
||||
embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
|
||||
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first
|
||||
end
|
||||
|
||||
def self.topic_id_for_embed(embed_url)
|
||||
embed_url = normalize_url(embed_url).sub(%r{^https?\://}, "")
|
||||
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pluck_first(
|
||||
:topic_id,
|
||||
)
|
||||
topic_embed = topic_embed_by_url(embed_url)
|
||||
topic_embed&.topic_id
|
||||
end
|
||||
|
||||
def self.first_paragraph_from(html)
|
||||
|
|
|
@ -123,6 +123,17 @@ RSpec.describe TopicEmbed do
|
|||
expect(imported_post.topic.category).to eq(category)
|
||||
end
|
||||
|
||||
it "does not create duplicate topics with different protocols in the embed_url" do
|
||||
Jobs.run_immediately!
|
||||
expect {
|
||||
TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content")
|
||||
}.to change { Topic.all.count }.by(1)
|
||||
|
||||
expect {
|
||||
TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content")
|
||||
}.to_not change { Topic.all.count }
|
||||
end
|
||||
|
||||
it "creates the topic with the tag passed as a parameter" do
|
||||
Jobs.run_immediately!
|
||||
SiteSetting.tagging_enabled = true
|
||||
|
@ -395,21 +406,53 @@ RSpec.describe TopicEmbed do
|
|||
end
|
||||
|
||||
context "with canonical links" do
|
||||
fab!(:user) { Fabricate(:user) }
|
||||
let(:title) { "How to turn a fish from good to evil in 30 seconds" }
|
||||
let(:url) { "http://eviltrout.com/123?asd" }
|
||||
let(:canonical_url) { "http://eviltrout.com/123" }
|
||||
let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" }
|
||||
let(:canonical_url2) { "http://eviltrout.com/blog?post=1" }
|
||||
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
|
||||
let(:content2) { "<head><link rel=\"canonical\" href=\"#{canonical_url2}\"></head>" }
|
||||
let(:canonical_content) { "<title>Canonical</title><body></body>" }
|
||||
|
||||
before do
|
||||
stub_request(:get, url).to_return(status: 200, body: content)
|
||||
stub_request(:head, canonical_url)
|
||||
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
|
||||
|
||||
stub_request(:get, url2).to_return(status: 200, body: content2)
|
||||
stub_request(:head, canonical_url2)
|
||||
stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content)
|
||||
end
|
||||
|
||||
it "a" do
|
||||
it "fetches canonical content" do
|
||||
response = TopicEmbed.find_remote(url)
|
||||
|
||||
expect(response.title).to eq("Canonical")
|
||||
expect(response.url).to eq(canonical_url)
|
||||
end
|
||||
|
||||
it "does not create duplicate topics when url differs from canonical_url" do
|
||||
Jobs.run_immediately!
|
||||
expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change {
|
||||
Topic.all.count
|
||||
}.by(1)
|
||||
|
||||
expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change {
|
||||
Topic.all.count
|
||||
}
|
||||
end
|
||||
|
||||
it "does not create duplicate topics when url contains extra params" do
|
||||
Jobs.run_immediately!
|
||||
expect {
|
||||
TopicEmbed.import_remote(canonical_url2, { title: title, user: user })
|
||||
}.to change { Topic.all.count }.by(1)
|
||||
|
||||
expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change {
|
||||
Topic.all.count
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue