SECURITY: Use canonical url for topic embeddings (#22088)

This prevents duplicate topics from being created when the embed URLs
passed in differ only in their query parameters.
This commit is contained in:
Blake Erickson 2023-06-13 11:09:23 -06:00 committed by GitHub
parent 8189ea6858
commit 5e3106387f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 78 additions and 15 deletions

View File

@ -21,11 +21,15 @@ class TopicEmbed < ActiveRecord::Base
end
# Plain value holder for the result of fetching and parsing a remote page.
# parse_html fills in `url` and `author`; import_remote reads `title`, `body`,
# `author` and `url` from it.
class FetchResponse
attr_accessor :title, :body, :author
# NOTE(review): this declaration is a superset of the one above (it adds
# :url). The two lines look like the before/after sides of the diff rather
# than two intended declarations -- confirm against the real file.
attr_accessor :title, :body, :author, :url
end
# Normalizes an embed URL so that trivially different spellings of the same
# URL compare equal. Returns a new string; the argument is not mutated
# (only non-bang String methods are used).
def self.normalize_url(url)
# NOTE(review): the value of the next expression is discarded. It differs from
# the final expression only in anchoring the slash with `$` instead of `\z`,
# and looks like the pre-change side of the diff -- confirm against the real file.
url.downcase.sub(%r{/$}, "").sub(/\-+/, "-").strip
# Steps, applied in this order:
# 1. downcase the whole string
# 2. remove one trailing forward slash; `\z` anchors at the very end of the
#    string, whereas `$` can also match just before a newline
# 3. collapse the first run of consecutive hyphens into a single hyphen
#    (`sub` replaces only the first match; later runs are left as-is)
# 4. remove leading and trailing whitespace
# Because whitespace is stripped last, a slash that is followed by trailing
# whitespace is not removed by step 2.
url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip
end
def self.imported_from_html(url)
@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base
url = normalize_url(url)
embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
embed = topic_embed_by_url(url)
content_sha1 = Digest::SHA1.hexdigest(contents)
post = nil
@ -123,6 +127,18 @@ class TopicEmbed < ActiveRecord::Base
uri = fd.resolve
return if uri.blank?
begin
html = uri.read
rescue OpenURI::HTTPError, Net::OpenTimeout
return
end
parse_html(html, uri.to_s)
end
def self.parse_html(html, url)
require "ruby-readability"
opts = {
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
attributes: %w[href src class],
@ -139,14 +155,13 @@ class TopicEmbed < ActiveRecord::Base
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
response = FetchResponse.new
begin
html = uri.read
rescue OpenURI::HTTPError, Net::OpenTimeout
return
end
raw_doc = Nokogiri.HTML5(html)
auth_element = raw_doc.at('meta[@name="author"]')
response.url = url
auth_element =
raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
if auth_element.present?
response.author = User.where(username_lower: auth_element[:content].strip).first
end
@ -203,13 +218,15 @@ class TopicEmbed < ActiveRecord::Base
response
end
# Fetches the page at `url` with find_remote and hands the result to
# TopicEmbed.import.
#
# opts keys read here:
#   :title - when present, replaces the fetched title
#   :user  - when present, used as the importing user
#
# Returns nil when find_remote returns nil; otherwise returns whatever
# TopicEmbed.import returns.
#
# NOTE(review): two signatures appear back to back below. They look like the
# before/after sides of the diff (the newer one drops the positional
# import_user in favour of opts[:user]) -- confirm against the real file.
def self.import_remote(import_user, url, opts = nil)
def self.import_remote(url, opts = nil)
# Allow callers to omit opts entirely.
opts = opts || {}
response = find_remote(url)
return if response.nil?
response.title = opts[:title] if opts[:title].present?
# The author found on the fetched page takes precedence over opts[:user]
# because it is assigned second. With the two-argument signature,
# import_user is nil when neither is present.
import_user = opts[:user] if opts[:user].present?
import_user = response.author if response.author.present?
# Prefer the URL reported on the response over the one that was requested,
# so lookups and storage key on that URL.
url = normalize_url(response.url) if response.url.present?
TopicEmbed.import(import_user, url, response.title, response.body)
end
@ -253,11 +270,14 @@ class TopicEmbed < ActiveRecord::Base
fragment.at("div").inner_html
end
# Finds the first TopicEmbed whose embed_url matches the given URL under
# either the http or the https scheme.
#
# The URL is normalized, its scheme is removed, and the remainder is escaped
# and wrapped in an anchored pattern that accepts both schemes. The pattern is
# passed as a bind parameter to a `~*` match on the embed_url column.
#
# Returns the result of `.first` on that query.
def self.topic_embed_by_url(embed_url)
  schemeless = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
  pattern = "^https?://#{Regexp.escape(schemeless)}$"

  TopicEmbed.where("embed_url ~* ?", pattern).first
end
# Returns the topic_id of the TopicEmbed matching embed_url, or nil when
# topic_embed_by_url returns nil.
def self.topic_id_for_embed(embed_url)
# NOTE(review): the next four lines look like the pre-change side of the diff.
# The result of the `where(...).pluck_first` query is not used, and
# embed_url is normalized again inside topic_embed_by_url -- confirm against
# the real file.
embed_url = normalize_url(embed_url).sub(%r{^https?\://}, "")
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pluck_first(
:topic_id,
)
# Delegate the lookup to topic_embed_by_url; `&.` yields nil when no record
# was found.
topic_embed = topic_embed_by_url(embed_url)
topic_embed&.topic_id
end
def self.first_paragraph_from(html)

View File

@ -123,6 +123,17 @@ RSpec.describe TopicEmbed do
expect(imported_post.topic.category).to eq(category)
end
it "does not create duplicate topics with different protocols in the embed_url" do
  Jobs.run_immediately!

  # Same host and path, differing only in scheme.
  http_url = "http://eviltrout.com/abcd"
  https_url = "https://eviltrout.com/abcd"

  # The first import creates exactly one topic.
  expect { TopicEmbed.import(user, http_url, title, "some random content") }.to change {
    Topic.all.count
  }.by(1)

  # Importing the https variant must not add another topic.
  expect { TopicEmbed.import(user, https_url, title, "some random content") }.to_not change {
    Topic.all.count
  }
end
it "creates the topic with the tag passed as a parameter" do
Jobs.run_immediately!
SiteSetting.tagging_enabled = true
@ -395,21 +406,53 @@ RSpec.describe TopicEmbed do
end
# Covers pages that declare a <link rel="canonical">: fetching and importing
# should key on the canonical URL rather than on the URL that was requested.
context "with canonical links" do
fab!(:user) { Fabricate(:user) }
let(:title) { "How to turn a fish from good to evil in 30 seconds" }
# Pair 1: the requested URL carries a query string, the canonical URL has none.
let(:url) { "http://eviltrout.com/123?asd" }
let(:canonical_url) { "http://eviltrout.com/123" }
# Pair 2: both URLs carry query params; the requested one has an extra
# `canonical=false` param.
let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" }
let(:canonical_url2) { "http://eviltrout.com/blog?post=1" }
# Bodies served for the non-canonical URLs: each only points at its canonical URL.
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
let(:content2) { "<head><link rel=\"canonical\" href=\"#{canonical_url2}\"></head>" }
# Body served for both canonical URLs; its title is what the first example asserts on.
let(:canonical_content) { "<title>Canonical</title><body></body>" }
before do
# Stub GET for each requested URL, plus HEAD and GET for each canonical URL.
stub_request(:get, url).to_return(status: 200, body: content)
stub_request(:head, canonical_url)
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
stub_request(:get, url2).to_return(status: 200, body: content2)
stub_request(:head, canonical_url2)
stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content)
end
# NOTE(review): the next two lines look like the before/after sides of the
# diff (the example was renamed from "a"); only one `it ... do` is matched by
# an `end` inside this context -- confirm against the real file.
it "a" do
it "fetches canonical content" do
response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Canonical")
expect(response.url).to eq(canonical_url)
end
# Import the canonical URL first, then the query-string variant: the second
# import must not add a topic.
it "does not create duplicate topics when url differs from canonical_url" do
Jobs.run_immediately!
expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change {
Topic.all.count
}.by(1)
expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change {
Topic.all.count
}
end
# Same check for the pair whose canonical URL itself has query params.
it "does not create duplicate topics when url contains extra params" do
Jobs.run_immediately!
expect {
TopicEmbed.import_remote(canonical_url2, { title: title, user: user })
}.to change { Topic.all.count }.by(1)
expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change {
Topic.all.count
}
end
end
end