SECURITY: Use canonical url for topic embeddings (#22085)

This prevents duplicate topics from being created when using embed_urls
that only differ on query params.
This commit is contained in:
Blake Erickson 2023-06-13 11:08:08 -06:00 committed by GitHub
parent 56b74e6042
commit 644dded000
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 6 deletions

View File

@ -21,10 +21,14 @@ class TopicEmbed < ActiveRecord::Base
end
class FetchResponse
attr_accessor :title, :body, :author
attr_accessor :title, :body, :author, :url
end
def self.normalize_url(url)
# downcase
# remove trailing forward slash/
# remove consecutive hyphens
# remove leading and trailing whitespace
url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip
end
@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base
url = normalize_url(url)
embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
embed = topic_embed_by_url(url)
content_sha1 = Digest::SHA1.hexdigest(contents)
post = nil
@ -127,7 +131,7 @@ class TopicEmbed < ActiveRecord::Base
return
end
parse_html(html, url)
parse_html(html, uri.to_s)
end
def self.parse_html(html, url)
@ -151,6 +155,9 @@ class TopicEmbed < ActiveRecord::Base
response = FetchResponse.new
raw_doc = Nokogiri.HTML5(html)
response.url = url
auth_element =
raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
if auth_element.present?
@ -217,6 +224,7 @@ class TopicEmbed < ActiveRecord::Base
response.title = opts[:title] if opts[:title].present?
import_user = opts[:user] if opts[:user].present?
import_user = response.author if response.author.present?
url = normalize_url(response.url) if response.url.present?
TopicEmbed.import(import_user, url, response.title, response.body)
end
@ -260,9 +268,14 @@ class TopicEmbed < ActiveRecord::Base
fragment.at("div").inner_html
end
def self.topic_id_for_embed(embed_url)
def self.topic_embed_by_url(embed_url)
embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pick(:topic_id)
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first
end
def self.topic_id_for_embed(embed_url)
topic_embed = topic_embed_by_url(embed_url)
topic_embed&.topic_id
end
def self.first_paragraph_from(html)

View File

@ -158,6 +158,17 @@ RSpec.describe TopicEmbed do
expect(imported_post.topic.category).to eq(category)
end
it "does not create duplicate topics with different protocols in the embed_url" do
Jobs.run_immediately!
expect {
TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content")
}.to change { Topic.all.count }.by(1)
expect {
TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content")
}.to_not change { Topic.all.count }
end
it "creates the topic with the tag passed as a parameter" do
Jobs.run_immediately!
SiteSetting.tagging_enabled = true
@ -430,21 +441,53 @@ RSpec.describe TopicEmbed do
end
context "with canonical links" do
fab!(:user) { Fabricate(:user) }
let(:title) { "How to turn a fish from good to evil in 30 seconds" }
let(:url) { "http://eviltrout.com/123?asd" }
let(:canonical_url) { "http://eviltrout.com/123" }
let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" }
let(:canonical_url2) { "http://eviltrout.com/blog?post=1" }
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
let(:content2) { "<head><link rel=\"canonical\" href=\"#{canonical_url2}\"></head>" }
let(:canonical_content) { "<title>Canonical</title><body></body>" }
before do
stub_request(:get, url).to_return(status: 200, body: content)
stub_request(:head, canonical_url)
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
stub_request(:get, url2).to_return(status: 200, body: content2)
stub_request(:head, canonical_url2)
stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content)
end
it "a" do
it "fetches canonical content" do
response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Canonical")
expect(response.url).to eq(canonical_url)
end
it "does not create duplicate topics when url differs from canonical_url" do
Jobs.run_immediately!
expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change {
Topic.all.count
}.by(1)
expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change {
Topic.all.count
}
end
it "does not create duplicate topics when url contains extra params" do
Jobs.run_immediately!
expect {
TopicEmbed.import_remote(canonical_url2, { title: title, user: user })
}.to change { Topic.all.count }.by(1)
expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change {
Topic.all.count
}
end
end
end