SECURITY: Use canonical url for topic embeddings (#22085)
This prevents duplicate topics from being created when using embed_urls that only differ on query params.
This commit is contained in:
parent
56b74e6042
commit
644dded000
|
@ -21,10 +21,14 @@ class TopicEmbed < ActiveRecord::Base
|
|||
end
|
||||
|
||||
class FetchResponse
|
||||
attr_accessor :title, :body, :author
|
||||
attr_accessor :title, :body, :author, :url
|
||||
end
|
||||
|
||||
def self.normalize_url(url)
|
||||
# downcase
|
||||
# remove trailing forward slash/
|
||||
# remove consecutive hyphens
|
||||
# remove leading and trailing whitespace
|
||||
url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip
|
||||
end
|
||||
|
||||
|
@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base
|
|||
|
||||
url = normalize_url(url)
|
||||
|
||||
embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
|
||||
embed = topic_embed_by_url(url)
|
||||
content_sha1 = Digest::SHA1.hexdigest(contents)
|
||||
post = nil
|
||||
|
||||
|
@ -127,7 +131,7 @@ class TopicEmbed < ActiveRecord::Base
|
|||
return
|
||||
end
|
||||
|
||||
parse_html(html, url)
|
||||
parse_html(html, uri.to_s)
|
||||
end
|
||||
|
||||
def self.parse_html(html, url)
|
||||
|
@ -151,6 +155,9 @@ class TopicEmbed < ActiveRecord::Base
|
|||
response = FetchResponse.new
|
||||
|
||||
raw_doc = Nokogiri.HTML5(html)
|
||||
|
||||
response.url = url
|
||||
|
||||
auth_element =
|
||||
raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
|
||||
if auth_element.present?
|
||||
|
@ -217,6 +224,7 @@ class TopicEmbed < ActiveRecord::Base
|
|||
response.title = opts[:title] if opts[:title].present?
|
||||
import_user = opts[:user] if opts[:user].present?
|
||||
import_user = response.author if response.author.present?
|
||||
url = normalize_url(response.url) if response.url.present?
|
||||
|
||||
TopicEmbed.import(import_user, url, response.title, response.body)
|
||||
end
|
||||
|
@ -260,9 +268,14 @@ class TopicEmbed < ActiveRecord::Base
|
|||
fragment.at("div").inner_html
|
||||
end
|
||||
|
||||
def self.topic_id_for_embed(embed_url)
|
||||
def self.topic_embed_by_url(embed_url)
|
||||
embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
|
||||
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pick(:topic_id)
|
||||
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first
|
||||
end
|
||||
|
||||
def self.topic_id_for_embed(embed_url)
|
||||
topic_embed = topic_embed_by_url(embed_url)
|
||||
topic_embed&.topic_id
|
||||
end
|
||||
|
||||
def self.first_paragraph_from(html)
|
||||
|
|
|
@ -158,6 +158,17 @@ RSpec.describe TopicEmbed do
|
|||
expect(imported_post.topic.category).to eq(category)
|
||||
end
|
||||
|
||||
it "does not create duplicate topics with different protocols in the embed_url" do
|
||||
Jobs.run_immediately!
|
||||
expect {
|
||||
TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content")
|
||||
}.to change { Topic.all.count }.by(1)
|
||||
|
||||
expect {
|
||||
TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content")
|
||||
}.to_not change { Topic.all.count }
|
||||
end
|
||||
|
||||
it "creates the topic with the tag passed as a parameter" do
|
||||
Jobs.run_immediately!
|
||||
SiteSetting.tagging_enabled = true
|
||||
|
@ -430,21 +441,53 @@ RSpec.describe TopicEmbed do
|
|||
end
|
||||
|
||||
context "with canonical links" do
|
||||
fab!(:user) { Fabricate(:user) }
|
||||
let(:title) { "How to turn a fish from good to evil in 30 seconds" }
|
||||
let(:url) { "http://eviltrout.com/123?asd" }
|
||||
let(:canonical_url) { "http://eviltrout.com/123" }
|
||||
let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" }
|
||||
let(:canonical_url2) { "http://eviltrout.com/blog?post=1" }
|
||||
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
|
||||
let(:content2) { "<head><link rel=\"canonical\" href=\"#{canonical_url2}\"></head>" }
|
||||
let(:canonical_content) { "<title>Canonical</title><body></body>" }
|
||||
|
||||
before do
|
||||
stub_request(:get, url).to_return(status: 200, body: content)
|
||||
stub_request(:head, canonical_url)
|
||||
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
|
||||
|
||||
stub_request(:get, url2).to_return(status: 200, body: content2)
|
||||
stub_request(:head, canonical_url2)
|
||||
stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content)
|
||||
end
|
||||
|
||||
it "a" do
|
||||
it "fetches canonical content" do
|
||||
response = TopicEmbed.find_remote(url)
|
||||
|
||||
expect(response.title).to eq("Canonical")
|
||||
expect(response.url).to eq(canonical_url)
|
||||
end
|
||||
|
||||
it "does not create duplicate topics when url differs from canonical_url" do
|
||||
Jobs.run_immediately!
|
||||
expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change {
|
||||
Topic.all.count
|
||||
}.by(1)
|
||||
|
||||
expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change {
|
||||
Topic.all.count
|
||||
}
|
||||
end
|
||||
|
||||
it "does not create duplicate topics when url contains extra params" do
|
||||
Jobs.run_immediately!
|
||||
expect {
|
||||
TopicEmbed.import_remote(canonical_url2, { title: title, user: user })
|
||||
}.to change { Topic.all.count }.by(1)
|
||||
|
||||
expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change {
|
||||
Topic.all.count
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue