SECURITY: Use canonical url for topic embeddings (#22088)
This prevents duplicate topics from being created when using embed_urls that only differ on query params.
This commit is contained in:
parent
8189ea6858
commit
5e3106387f
|
@ -21,11 +21,15 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
end
|
end
|
||||||
|
|
||||||
class FetchResponse
|
class FetchResponse
|
||||||
attr_accessor :title, :body, :author
|
attr_accessor :title, :body, :author, :url
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.normalize_url(url)
|
def self.normalize_url(url)
|
||||||
url.downcase.sub(%r{/$}, "").sub(/\-+/, "-").strip
|
# downcase
|
||||||
|
# remove trailing forward slash/
|
||||||
|
# remove consecutive hyphens
|
||||||
|
# remove leading and trailing whitespace
|
||||||
|
url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.imported_from_html(url)
|
def self.imported_from_html(url)
|
||||||
|
@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
|
|
||||||
url = normalize_url(url)
|
url = normalize_url(url)
|
||||||
|
|
||||||
embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
|
embed = topic_embed_by_url(url)
|
||||||
content_sha1 = Digest::SHA1.hexdigest(contents)
|
content_sha1 = Digest::SHA1.hexdigest(contents)
|
||||||
post = nil
|
post = nil
|
||||||
|
|
||||||
|
@ -123,6 +127,18 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
uri = fd.resolve
|
uri = fd.resolve
|
||||||
return if uri.blank?
|
return if uri.blank?
|
||||||
|
|
||||||
|
begin
|
||||||
|
html = uri.read
|
||||||
|
rescue OpenURI::HTTPError, Net::OpenTimeout
|
||||||
|
return
|
||||||
|
end
|
||||||
|
|
||||||
|
parse_html(html, uri.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.parse_html(html, url)
|
||||||
|
require "ruby-readability"
|
||||||
|
|
||||||
opts = {
|
opts = {
|
||||||
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
|
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
|
||||||
attributes: %w[href src class],
|
attributes: %w[href src class],
|
||||||
|
@ -139,14 +155,13 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
|
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
|
||||||
|
|
||||||
response = FetchResponse.new
|
response = FetchResponse.new
|
||||||
begin
|
|
||||||
html = uri.read
|
|
||||||
rescue OpenURI::HTTPError, Net::OpenTimeout
|
|
||||||
return
|
|
||||||
end
|
|
||||||
|
|
||||||
raw_doc = Nokogiri.HTML5(html)
|
raw_doc = Nokogiri.HTML5(html)
|
||||||
auth_element = raw_doc.at('meta[@name="author"]')
|
|
||||||
|
response.url = url
|
||||||
|
|
||||||
|
auth_element =
|
||||||
|
raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
|
||||||
if auth_element.present?
|
if auth_element.present?
|
||||||
response.author = User.where(username_lower: auth_element[:content].strip).first
|
response.author = User.where(username_lower: auth_element[:content].strip).first
|
||||||
end
|
end
|
||||||
|
@ -203,13 +218,15 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
response
|
response
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.import_remote(import_user, url, opts = nil)
|
def self.import_remote(url, opts = nil)
|
||||||
opts = opts || {}
|
opts = opts || {}
|
||||||
response = find_remote(url)
|
response = find_remote(url)
|
||||||
return if response.nil?
|
return if response.nil?
|
||||||
|
|
||||||
response.title = opts[:title] if opts[:title].present?
|
response.title = opts[:title] if opts[:title].present?
|
||||||
|
import_user = opts[:user] if opts[:user].present?
|
||||||
import_user = response.author if response.author.present?
|
import_user = response.author if response.author.present?
|
||||||
|
url = normalize_url(response.url) if response.url.present?
|
||||||
|
|
||||||
TopicEmbed.import(import_user, url, response.title, response.body)
|
TopicEmbed.import(import_user, url, response.title, response.body)
|
||||||
end
|
end
|
||||||
|
@ -253,11 +270,14 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
fragment.at("div").inner_html
|
fragment.at("div").inner_html
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.topic_embed_by_url(embed_url)
|
||||||
|
embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
|
||||||
|
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first
|
||||||
|
end
|
||||||
|
|
||||||
def self.topic_id_for_embed(embed_url)
|
def self.topic_id_for_embed(embed_url)
|
||||||
embed_url = normalize_url(embed_url).sub(%r{^https?\://}, "")
|
topic_embed = topic_embed_by_url(embed_url)
|
||||||
TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pluck_first(
|
topic_embed&.topic_id
|
||||||
:topic_id,
|
|
||||||
)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.first_paragraph_from(html)
|
def self.first_paragraph_from(html)
|
||||||
|
|
|
@ -123,6 +123,17 @@ RSpec.describe TopicEmbed do
|
||||||
expect(imported_post.topic.category).to eq(category)
|
expect(imported_post.topic.category).to eq(category)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "does not create duplicate topics with different protocols in the embed_url" do
|
||||||
|
Jobs.run_immediately!
|
||||||
|
expect {
|
||||||
|
TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content")
|
||||||
|
}.to change { Topic.all.count }.by(1)
|
||||||
|
|
||||||
|
expect {
|
||||||
|
TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content")
|
||||||
|
}.to_not change { Topic.all.count }
|
||||||
|
end
|
||||||
|
|
||||||
it "creates the topic with the tag passed as a parameter" do
|
it "creates the topic with the tag passed as a parameter" do
|
||||||
Jobs.run_immediately!
|
Jobs.run_immediately!
|
||||||
SiteSetting.tagging_enabled = true
|
SiteSetting.tagging_enabled = true
|
||||||
|
@ -395,21 +406,53 @@ RSpec.describe TopicEmbed do
|
||||||
end
|
end
|
||||||
|
|
||||||
context "with canonical links" do
|
context "with canonical links" do
|
||||||
|
fab!(:user) { Fabricate(:user) }
|
||||||
|
let(:title) { "How to turn a fish from good to evil in 30 seconds" }
|
||||||
let(:url) { "http://eviltrout.com/123?asd" }
|
let(:url) { "http://eviltrout.com/123?asd" }
|
||||||
let(:canonical_url) { "http://eviltrout.com/123" }
|
let(:canonical_url) { "http://eviltrout.com/123" }
|
||||||
|
let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" }
|
||||||
|
let(:canonical_url2) { "http://eviltrout.com/blog?post=1" }
|
||||||
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
|
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
|
||||||
|
let(:content2) { "<head><link rel=\"canonical\" href=\"#{canonical_url2}\"></head>" }
|
||||||
let(:canonical_content) { "<title>Canonical</title><body></body>" }
|
let(:canonical_content) { "<title>Canonical</title><body></body>" }
|
||||||
|
|
||||||
before do
|
before do
|
||||||
stub_request(:get, url).to_return(status: 200, body: content)
|
stub_request(:get, url).to_return(status: 200, body: content)
|
||||||
stub_request(:head, canonical_url)
|
stub_request(:head, canonical_url)
|
||||||
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
|
stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content)
|
||||||
|
|
||||||
|
stub_request(:get, url2).to_return(status: 200, body: content2)
|
||||||
|
stub_request(:head, canonical_url2)
|
||||||
|
stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content)
|
||||||
end
|
end
|
||||||
|
|
||||||
it "a" do
|
it "fetches canonical content" do
|
||||||
response = TopicEmbed.find_remote(url)
|
response = TopicEmbed.find_remote(url)
|
||||||
|
|
||||||
expect(response.title).to eq("Canonical")
|
expect(response.title).to eq("Canonical")
|
||||||
|
expect(response.url).to eq(canonical_url)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not create duplicate topics when url differs from canonical_url" do
|
||||||
|
Jobs.run_immediately!
|
||||||
|
expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change {
|
||||||
|
Topic.all.count
|
||||||
|
}.by(1)
|
||||||
|
|
||||||
|
expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change {
|
||||||
|
Topic.all.count
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not create duplicate topics when url contains extra params" do
|
||||||
|
Jobs.run_immediately!
|
||||||
|
expect {
|
||||||
|
TopicEmbed.import_remote(canonical_url2, { title: title, user: user })
|
||||||
|
}.to change { Topic.all.count }.by(1)
|
||||||
|
|
||||||
|
expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change {
|
||||||
|
Topic.all.count
|
||||||
|
}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue