From 5e3106387feb6de59d749202649971fb3c36ed41 Mon Sep 17 00:00:00 2001 From: Blake Erickson Date: Tue, 13 Jun 2023 11:09:23 -0600 Subject: [PATCH] SECURITY: Use canonical url for topic embeddings (#22088) This prevents duplicate topics from being created when using embed_urls that only differ on query params. --- app/models/topic_embed.rb | 48 +++++++++++++++++++++++---------- spec/models/topic_embed_spec.rb | 45 ++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 15 deletions(-) diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index 98754b5fe73..a318f7e576e 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -21,11 +21,15 @@ class TopicEmbed < ActiveRecord::Base end class FetchResponse - attr_accessor :title, :body, :author + attr_accessor :title, :body, :author, :url end def self.normalize_url(url) - url.downcase.sub(%r{/$}, "").sub(/\-+/, "-").strip + # downcase + # remove trailing forward slash/ + # remove consecutive hyphens + # remove leading and trailing whitespace + url.downcase.sub(%r{/\z}, "").sub(/\-+/, "-").strip end def self.imported_from_html(url) @@ -45,7 +49,7 @@ class TopicEmbed < ActiveRecord::Base url = normalize_url(url) - embed = TopicEmbed.find_by("lower(embed_url) = ?", url) + embed = topic_embed_by_url(url) content_sha1 = Digest::SHA1.hexdigest(contents) post = nil @@ -123,6 +127,18 @@ class TopicEmbed < ActiveRecord::Base uri = fd.resolve return if uri.blank? + begin + html = uri.read + rescue OpenURI::HTTPError, Net::OpenTimeout + return + end + + parse_html(html, uri.to_s) + end + + def self.parse_html(html, url) + require "ruby-readability" + opts = { tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote], attributes: %w[href src class], @@ -139,14 +155,13 @@ class TopicEmbed < ActiveRecord::Base SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present? response = FetchResponse.new - begin - html = uri.read - rescue OpenURI::HTTPError, Net::OpenTimeout - return - end raw_doc = Nokogiri.HTML5(html) - auth_element = raw_doc.at('meta[@name="author"]') + + response.url = url + + auth_element = + raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]') if auth_element.present? response.author = User.where(username_lower: auth_element[:content].strip).first end @@ -203,13 +218,15 @@ class TopicEmbed < ActiveRecord::Base response end - def self.import_remote(import_user, url, opts = nil) + def self.import_remote(url, opts = nil) opts = opts || {} response = find_remote(url) return if response.nil? response.title = opts[:title] if opts[:title].present? + import_user = opts[:user] if opts[:user].present? import_user = response.author if response.author.present? + url = normalize_url(response.url) if response.url.present? TopicEmbed.import(import_user, url, response.title, response.body) end @@ -253,11 +270,14 @@ class TopicEmbed < ActiveRecord::Base fragment.at("div").inner_html end + def self.topic_embed_by_url(embed_url) + embed_url = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "") + TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").first + end + def self.topic_id_for_embed(embed_url) - embed_url = normalize_url(embed_url).sub(%r{^https?\://}, "") - TopicEmbed.where("embed_url ~* ?", "^https?://#{Regexp.escape(embed_url)}$").pluck_first( - :topic_id, - ) + topic_embed = topic_embed_by_url(embed_url) + topic_embed&.topic_id end def self.first_paragraph_from(html) diff --git a/spec/models/topic_embed_spec.rb b/spec/models/topic_embed_spec.rb index 82ed4957e66..66e160385fe 100644 --- a/spec/models/topic_embed_spec.rb +++ b/spec/models/topic_embed_spec.rb @@ -123,6 +123,17 @@ RSpec.describe TopicEmbed do expect(imported_post.topic.category).to eq(category) end + it "does not create duplicate topics with different protocols in the embed_url" do + Jobs.run_immediately! + expect { + TopicEmbed.import(user, "http://eviltrout.com/abcd", title, "some random content") + }.to change { Topic.all.count }.by(1) + + expect { + TopicEmbed.import(user, "https://eviltrout.com/abcd", title, "some random content") + }.to_not change { Topic.all.count } + end + it "creates the topic with the tag passed as a parameter" do Jobs.run_immediately! SiteSetting.tagging_enabled = true @@ -395,21 +406,53 @@ RSpec.describe TopicEmbed do end context "with canonical links" do + fab!(:user) { Fabricate(:user) } + let(:title) { "How to turn a fish from good to evil in 30 seconds" } let(:url) { "http://eviltrout.com/123?asd" } let(:canonical_url) { "http://eviltrout.com/123" } + let(:url2) { "http://eviltrout.com/blog?post=1&canonical=false" } + let(:canonical_url2) { "http://eviltrout.com/blog?post=1" } let(:content) { "" } + let(:content2) { "" } let(:canonical_content) { "Canonical" } before do stub_request(:get, url).to_return(status: 200, body: content) stub_request(:head, canonical_url) stub_request(:get, canonical_url).to_return(status: 200, body: canonical_content) + + stub_request(:get, url2).to_return(status: 200, body: content2) + stub_request(:head, canonical_url2) + stub_request(:get, canonical_url2).to_return(status: 200, body: canonical_content) end - it "a" do + it "fetches canonical content" do response = TopicEmbed.find_remote(url) expect(response.title).to eq("Canonical") + expect(response.url).to eq(canonical_url) + end + + it "does not create duplicate topics when url differs from canonical_url" do + Jobs.run_immediately! + expect { TopicEmbed.import_remote(canonical_url, { title: title, user: user }) }.to change { + Topic.all.count + }.by(1) + + expect { TopicEmbed.import_remote(url, { title: title, user: user }) }.to_not change { + Topic.all.count + } + end + + it "does not create duplicate topics when url contains extra params" do + Jobs.run_immediately! + expect { + TopicEmbed.import_remote(canonical_url2, { title: title, user: user }) + }.to change { Topic.all.count }.by(1) + + expect { TopicEmbed.import_remote(url2, { title: title, user: user }) }.to_not change { + Topic.all.count + } end end end