From aac9f43038fc24acead1d1b853d93613afb59c4f Mon Sep 17 00:00:00 2001 From: Natalie Tay Date: Mon, 31 Jan 2022 15:35:12 +0800 Subject: [PATCH] Only block domains at the final destination (#15689) In an earlier PR, we decided that we only want to block a domain if the blocked domain in the SiteSetting is the final destination (/t/59305). That PR used `FinalDestination#get`. `resolve` however is used several places but blocks domains along the redirect chain when certain options are provided. This commit changes the default options for `resolve` to not do that. Existing users of `FinalDestination#resolve` are - `Oneboxer#external_onebox` - our onebox helper `fetch_html_doc`, which is used in amazon, standard embed and youtube - these folks already go through `Oneboxer#external_onebox` which already blocks correctly --- lib/final_destination.rb | 2 +- lib/inline_oneboxer.rb | 8 +-- lib/onebox/domain_checker.rb | 11 ++++ lib/oneboxer.rb | 7 +-- lib/retrieve_title.rb | 2 +- spec/components/oneboxer_spec.rb | 82 ++++++++++++++++++-------- spec/components/retrieve_title_spec.rb | 2 - spec/lib/onebox/domain_checker_spec.rb | 27 +++++++++ 8 files changed, 101 insertions(+), 40 deletions(-) create mode 100644 lib/onebox/domain_checker.rb create mode 100644 spec/lib/onebox/domain_checker_spec.rb diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 09f4d0c58eb..2fbc5711b18 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -44,9 +44,9 @@ class FinalDestination @opts[:max_redirects] ||= 5 @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } - @ignored = @opts[:ignore_hostnames] || [] @limit = @opts[:max_redirects] + @ignored = [] if @limit > 0 ignore_redirects = [Discourse.base_url_no_prefix] diff --git a/lib/inline_oneboxer.rb b/lib/inline_oneboxer.rb index 2e2dcf2c60f..2ce9b34a329 100644 --- a/lib/inline_oneboxer.rb +++ b/lib/inline_oneboxer.rb @@ -61,7 +61,7 @@ class InlineOneboxer if uri.present? && uri.hostname.present? && (always_allow || allowed_domains.include?(uri.hostname)) && - !domain_is_blocked?(uri.hostname) + !Onebox::DomainChecker.is_blocked?(uri.hostname) title = RetrieveTitle.crawl(url) title = nil if title && title.length < MIN_TITLE_LENGTH return onebox_for(url, title, opts) @@ -73,12 +73,6 @@ class InlineOneboxer private - def self.domain_is_blocked?(hostname) - SiteSetting.blocked_onebox_domains&.split('|').any? do |blocked| - hostname == blocked || hostname.end_with?(".#{blocked}") - end - end - def self.onebox_for(url, title, opts) title = title && Emoji.gsub_emoji_to_unicode(title) if title && opts[:post_number] diff --git a/lib/onebox/domain_checker.rb b/lib/onebox/domain_checker.rb new file mode 100644 index 00000000000..1dd810491e2 --- /dev/null +++ b/lib/onebox/domain_checker.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module Onebox + class DomainChecker + def self.is_blocked?(hostname) + SiteSetting.blocked_onebox_domains&.split('|').any? do |blocked| + hostname == blocked || hostname.end_with?(".#{blocked}") + end + end + end +end diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 61393627af6..132f30d6b25 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -379,10 +379,6 @@ module Oneboxer end end - def self.blocked_domains - SiteSetting.blocked_onebox_domains.split("|") - end - def self.preserve_fragment_url_hosts @preserve_fragment_url_hosts ||= ['http://github.com'] end @@ -420,7 +416,7 @@ module Oneboxer return error_box end - return blank_onebox if uri.blank? || blocked_domains.any? { |hostname| uri.hostname.match?(hostname) } + return blank_onebox if uri.blank? || Onebox::DomainChecker.is_blocked?(uri.hostname) onebox_options = { max_width: 695, @@ -538,7 +534,6 @@ module Oneboxer def self.get_final_destination_options(url, strategy = nil) fd_options = { ignore_redirects: ignore_redirects, - ignore_hostnames: blocked_domains, force_get_hosts: force_get_hosts, force_custom_user_agent_hosts: force_custom_user_agent_hosts, preserve_fragment_url_hosts: preserve_fragment_url_hosts, diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 2fd361ca902..78a518e738b 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -60,7 +60,7 @@ module RetrieveTitle encoding = nil fd.get do |_response, chunk, uri| - if (uri.present? && InlineOneboxer.domain_is_blocked?(uri.hostname)) + if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname)) throw :done end diff --git a/spec/components/oneboxer_spec.rb b/spec/components/oneboxer_spec.rb index 40ce9e49b82..562c06b3115 100644 --- a/spec/components/oneboxer_spec.rb +++ b/spec/components/oneboxer_spec.rb @@ -160,28 +160,54 @@ describe Oneboxer do end end - it "does not crawl blocklisted URLs" do - SiteSetting.blocked_onebox_domains = "git.*.com|bitbucket.com" - url = 'https://github.com/discourse/discourse/commit/21b562852885f883be43032e03c709241e8e6d4f' - stub_request(:head, 'https://discourse.org/').to_return(status: 302, body: "", headers: { location: url }) + context ".external_onebox" do + html = <<~HTML + + + + + + +

body

+ + + HTML - expect(Oneboxer.external_onebox(url)[:onebox]).to be_empty - expect(Oneboxer.external_onebox('https://discourse.org/')[:onebox]).to be_empty - end + context "blacklisted domains" do - it "does not consider ignore_redirects domains as blocklisted" do - url = 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/' - stub_request(:head, url).to_return(status: 200, body: "", headers: {}) - stub_request(:get, url).to_return(status: 200, body: "", headers: {}) + it "does not return a onebox if redirect uri final destination is in blacklist" do + SiteSetting.blocked_onebox_domains = "kitten.com" - expect(Oneboxer.external_onebox(url)[:onebox]).to be_present - end + stub_request(:get, "http://cat.com/meow").to_return(status: 301, body: "", headers: { "location" => "https://kitten.com" }) + stub_request(:head, "http://cat.com/meow").to_return(status: 301, body: "", headers: { "location" => "https://kitten.com" }) - it "censors external oneboxes" do - Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word") + stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {}) + stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {}) - url = 'https://example.com/' - stub_request(:any, url).to_return(status: 200, body: <<~HTML, headers: {}) + expect(Oneboxer.external_onebox("http://cat.com/meow")[:onebox]).to be_empty + expect(Oneboxer.external_onebox("https://kitten.com")[:onebox]).to be_empty + end + + it "returns onebox if 'midway redirect' is blocked but final redirect uri is not blocked" do + SiteSetting.blocked_onebox_domains = "middle.com" + + stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" }) + stub_request(:head, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" }) + + stub_request(:head, "https://middle.com/midway").to_return(status: 301, body: "b", headers: { "location" => "https://cat.com/end" }) + + stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html) + stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {}) + + expect(Oneboxer.external_onebox("https://cat.com/start")[:onebox]).to be_present + end + end + + it "censors external oneboxes" do + Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word") + + url = 'https://example.com/' + stub_request(:any, url).to_return(status: 200, body: <<~HTML, headers: {}) @@ -191,13 +217,23 @@ describe Oneboxer do

content with bad word

- HTML + HTML - onebox = Oneboxer.external_onebox(url) - expect(onebox[:onebox]).to include('title with') - expect(onebox[:onebox]).not_to include('bad word') - expect(onebox[:preview]).to include('title with') - expect(onebox[:preview]).not_to include('bad word') + onebox = Oneboxer.external_onebox(url) + expect(onebox[:onebox]).to include('title with') + expect(onebox[:onebox]).not_to include('bad word') + expect(onebox[:preview]).to include('title with') + expect(onebox[:preview]).not_to include('bad word') + end + + it "returns onebox" do + SiteSetting.blocked_onebox_domains = "not.me" + + stub_request(:get, "https://its.me").to_return(status: 200, body: html) + stub_request(:head, "https://its.me").to_return(status: 200, body: "", headers: {}) + + expect(Oneboxer.external_onebox("https://its.me")[:onebox]).to be_present + end end it "uses the Onebox custom user agent on specified hosts" do diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb index 6b16a9174d3..563a1da77ad 100644 --- a/spec/components/retrieve_title_spec.rb +++ b/spec/components/retrieve_title_spec.rb @@ -110,7 +110,6 @@ describe RetrieveTitle do stub_request(:get, "https://wikipedia.com/amazing") .to_return(status: 200, body: "very amazing", headers: {}) - IPSocket.stubs(:getaddress).returns('100.2.3.4') expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil) end @@ -126,7 +125,6 @@ describe RetrieveTitle do stub_request(:get, "https://cat.com/meow") .to_return(status: 200, body: "very amazing", headers: {}) - IPSocket.stubs(:getaddress).returns('100.2.3.4') expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing") end end diff --git a/spec/lib/onebox/domain_checker_spec.rb b/spec/lib/onebox/domain_checker_spec.rb new file mode 100644 index 00000000000..0132babc746 --- /dev/null +++ b/spec/lib/onebox/domain_checker_spec.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +require "rails_helper" + +describe Onebox::DomainChecker do + describe '.is_blocked?' do + before do + SiteSetting.blocked_onebox_domains = "api.cat.org|kitten.cloud" + end + + describe "returns true when entirely matched" do + it { expect(described_class.is_blocked?("api.cat.org")).to be(true) } + it { expect(described_class.is_blocked?("kitten.cloud")).to be(true) } + it { expect(described_class.is_blocked?("api.dog.org")).to be(false) } + it { expect(described_class.is_blocked?("puppy.cloud")).to be(false) } + end + + describe "returns true when ends with ." do + it { expect(described_class.is_blocked?("dev.api.cat.org")).to be(true) } + it { expect(described_class.is_blocked?(".api.cat.org")).to be(true) } + it { expect(described_class.is_blocked?("dev.kitten.cloud")).to be(true) } + it { expect(described_class.is_blocked?(".kitten.cloud")).to be(true) } + it { expect(described_class.is_blocked?("xapi.cat.org")).to be(false) } + it { expect(described_class.is_blocked?("xkitten.cloud")).to be(false) } + end + end +end