FIX: Apply onebox blocked domain checks on every redirect (#16150)
The `blocked onebox domains` setting lets site owners change what sites are allowed to be oneboxed. When a link is entered into a post, Discourse checks the domain of the link against that setting and blocks the onebox if the domain is blocked. But if there's a chain of redirects, then only the final destination website is checked against the site setting. This commit amends that behavior so that every website in the redirect chain is checked against the site setting, and if anything is blocked the original link doesn't onebox at all in the post. The `Discourse-No-Onebox` header is also checked in every response and the onebox is blocked if the header is set to "1". Additionally, Discourse will now include the `Discourse-No-Onebox` header with every response if the site requires login to access content. This is done to signal to a Discourse instance that it shouldn't attempt to onebox other Discourse instances if they're login-only. Non-Discourse websites can also use include that header if they don't wish to have Discourse onebox their content. Internal ticket: t59305.
This commit is contained in:
parent
8e010aecfb
commit
b0656f3ed0
|
@ -90,6 +90,9 @@ class ApplicationController < ActionController::Base
|
|||
response.cache_control[:no_cache] = true
|
||||
response.cache_control[:extras] = ["no-store"]
|
||||
end
|
||||
if SiteSetting.login_required
|
||||
response.headers['Discourse-No-Onebox'] = '1'
|
||||
end
|
||||
end
|
||||
|
||||
def conditionally_allow_site_embedding
|
||||
|
|
|
@ -74,6 +74,7 @@ class FinalDestination
|
|||
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
|
||||
@validate_uri = @opts.fetch(:validate_uri) { true }
|
||||
@user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
|
||||
@stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
|
||||
end
|
||||
|
||||
def self.connection_timeout
|
||||
|
@ -140,9 +141,12 @@ class FinalDestination
|
|||
uri = URI(uri.to_s)
|
||||
end
|
||||
|
||||
return nil unless validate_uri
|
||||
return if !validate_uri
|
||||
return if @stop_at_blocked_pages && blocked_domain?(uri)
|
||||
|
||||
result, (location, cookie) = safe_get(uri, &blk)
|
||||
result, headers_subset = safe_get(uri, &blk)
|
||||
cookie = headers_subset.set_cookie
|
||||
location = headers_subset.location
|
||||
|
||||
if result == :redirect && (redirects == 0 || !location)
|
||||
return nil
|
||||
|
@ -222,6 +226,13 @@ class FinalDestination
|
|||
response_block: request_validator
|
||||
)
|
||||
|
||||
if @stop_at_blocked_pages
|
||||
if blocked_domain?(@uri) || response.headers['Discourse-No-Onebox'] == "1"
|
||||
@status = :blocked_page
|
||||
return
|
||||
end
|
||||
end
|
||||
|
||||
location = nil
|
||||
response_headers = nil
|
||||
response_status = response.status.to_i
|
||||
|
@ -253,6 +264,18 @@ class FinalDestination
|
|||
when 103, 400, 405, 406, 409, 500, 501
|
||||
response_status, small_headers = small_get(request_headers)
|
||||
|
||||
if @stop_at_blocked_pages
|
||||
# this may seem weird, but the #to_hash method of the response object
|
||||
# of ruby's net/http lib returns a hash where each value is an array.
|
||||
# small_headers here is like that so our no onebox header value is an
|
||||
# array if it's set. Also the hash keys are always lower-cased.
|
||||
dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
|
||||
if dont_onebox || blocked_domain?(@uri)
|
||||
@status = :blocked_page
|
||||
return
|
||||
end
|
||||
end
|
||||
|
||||
if response_status == 200
|
||||
@status = :resolved
|
||||
return @uri
|
||||
|
@ -425,6 +448,7 @@ class FinalDestination
|
|||
def safe_get(uri)
|
||||
result = nil
|
||||
unsafe_close = false
|
||||
headers_subset = Struct.new(:location, :set_cookie).new
|
||||
|
||||
safe_session(uri) do |http|
|
||||
headers = request_headers.merge(
|
||||
|
@ -435,8 +459,19 @@ class FinalDestination
|
|||
req = Net::HTTP::Get.new(uri.request_uri, headers)
|
||||
|
||||
http.request(req) do |resp|
|
||||
headers_subset.set_cookie = resp['Set-Cookie']
|
||||
|
||||
if @stop_at_blocked_pages
|
||||
dont_onebox = resp["Discourse-No-Onebox"] == "1"
|
||||
if dont_onebox
|
||||
result = :blocked, headers_subset
|
||||
next
|
||||
end
|
||||
end
|
||||
|
||||
if Net::HTTPRedirection === resp
|
||||
result = :redirect, [resp['location'], resp['Set-Cookie']]
|
||||
headers_subset.location = resp['location']
|
||||
result = :redirect, headers_subset
|
||||
end
|
||||
|
||||
if Net::HTTPSuccess === resp
|
||||
|
@ -460,7 +495,7 @@ class FinalDestination
|
|||
raise StandardError
|
||||
end
|
||||
end
|
||||
result = :ok
|
||||
result = :ok, headers_subset
|
||||
else
|
||||
catch(:done) do
|
||||
yield resp, nil, nil
|
||||
|
@ -471,7 +506,7 @@ class FinalDestination
|
|||
|
||||
result
|
||||
rescue StandardError
|
||||
unsafe_close ? :ok : raise
|
||||
unsafe_close ? [:ok, headers_subset] : raise
|
||||
end
|
||||
|
||||
def safe_session(uri)
|
||||
|
@ -505,4 +540,8 @@ class FinalDestination
|
|||
|
||||
uri(complete_url)
|
||||
end
|
||||
|
||||
def blocked_domain?(uri)
|
||||
Onebox::DomainChecker.is_blocked?(uri.hostname)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -397,9 +397,14 @@ module Oneboxer
|
|||
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
|
||||
strategy = available_strategies.shift
|
||||
|
||||
fd = FinalDestination.new(url, get_final_destination_options(url, strategy))
|
||||
fd = FinalDestination.new(
|
||||
url,
|
||||
get_final_destination_options(url, strategy).merge(stop_at_blocked_pages: true)
|
||||
)
|
||||
uri = fd.resolve
|
||||
|
||||
return blank_onebox if fd.status == :blocked_page
|
||||
|
||||
if fd.status != :resolved
|
||||
args = { link: url }
|
||||
if fd.status == :invalid_address
|
||||
|
@ -416,7 +421,7 @@ module Oneboxer
|
|||
return error_box
|
||||
end
|
||||
|
||||
return blank_onebox if uri.blank? || Onebox::DomainChecker.is_blocked?(uri.hostname)
|
||||
return blank_onebox if uri.blank?
|
||||
|
||||
onebox_options = {
|
||||
max_width: 695,
|
||||
|
|
|
@ -5,8 +5,9 @@ module RetrieveTitle
|
|||
|
||||
def self.crawl(url)
|
||||
fetch_title(url)
|
||||
rescue Exception
|
||||
# If there was a connection error, do nothing
|
||||
rescue Exception => ex
|
||||
raise if Rails.env.test?
|
||||
Rails.logger.error(ex)
|
||||
end
|
||||
|
||||
def self.extract_title(html, encoding = nil)
|
||||
|
@ -52,17 +53,13 @@ module RetrieveTitle
|
|||
|
||||
# Fetch the beginning of a HTML document at a url
|
||||
def self.fetch_title(url)
|
||||
fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)
|
||||
fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true)
|
||||
|
||||
current = nil
|
||||
title = nil
|
||||
encoding = nil
|
||||
|
||||
fd.get do |_response, chunk, uri|
|
||||
if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname))
|
||||
throw :done
|
||||
end
|
||||
|
||||
unless Net::HTTPRedirection === _response
|
||||
if current
|
||||
current << chunk
|
||||
|
|
|
@ -29,34 +29,19 @@ describe InlineOneboxer do
|
|||
|
||||
it "puts an entry in the cache" do
|
||||
SiteSetting.enable_inline_onebox_on_all_domains = true
|
||||
url = "https://example.com/random-url"
|
||||
url = "https://example.com/good-url"
|
||||
stub_request(:get, url).to_return(status: 200, body: "<html><head><title>a blog</title></head></html>")
|
||||
|
||||
InlineOneboxer.invalidate(url)
|
||||
expect(InlineOneboxer.cache_lookup(url)).to be_blank
|
||||
|
||||
result = InlineOneboxer.lookup(url)
|
||||
expect(result).to be_present
|
||||
expect(result[:title]).to be_present
|
||||
|
||||
cached = InlineOneboxer.cache_lookup(url)
|
||||
expect(cached[:url]).to eq(url)
|
||||
expect(cached[:title]).to eq('a blog')
|
||||
end
|
||||
|
||||
it "puts an entry in the cache for failed onebox" do
|
||||
SiteSetting.enable_inline_onebox_on_all_domains = true
|
||||
url = "https://example.com/random-url"
|
||||
|
||||
InlineOneboxer.invalidate(url)
|
||||
expect(InlineOneboxer.cache_lookup(url)).to be_blank
|
||||
|
||||
result = InlineOneboxer.lookup(url)
|
||||
expect(result).to be_present
|
||||
|
||||
cached = InlineOneboxer.cache_lookup(url)
|
||||
expect(cached[:url]).to eq(url)
|
||||
expect(cached[:title]).to be_nil
|
||||
end
|
||||
end
|
||||
|
||||
context ".lookup" do
|
||||
|
@ -199,6 +184,11 @@ describe InlineOneboxer do
|
|||
describe "lookups for blocked domains in the hostname" do
|
||||
shared_examples "blocks the domain" do |setting, domain_to_test|
|
||||
it "does not retrieve title" do
|
||||
stub_request(:get, domain_to_test)
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><head><title>hello world</title></head></html>",
|
||||
)
|
||||
SiteSetting.blocked_onebox_domains = setting
|
||||
|
||||
onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true)
|
||||
|
@ -209,11 +199,16 @@ describe InlineOneboxer do
|
|||
|
||||
shared_examples "does not fulfil blocked domain" do |setting, domain_to_test|
|
||||
it "retrieves title" do
|
||||
stub_request(:get, domain_to_test)
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><head><title>hello world</title></head></html>",
|
||||
)
|
||||
SiteSetting.blocked_onebox_domains = setting
|
||||
|
||||
onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true)
|
||||
|
||||
expect(onebox).to be_present
|
||||
expect(onebox[:title]).to be_present
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -228,6 +223,47 @@ describe InlineOneboxer do
|
|||
include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.2kitten.cloud"
|
||||
include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.kitten.cloud9"
|
||||
include_examples "does not fulfil blocked domain", "api.cat.org", "https://api-cat.org"
|
||||
|
||||
it "doesn't retrieve title if a blocked domain is encountered anywhere in the redirect chain" do
|
||||
SiteSetting.blocked_onebox_domains = "redirect.com"
|
||||
stub_request(:get, "https://mainwebsite.com/blah")
|
||||
.to_return(status: 301, body: "", headers: { "location" => "https://redirect.com/blah" })
|
||||
stub_request(:get, "https://redirect.com/blah")
|
||||
.to_return(status: 301, body: "", headers: { "location" => "https://finalwebsite.com/blah" })
|
||||
stub_request(:get, "https://finalwebsite.com/blah")
|
||||
.to_return(status: 200, body: "<html><head><title>hello world</title></head></html>")
|
||||
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true)
|
||||
|
||||
expect(onebox[:title]).to be_blank
|
||||
end
|
||||
|
||||
it "doesn't retrieve title if the Discourse-No-Onebox header == 1" do
|
||||
stub_request(:get, "https://mainwebsite.com/blah")
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><head><title>hello world</title></head></html>",
|
||||
headers: { "Discourse-No-Onebox" => "1" }
|
||||
)
|
||||
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true)
|
||||
expect(onebox[:title]).to be_blank
|
||||
|
||||
stub_request(:get, "https://mainwebsite.com/blah/2")
|
||||
.to_return(
|
||||
status: 301,
|
||||
body: "",
|
||||
headers: { "location" => "https://mainwebsite.com/blah/2/redirect" }
|
||||
)
|
||||
stub_request(:get, "https://mainwebsite.com/blah/2/redirect")
|
||||
.to_return(
|
||||
status: 301,
|
||||
body: "",
|
||||
headers: { "Discourse-No-Onebox" => "1", "location" => "https://somethingdoesnotmatter.com" }
|
||||
)
|
||||
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2", skip_cache: true)
|
||||
expect(onebox[:title]).to be_blank
|
||||
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2/redirect", skip_cache: true)
|
||||
expect(onebox[:title]).to be_blank
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -182,11 +182,16 @@ describe Oneboxer do
|
|||
stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {})
|
||||
stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {})
|
||||
|
||||
expect(Oneboxer.external_onebox("http://cat.com/meow")[:onebox]).to be_empty
|
||||
expect(Oneboxer.external_onebox("https://kitten.com")[:onebox]).to be_empty
|
||||
result = Oneboxer.external_onebox("http://cat.com/meow")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
|
||||
result = Oneboxer.external_onebox("http://kitten.com")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
end
|
||||
|
||||
it "returns onebox if 'midway redirect' is blocked but final redirect uri is not blocked" do
|
||||
it "does not return onebox if anything in the redirect chain is blocked" do
|
||||
SiteSetting.blocked_onebox_domains = "middle.com"
|
||||
|
||||
stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" })
|
||||
|
@ -197,7 +202,44 @@ describe Oneboxer do
|
|||
stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html)
|
||||
stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {})
|
||||
|
||||
expect(Oneboxer.external_onebox("https://cat.com/start")[:onebox]).to be_present
|
||||
result = Oneboxer.external_onebox("https://cat.com/start")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
end
|
||||
|
||||
it "does not return onebox if the Discourse-No-Onebox header == 1" do
|
||||
stub_request(:get, "https://website.com/discourse-no-onebox")
|
||||
.to_return(status: 200, body: "abc", headers: { "Discourse-No-Onebox" => "1" })
|
||||
stub_request(:head, "https://website.com/discourse-no-onebox")
|
||||
.to_return(status: 200, body: "", headers: { "Discourse-No-Onebox" => "1" })
|
||||
|
||||
result = Oneboxer.external_onebox("https://website.com/discourse-no-onebox")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
end
|
||||
|
||||
it "does not return onebox if the Discourse-No-Onebox header == 1 anywhere in the redirect chain" do
|
||||
stub_request(:get, "https://website.com/redirect-no-onebox")
|
||||
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" })
|
||||
stub_request(:head, "https://website.com/redirect-no-onebox")
|
||||
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" })
|
||||
|
||||
result = Oneboxer.external_onebox("https://website.com/redirect-no-onebox")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
|
||||
stub_request(:get, "https://website.com/redirect")
|
||||
.to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" })
|
||||
stub_request(:head, "https://website.com/redirect")
|
||||
.to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" })
|
||||
stub_request(:get, "https://website.com/redirect/dont-onebox")
|
||||
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" })
|
||||
stub_request(:head, "https://website.com/redirect/dont-onebox")
|
||||
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" })
|
||||
|
||||
result = Oneboxer.external_onebox("https://website.com/redirect")
|
||||
expect(result[:onebox]).to be_empty
|
||||
expect(result[:preview]).to be_empty
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -111,7 +111,7 @@ describe RetrieveTitle do
|
|||
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil)
|
||||
end
|
||||
|
||||
it "returns title if 'midway redirect' is blocked but final redirect uri is not blocked" do
|
||||
it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
|
||||
SiteSetting.blocked_onebox_domains = "wikipedia.com"
|
||||
|
||||
stub_request(:get, "http://foobar.com/amazing")
|
||||
|
@ -123,7 +123,18 @@ describe RetrieveTitle do
|
|||
stub_request(:get, "https://cat.com/meow")
|
||||
.to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
|
||||
|
||||
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
|
||||
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
|
||||
end
|
||||
|
||||
it "doesn't return title if the Discourse-No-Onebox header == 1" do
|
||||
stub_request(:get, "https://cat.com/meow/no-onebox")
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><title>discourse stay away</title>",
|
||||
headers: { "Discourse-No-Onebox" => "1" }
|
||||
)
|
||||
|
||||
expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue