From b6963b8ffb274fb191df79149be2d7d412f19d8e Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Sun, 26 Aug 2018 16:31:02 +0200 Subject: [PATCH] FIX: Ignore OneBox blacklisted domains. --- lib/final_destination.rb | 26 +++++++++++++++-------- lib/oneboxer.rb | 6 ++++-- spec/components/final_destination_spec.rb | 8 +++++++ spec/components/oneboxer_spec.rb | 9 ++++++++ 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 7e1ed92396f..b69b2e0ebd7 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -26,7 +26,7 @@ class FinalDestination "HTTPS_DOMAIN_#{domain}" end - attr_reader :status, :cookie, :status_code + attr_reader :status, :cookie, :status_code, :ignored def initialize(url, opts = nil) @url = url @@ -36,7 +36,15 @@ class FinalDestination @force_get_hosts = @opts[:force_get_hosts] || [] @opts[:max_redirects] ||= 5 @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } - @ignored = [Discourse.base_url_no_prefix] + (@opts[:ignore_redirects] || []) + + @ignored = @opts[:ignore_hostnames] || [] + [Discourse.base_url_no_prefix].concat(@opts[:ignore_redirects] || []).each do |url| + url = uri(url) + if url.present? && url.hostname + @ignored << url.hostname + end + end + @limit = @opts[:max_redirects] @status = :ready @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head @@ -131,18 +139,18 @@ class FinalDestination return nil end - @ignored.each do |host| - if hostname_matches?(host) - @status = :resolved - return @uri - end - end - unless validate_uri log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose return nil end + @ignored.each do |host| + if @uri&.hostname&.match?(host) + @status = :resolved + return @uri + end + end + headers = request_headers response = Excon.public_send(@http_verb, @uri.to_s, diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 2f99c0f8145..a0f48d43b33 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -250,9 +250,11 @@ module Oneboxer def self.external_onebox(url) Rails.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do - fd = FinalDestination.new(url, ignore_redirects: ignore_redirects, force_get_hosts: force_get_hosts) + ignored = SiteSetting.onebox_domains_blacklist.split("|") + + fd = FinalDestination.new(url, ignore_redirects: ignore_redirects, ignore_hostnames: ignored, force_get_hosts: force_get_hosts) uri = fd.resolve - return blank_onebox if uri.blank? || SiteSetting.onebox_domains_blacklist.include?(uri.hostname) + return blank_onebox if uri.blank? || ignored.map { |hostname| uri.hostname.match?(hostname) }.any? options = { cache: {}, diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb index 6a30757a7c3..78105c7f0d3 100644 --- a/spec/components/final_destination_spec.rb +++ b/spec/components/final_destination_spec.rb @@ -47,6 +47,14 @@ describe FinalDestination do FinalDestination.new(url, opts) end + it 'correctly parses ignored hostnames' do + fd = FinalDestination.new('https://meta.discourse.org', + ignore_redirects: ['http://google.com', 'youtube.com', 'https://meta.discourse.org', '://bing.com'] + ) + + expect(fd.ignored).to eq(['test.localhost', 'google.com', 'meta.discourse.org']) + end + describe '.resolve' do it "has a ready status code before anything happens" do diff --git a/spec/components/oneboxer_spec.rb b/spec/components/oneboxer_spec.rb index b5ebd305ce2..79520340ab6 100644 --- a/spec/components/oneboxer_spec.rb +++ b/spec/components/oneboxer_spec.rb @@ -107,4 +107,13 @@ describe Oneboxer do end end + it "does not crawl blacklisted URLs" do + SiteSetting.onebox_domains_blacklist = "git.*.com|bitbucket.com" + url = 'https://github.com/discourse/discourse/commit/21b562852885f883be43032e03c709241e8e6d4f' + stub_request(:head, 'https://discourse.org/').to_return(status: 302, body: "", headers: { location: url }) + + expect(Oneboxer.external_onebox(url)[:onebox]).to be_empty + expect(Oneboxer.external_onebox('https://discourse.org/')[:onebox]).to be_empty + end + end