diff --git a/lib/final_destination.rb b/lib/final_destination.rb index a8175135d39..dae0cea24bb 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -1,11 +1,30 @@ -require "socket" -require "ipaddr" +require 'socket' +require 'ipaddr' require 'excon' require 'rate_limiter' # Determine the final endpoint for a Web URI, following redirects class FinalDestination + def self.clear_https_cache!(domain) + key = redis_https_key(domain) + $redis.without_namespace.del(key) + end + + def self.cache_https_domain(domain) + key = redis_https_key(domain) + $redis.without_namespace.setex(key, "1", 1.day.to_i).present? + end + + def self.is_https_domain?(domain) + key = redis_https_key(domain) + $redis.without_namespace.get(key).present? + end + + def self.redis_https_key(domain) + "HTTPS_DOMAIN_#{domain}" + end + attr_reader :status, :cookie, :status_code def initialize(url, opts = nil) @@ -31,6 +50,7 @@ class FinalDestination @status = :ready @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head @cookie = nil + @limited_ips = [] end def self.connection_timeout @@ -66,6 +86,11 @@ class FinalDestination end def resolve + if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname) + @uri.scheme = "https" + @uri = URI(@uri.to_s) + end + if @limit < 0 @status = :too_many_redirects return nil @@ -132,9 +157,17 @@ class FinalDestination end if location + old_port = @uri.port + location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = URI(location) rescue nil @limit -= 1 + + # https redirect, so just cache that whole new domain is https + if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri) + FinalDestination.cache_https_domain(@uri.hostname) + end + return resolve end @@ -191,8 +224,9 @@ class FinalDestination end # Rate limit how often this IP can be crawled - unless @opts[:skip_rate_limit] - RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed! + if !@opts[:skip_rate_limit] && !@limited_ips.include?(address) + @limited_ips << address + RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed! end true diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb index 5248299e72b..6599f38d0d8 100644 --- a/spec/components/final_destination_spec.rb +++ b/spec/components/final_destination_spec.rb @@ -20,6 +20,7 @@ describe FinalDestination do when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf' when 'ignore-me.com' then '53.84.143.152' when 'force.get.com' then '22.102.29.40' + when 'wikipedia.com' then '1.2.3.4' else as_ip = IPAddr.new(host) rescue nil raise "couldn't lookup #{host}" if as_ip.nil? @@ -308,6 +309,27 @@ describe FinalDestination do end end + describe "https cache" do + it 'will cache https lookups' do + + FinalDestination.clear_https_cache!("wikipedia.com") + + stub_request(:head, "http://wikipedia.com/image.png") + .to_return(status: 302, body: "", headers: { location: 'https://wikipedia.com/image.png' }) + stub_request(:head, "https://wikipedia.com/image.png") + .to_return(status: 200, body: "", headers: []) + stub_request(:get, "https://wikipedia.com/image.png").to_return(status: 200, body: "", headers: {}) + + fd('http://wikipedia.com/image.png').resolve + + stub_request(:head, "https://wikipedia.com/image2.png") + .to_return(status: 200, body: "", headers: []) + stub_request(:get, "https://wikipedia.com/image2.png").to_return(status: 200, body: "", headers: {}) + + fd('http://wikipedia.com/image2.png').resolve + end + end + describe "#escape_url" do it "correctly escapes url" do fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"