From 8185b8cb06b1bec438c0cdf98dd495b87e0d3fcf Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Tue, 17 Oct 2017 16:22:38 +1100 Subject: [PATCH] FEATURE: cache https redirects per hostname If a hostname does an https redirect we cache that so next lookup does not incur it. Also, only rate limit per ip once per final destination Raise final destination protection to 1000 ip lookups an hour --- lib/final_destination.rb | 42 ++++++++++++++++++++--- spec/components/final_destination_spec.rb | 22 ++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/lib/final_destination.rb b/lib/final_destination.rb index a8175135d39..dae0cea24bb 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -1,11 +1,30 @@ -require "socket" -require "ipaddr" +require 'socket' +require 'ipaddr' require 'excon' require 'rate_limiter' # Determine the final endpoint for a Web URI, following redirects class FinalDestination + def self.clear_https_cache!(domain) + key = redis_https_key(domain) + $redis.without_namespace.del(key) + end + + def self.cache_https_domain(domain) + key = redis_https_key(domain) + $redis.without_namespace.setex(key, "1", 1.day.to_i).present? + end + + def self.is_https_domain?(domain) + key = redis_https_key(domain) + $redis.without_namespace.get(key).present? + end + + def self.redis_https_key(domain) + "HTTPS_DOMAIN_#{domain}" + end + attr_reader :status, :cookie, :status_code def initialize(url, opts = nil) @@ -31,6 +50,7 @@ class FinalDestination @status = :ready @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head @cookie = nil + @limited_ips = [] end def self.connection_timeout @@ -66,6 +86,11 @@ class FinalDestination end def resolve + if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname) + @uri.scheme = "https" + @uri = URI(@uri.to_s) + end + if @limit < 0 @status = :too_many_redirects return nil @@ -132,9 +157,17 @@ class FinalDestination end if location + old_port = @uri.port + location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = URI(location) rescue nil @limit -= 1 + + # https redirect, so just cache that whole new domain is https + if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri) + FinalDestination.cache_https_domain(@uri.hostname) + end + return resolve end @@ -191,8 +224,9 @@ class FinalDestination end # Rate limit how often this IP can be crawled - unless @opts[:skip_rate_limit] - RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed! + if !@opts[:skip_rate_limit] && !@limited_ips.include?(address) + @limited_ips << address + RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed! end true diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb index 5248299e72b..6599f38d0d8 100644 --- a/spec/components/final_destination_spec.rb +++ b/spec/components/final_destination_spec.rb @@ -20,6 +20,7 @@ describe FinalDestination do when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf' when 'ignore-me.com' then '53.84.143.152' when 'force.get.com' then '22.102.29.40' + when 'wikipedia.com' then '1.2.3.4' else as_ip = IPAddr.new(host) rescue nil raise "couldn't lookup #{host}" if as_ip.nil? @@ -308,6 +309,27 @@ describe FinalDestination do end end + describe "https cache" do + it 'will cache https lookups' do + + FinalDestination.clear_https_cache!("wikipedia.com") + + stub_request(:head, "http://wikipedia.com/image.png") + .to_return(status: 302, body: "", headers: { location: 'https://wikipedia.com/image.png' }) + stub_request(:head, "https://wikipedia.com/image.png") + .to_return(status: 200, body: "", headers: []) + stub_request(:get, "https://wikipedia.com/image.png").to_return(status: 200, body: "", headers: {}) + + fd('http://wikipedia.com/image.png').resolve + + stub_request(:head, "https://wikipedia.com/image2.png") + .to_return(status: 200, body: "", headers: []) + stub_request(:get, "https://wikipedia.com/image2.png").to_return(status: 200, body: "", headers: {}) + + fd('http://wikipedia.com/image2.png').resolve + end + end + describe "#escape_url" do it "correctly escapes url" do fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"