FEATURE: cache https redirects per hostname

If a hostname does an https redirect, we cache that fact so the next
lookup does not incur the redirect.

Also, only rate limit per ip once per final destination

Raise final destination protection to 1000 ip lookups an hour
This commit is contained in:
Sam Saffron 2017-10-17 16:22:38 +11:00
parent 646c6eb7cd
commit 8185b8cb06
2 changed files with 60 additions and 4 deletions

View File

@ -1,11 +1,30 @@
require "socket" require 'socket'
require "ipaddr" require 'ipaddr'
require 'excon' require 'excon'
require 'rate_limiter' require 'rate_limiter'
# Determine the final endpoint for a Web URI, following redirects # Determine the final endpoint for a Web URI, following redirects
class FinalDestination class FinalDestination
# Forget the cached "this domain redirects to https" marker for +domain+.
# Operates outside the redis namespace, matching how the key is written.
def self.clear_https_cache!(domain)
  $redis.without_namespace.del(redis_https_key(domain))
end
# Remember that +domain+ redirects http -> https so future lookups can go
# straight to https. The marker expires after one day.
# Returns true when the key was written.
def self.cache_https_domain(domain)
  key = redis_https_key(domain)
  # setex signature is (key, ttl_seconds, value); the original call had the
  # ttl and value swapped, producing a "1"-second TTL with value 86400
  # instead of a one-day TTL with value "1".
  $redis.without_namespace.setex(key, 1.day.to_i, "1").present?
end
# True when +domain+ has previously been observed redirecting to https
# (i.e. a cache marker exists in redis).
def self.is_https_domain?(domain)
  $redis.without_namespace.get(redis_https_key(domain)).present?
end
# Un-namespaced redis key under which the https marker for +domain+ lives.
def self.redis_https_key(domain)
  "HTTPS_DOMAIN_" + domain.to_s
end
attr_reader :status, :cookie, :status_code attr_reader :status, :cookie, :status_code
def initialize(url, opts = nil) def initialize(url, opts = nil)
@ -31,6 +50,7 @@ class FinalDestination
@status = :ready @status = :ready
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
@cookie = nil @cookie = nil
@limited_ips = []
end end
def self.connection_timeout def self.connection_timeout
@ -66,6 +86,11 @@ class FinalDestination
end end
def resolve def resolve
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
@uri.scheme = "https"
@uri = URI(@uri.to_s)
end
if @limit < 0 if @limit < 0
@status = :too_many_redirects @status = :too_many_redirects
return nil return nil
@ -132,9 +157,17 @@ class FinalDestination
end end
if location if location
old_port = @uri.port
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
@uri = URI(location) rescue nil @uri = URI(location) rescue nil
@limit -= 1 @limit -= 1
# https redirect, so just cache that whole new domain is https
if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
FinalDestination.cache_https_domain(@uri.hostname)
end
return resolve return resolve
end end
@ -191,8 +224,9 @@ class FinalDestination
end end
# Rate limit how often this IP can be crawled # Rate limit how often this IP can be crawled
unless @opts[:skip_rate_limit] if !@opts[:skip_rate_limit] && !@limited_ips.include?(address)
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed! @limited_ips << address
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed!
end end
true true

View File

@ -20,6 +20,7 @@ describe FinalDestination do
when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf' when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
when 'ignore-me.com' then '53.84.143.152' when 'ignore-me.com' then '53.84.143.152'
when 'force.get.com' then '22.102.29.40' when 'force.get.com' then '22.102.29.40'
when 'wikipedia.com' then '1.2.3.4'
else else
as_ip = IPAddr.new(host) rescue nil as_ip = IPAddr.new(host) rescue nil
raise "couldn't lookup #{host}" if as_ip.nil? raise "couldn't lookup #{host}" if as_ip.nil?
@ -308,6 +309,27 @@ describe FinalDestination do
end end
end end
describe "https cache" do
  it 'will cache https lookups' do
    # Start from a clean slate so a marker left by an earlier run cannot
    # make this test pass vacuously.
    FinalDestination.clear_https_cache!("wikipedia.com")

    # First lookup: http 302-redirects to https, which then returns 200.
    # Resolving this should populate the https cache for wikipedia.com.
    stub_request(:head, "http://wikipedia.com/image.png")
      .to_return(status: 302, body: "", headers: { location: 'https://wikipedia.com/image.png' })

    stub_request(:head, "https://wikipedia.com/image.png")
      .to_return(status: 200, body: "", headers: [])

    stub_request(:get, "https://wikipedia.com/image.png").to_return(status: 200, body: "", headers: {})

    fd('http://wikipedia.com/image.png').resolve

    # Second lookup: only the https endpoints are stubbed. If the cache were
    # not used, resolve would hit the unstubbed http URL and WebMock would
    # raise. NOTE(review): the assertion is implicit in the missing http
    # stub — TODO: consider asserting the resolved URI scheme explicitly.
    stub_request(:head, "https://wikipedia.com/image2.png")
      .to_return(status: 200, body: "", headers: [])

    stub_request(:get, "https://wikipedia.com/image2.png").to_return(status: 200, body: "", headers: {})

    fd('http://wikipedia.com/image2.png').resolve
  end
end
describe "#escape_url" do describe "#escape_url" do
it "correctly escapes url" do it "correctly escapes url" do
fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments" fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"