FEATURE: cache https redirects per hostname
If a hostname issues an https redirect, we cache that fact so the next lookup does not incur the redirect. Also, only rate limit per IP once per final destination, and raise final destination protection to 1000 IP lookups an hour.
This commit is contained in:
parent
646c6eb7cd
commit
8185b8cb06
|
@ -1,11 +1,30 @@
|
||||||
require "socket"
|
require 'socket'
|
||||||
require "ipaddr"
|
require 'ipaddr'
|
||||||
require 'excon'
|
require 'excon'
|
||||||
require 'rate_limiter'
|
require 'rate_limiter'
|
||||||
|
|
||||||
# Determine the final endpoint for a Web URI, following redirects
|
# Determine the final endpoint for a Web URI, following redirects
|
||||||
class FinalDestination
|
class FinalDestination
|
||||||
|
|
||||||
|
# Forgets that +domain+ was recorded as redirecting to https.
#
# Deletes the entry named by redis_https_key, bypassing the redis namespace.
def self.clear_https_cache!(domain)
  $redis.without_namespace.del(redis_https_key(domain))
end
|
||||||
|
|
||||||
|
# Records that +domain+ redirects plain http to https, so that later lookups
# for the same hostname can skip the redirect.
#
# The marker is written outside the redis namespace and expires after one day.
#
# Returns whether the redis reply is present.
def self.cache_https_domain(domain)
  key = redis_https_key(domain)
  # SETEX takes (key, ttl, value). With ttl and value swapped the marker
  # would be stored with a one second expiry instead of one day.
  $redis.without_namespace.setex(key, 1.day.to_i, "1").present?
end
|
||||||
|
|
||||||
|
# Tells whether an https marker is currently stored in redis for +domain+.
#
# Reads the entry named by redis_https_key, bypassing the redis namespace.
def self.is_https_domain?(domain)
  cached = $redis.without_namespace.get(redis_https_key(domain))
  cached.present?
end
|
||||||
|
|
||||||
|
# Builds the redis key under which the https marker for +domain+ is stored.
#
# Hostnames are case-insensitive while redis keys are not, so the domain is
# lower-cased to make "Example.com" and "example.com" share one entry.
# A nil domain yields the bare prefix, as string interpolation would.
def self.redis_https_key(domain)
  "HTTPS_DOMAIN_#{domain.to_s.downcase}"
end
|
||||||
|
|
||||||
attr_reader :status, :cookie, :status_code
|
attr_reader :status, :cookie, :status_code
|
||||||
|
|
||||||
def initialize(url, opts = nil)
|
def initialize(url, opts = nil)
|
||||||
|
@ -31,6 +50,7 @@ class FinalDestination
|
||||||
@status = :ready
|
@status = :ready
|
||||||
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
|
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
|
||||||
@cookie = nil
|
@cookie = nil
|
||||||
|
@limited_ips = []
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.connection_timeout
|
def self.connection_timeout
|
||||||
|
@ -66,6 +86,11 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
|
|
||||||
def resolve
|
def resolve
|
||||||
|
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
|
||||||
|
@uri.scheme = "https"
|
||||||
|
@uri = URI(@uri.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
if @limit < 0
|
if @limit < 0
|
||||||
@status = :too_many_redirects
|
@status = :too_many_redirects
|
||||||
return nil
|
return nil
|
||||||
|
@ -132,9 +157,17 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
|
|
||||||
if location
|
if location
|
||||||
|
old_port = @uri.port
|
||||||
|
|
||||||
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
||||||
@uri = URI(location) rescue nil
|
@uri = URI(location) rescue nil
|
||||||
@limit -= 1
|
@limit -= 1
|
||||||
|
|
||||||
|
# https redirect, so just cache that whole new domain is https
|
||||||
|
if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
|
||||||
|
FinalDestination.cache_https_domain(@uri.hostname)
|
||||||
|
end
|
||||||
|
|
||||||
return resolve
|
return resolve
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -191,8 +224,9 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
|
|
||||||
# Rate limit how often this IP can be crawled
|
# Rate limit how often this IP can be crawled
|
||||||
unless @opts[:skip_rate_limit]
|
if !@opts[:skip_rate_limit] && !@limited_ips.include?(address)
|
||||||
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed!
|
@limited_ips << address
|
||||||
|
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed!
|
||||||
end
|
end
|
||||||
|
|
||||||
true
|
true
|
||||||
|
|
|
@ -20,6 +20,7 @@ describe FinalDestination do
|
||||||
when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
|
when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
|
||||||
when 'ignore-me.com' then '53.84.143.152'
|
when 'ignore-me.com' then '53.84.143.152'
|
||||||
when 'force.get.com' then '22.102.29.40'
|
when 'force.get.com' then '22.102.29.40'
|
||||||
|
when 'wikipedia.com' then '1.2.3.4'
|
||||||
else
|
else
|
||||||
as_ip = IPAddr.new(host) rescue nil
|
as_ip = IPAddr.new(host) rescue nil
|
||||||
raise "couldn't lookup #{host}" if as_ip.nil?
|
raise "couldn't lookup #{host}" if as_ip.nil?
|
||||||
|
@ -308,6 +309,27 @@ describe FinalDestination do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
describe "https cache" do
  it 'will cache https lookups' do
    # start clean so a marker left by an earlier run cannot satisfy this example
    FinalDestination.clear_https_cache!("wikipedia.com")

    # first lookup: plain http answers with a redirect to https
    stub_request(:head, "http://wikipedia.com/image.png")
      .to_return(status: 302, body: "", headers: { location: 'https://wikipedia.com/image.png' })
    stub_request(:head, "https://wikipedia.com/image.png")
      .to_return(status: 200, body: "", headers: {})
    stub_request(:get, "https://wikipedia.com/image.png").to_return(status: 200, body: "", headers: {})

    fd('http://wikipedia.com/image.png').resolve

    # the redirect must have been remembered for the hostname
    expect(FinalDestination.is_https_domain?("wikipedia.com")).to eq(true)

    # second lookup: only the https endpoint is stubbed, so the cached marker
    # has to upgrade the http url before any request goes out
    stub_request(:head, "https://wikipedia.com/image2.png")
      .to_return(status: 200, body: "", headers: {})
    stub_request(:get, "https://wikipedia.com/image2.png").to_return(status: 200, body: "", headers: {})

    fd('http://wikipedia.com/image2.png').resolve

    expect(a_request(:any, "http://wikipedia.com/image2.png")).not_to have_been_made
  end
end
|
||||||
|
|
||||||
describe "#escape_url" do
|
describe "#escape_url" do
|
||||||
it "correctly escapes url" do
|
it "correctly escapes url" do
|
||||||
fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"
|
fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"
|
||||||
|
|
Loading…
Reference in New Issue