# File: discourse/lib/final_destination.rb

require "socket"
require "ipaddr"
require 'net/http'
require 'uri'

require 'excon'
require 'rate_limiter'
# Determine the final endpoint for a Web URI, following redirects.
#
# A HEAD request is issued for the URL (falling back to a GET when the server
# answers 405/501) and `Location` headers are followed until a 200 response
# is reached, the redirect budget is exhausted, or a hop fails validation.
#
# Every hop is validated before it is requested: only http/https on their
# default ports, no IP-literal hosts, and the host must not resolve to a
# loopback, private, link-local or site-blacklisted address.
#
# `status` is one of :ready, :resolved, :too_many_redirects, :invalid_address.
class FinalDestination
  attr_reader :status, :cookie

  # @param url [String] the URL to resolve; an unparseable URL is tolerated
  #   and simply makes `resolve` return nil
  # @param opts [Hash, nil] optional settings:
  #   :max_redirects   - redirect budget (default 5)
  #   :lookup_ip       - callable taking a hostname and returning an IP
  #                      string or nil (defaults to a DNS lookup)
  #   :skip_rate_limit - when truthy, the per-IP crawl rate limit is skipped
  def initialize(url, opts = nil)
    @uri = parse_uri(url)
    # Work on a copy so the caller's hash is never mutated (and may be frozen).
    @opts = (opts || {}).dup
    @opts[:max_redirects] ||= 5
    @opts[:lookup_ip] ||= lambda do |host|
      begin
        IPSocket.getaddress(host)
      rescue SocketError
        nil
      end
    end
    @limit = @opts[:max_redirects]
    @status = :ready
    @cookie = nil
  end

  # Timeout, in seconds, applied to the outgoing requests.
  def self.connection_timeout
    20
  end

  # @return [Boolean] true once at least one redirect has been followed
  def redirected?
    @limit < @opts[:max_redirects]
  end

  # Headers sent with every request; includes the cookie captured from a
  # previous hop, if any.
  #
  # @return [Hash{String => String}]
  def request_headers
    result = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
      "Accept" => "text/html",
      "Host" => @uri.hostname
    }
    result['cookie'] = @cookie if @cookie
    result
  end

  # Issues a GET for the current URI and returns the response as soon as the
  # status line and headers are available, without reading the body.
  #
  # @param headers [Hash] request headers
  # @return [Net::HTTPResponse]
  def small_get(headers)
    timeout = FinalDestination.connection_timeout
    # The timeouts must be passed to `start`: by the time the block runs the
    # connection is already open, so setting open_timeout there is too late.
    Net::HTTP.start(@uri.host, @uri.port,
                    use_ssl: @uri.is_a?(URI::HTTPS),
                    open_timeout: timeout,
                    read_timeout: timeout) do |http|
      request = Net::HTTP::Get.new(@uri.request_uri, headers)
      http.request(request) do |response|
        # Returning from inside the block skips downloading the body.
        return response
      end
    end
  end

  # Follows redirects starting from the current URI.
  #
  # @return [URI, nil] the final URI when a 200 response is reached; nil when
  #   validation fails, the redirect budget is exhausted (status becomes
  #   :too_many_redirects) or the chain ends without a 200
  def resolve
    if @limit < 0
      @status = :too_many_redirects
      return nil
    end
    return nil unless validate_uri

    headers = request_headers
    response = Excon.head(
      @uri.to_s,
      read_timeout: FinalDestination.connection_timeout,
      headers: headers
    )

    location = nil
    case response.status
    when 200
      @status = :resolved
      return @uri
    when 405, 501
      # The server does not support HEAD; retry with a GET.
      get_response = small_get(headers)
      if get_response.code.to_i == 200
        @status = :resolved
        return @uri
      end
      cookie_fields = get_response.get_fields('set-cookie')
      @cookie = cookie_fields.join if cookie_fields
      location_fields = get_response.get_fields('location')
      location = location_fields.join if location_fields
    else
      response.headers.each do |name, value|
        case name.downcase
        when 'set-cookie' then @cookie = value
        when 'location' then location = value
        end
      end
    end

    location = location.to_s.strip
    return nil if location.empty?

    # Resolve against the current URI so absolute, host-relative ("/x"),
    # protocol-relative ("//host/x") and path-relative ("x") targets all work.
    # An unparseable target leaves @uri nil, which fails validation next hop.
    @uri = absolute_location(location)
    @limit -= 1
    resolve
  end

  # @return [Boolean] true when the current URI is well formed and points at
  #   an allowed destination
  def validate_uri
    validate_uri_format && is_dest_valid?
  end

  # Checks scheme, port and host of the current URI.
  #
  # @return [Boolean] false for a missing URI, a non-http(s) scheme, a
  #   non-default port, a missing host or an IP-literal host
  def validate_uri_format
    return false unless @uri
    return false unless %w[https http].include?(@uri.scheme)
    return false if @uri.scheme == 'http' && @uri.port != 80
    return false if @uri.scheme == 'https' && @uri.port != 443

    host = @uri.hostname
    return false if host.nil? || host.empty?

    # Disallow IP based crawling
    !ip_literal?(host)
  end

  # Checks that the current host may be crawled: configured CDN hosts are
  # always allowed; any other host must resolve to an address outside
  # `private_ranges` and must not have exceeded the crawl rate limit.
  #
  # @return [Boolean]
  def is_dest_valid?
    # Guard first: everything below dereferences @uri.
    return false unless @uri && @uri.host

    # CDNs are always allowed
    return true if SiteSetting.s3_cdn_url.present? &&
      @uri.hostname == URI(SiteSetting.s3_cdn_url).hostname

    global_cdn = GlobalSetting.try(:cdn_url)
    return true if global_cdn.present? &&
      @uri.hostname == URI(global_cdn).hostname

    address_s = @opts[:lookup_ip].call(@uri.hostname)
    return false unless address_s

    # Fail closed when the resolver hands back something that is not an IP.
    address = parse_ip(address_s)
    if address.nil? || private_ranges.any? { |range| range === address }
      @status = :invalid_address
      return false
    end

    # Rate limit how often this IP can be crawled
    unless @opts[:skip_rate_limit]
      RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed!
    end

    true
  rescue RateLimiter::LimitExceeded
    false
  end

  # @return [Array<IPAddr>] the standard blocked ranges plus the
  #   pipe-separated blocks from SiteSetting.blacklist_ip_blocks (entries
  #   that do not parse are ignored)
  def private_ranges
    custom = SiteSetting.blacklist_ip_blocks.split('|').map do |block|
      begin
        IPAddr.new(block)
      rescue ArgumentError
        nil
      end
    end
    FinalDestination.standard_private_ranges + custom.compact
  end

  # Address ranges that must never be crawled: "this network", loopback,
  # RFC 1918 private ranges, link-local and IPv6 unique-local addresses.
  #
  # @return [Array<IPAddr>]
  def self.standard_private_ranges
    @private_ranges ||= [
      IPAddr.new('0.0.0.0/8'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('::1'),
      IPAddr.new('fc00::/7'),
      IPAddr.new('fe80::/10')
    ]
  end

  # Resolves a hostname to an IP address string.
  #
  # @raise [SocketError] when the host cannot be resolved
  def self.lookup_ip(host)
    IPSocket.getaddress(host)
  end

  private

  # Parses +url+, returning nil instead of raising on anything unparseable.
  def parse_uri(url)
    URI(url)
  rescue StandardError
    nil
  end

  # Resolves a Location header value against the current URI; nil on failure.
  def absolute_location(location)
    URI.join(@uri.to_s, location)
  rescue StandardError
    nil
  end

  # True when +host+ is an IPv4/IPv6 literal rather than a name.
  def ip_literal?(host)
    IPAddr.new(host)
    true
  rescue ArgumentError # IPAddr's own errors are ArgumentError subclasses
    false
  end

  # Parses a resolver answer, unwrapping IPv4-mapped IPv6 addresses so they
  # are compared against the IPv4 ranges; nil when it is not an IP address.
  def parse_ip(address_s)
    IPAddr.new(address_s).native
  rescue ArgumentError
    nil
  end
end