2019-05-02 18:17:27 -04:00
# frozen_string_literal: true
2017-10-17 01:22:38 -04:00
require 'socket'
require 'ipaddr'
2017-05-22 12:23:04 -04:00
require 'excon'
2017-05-23 15:03:04 -04:00
require 'rate_limiter'
2017-12-12 11:50:39 -05:00
require 'url_helper'
2017-05-22 12:23:04 -04:00
# Determine the final endpoint for a Web URI, following redirects.
#
# Two entry points are offered:
#
# * #resolve issues HEAD (or GET) requests via Excon and follows +Location+
#   headers until a 200 is seen, returning the final URI.
# * #get streams the body of the final response to a caller supplied block.
#
# Unless the +:validate_uri+ option is false, every destination must be an
# http/https URL on the default port, must not use an IP literal as host,
# and must not resolve to one of the blocked address ranges.
class FinalDestination

  # Removes the "this domain speaks HTTPS" marker for +domain+.
  def self.clear_https_cache!(domain)
    key = redis_https_key(domain)
    Discourse.redis.without_namespace.del(key)
  end

  # Remembers for one day that +domain+ redirected plain HTTP to HTTPS.
  def self.cache_https_domain(domain)
    key = redis_https_key(domain)
    # SETEX takes (key, ttl, value): the ttl comes before the stored value.
    Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1").present?
  end

  # True when +domain+ currently has an HTTPS marker in Redis.
  def self.is_https_domain?(domain)
    key = redis_https_key(domain)
    Discourse.redis.without_namespace.get(key).present?
  end

  # Redis key under which the HTTPS marker for +domain+ is stored.
  def self.redis_https_key(domain)
    "HTTPS_DOMAIN_#{domain}"
  end

  attr_reader :status, :cookie, :status_code, :ignored

  # url  - String URL to resolve (may be nil).
  # opts - optional Hash; it is copied, the caller's object is never modified:
  #        :force_get_hosts               - URLs whose host must be fetched with GET instead of HEAD
  #        :preserve_fragment_url_hosts   - URLs whose host keeps the #fragment across redirects
  #        :force_custom_user_agent_hosts - URLs whose host gets Onebox's user agent
  #        :max_redirects                 - redirect budget (default 5)
  #        :lookup_ip                     - callable(host) returning an IP string
  #        :ignore_hostnames              - host patterns that count as resolved without a request
  #        :ignore_redirects              - URLs whose hostnames are added to the ignored list
  #        :verbose, :timeout, :validate_uri (default true), :skip_rate_limit
  def initialize(url, opts = nil)
    @url = url
    @uri = uri(escape_url) if @url

    # Work on copies: defaults are written into @opts and hostnames are
    # appended to @ignored below.
    @opts = opts ? opts.dup : {}
    @force_get_hosts = @opts[:force_get_hosts] || []
    @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
    @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
    @opts[:max_redirects] ||= 5
    @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
    @ignored = (@opts[:ignore_hostnames] || []).dup
    @limit = @opts[:max_redirects]

    if @limit > 0
      ignore_redirects = [Discourse.base_url_no_prefix]
      ignore_redirects.concat(@opts[:ignore_redirects]) if @opts[:ignore_redirects]

      ignore_redirects.each do |ignore_redirect|
        ignored_host = uri(ignore_redirect)&.hostname
        @ignored << ignored_host if ignored_host
      end
    end

    @status = :ready
    @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
    @cookie = nil
    @limited_ips = []
    @verbose = @opts[:verbose] || false
    @timeout = @opts[:timeout] || nil
    @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
    @validate_uri = @opts.fetch(:validate_uri) { true }
    @user_agent =
      if @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) }
        Onebox.options.user_agent
      else
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
      end
  end

  # Default open/read timeout, in seconds.
  def self.connection_timeout
    20
  end

  # Timeout for this instance: the :timeout option or the class default.
  def timeout
    @timeout || FinalDestination.connection_timeout
  end

  # True once #resolve has consumed at least one redirect.
  def redirected?
    @limit < @opts[:max_redirects]
  end

  # Headers sent with every request; includes the cookie collected so far.
  def request_headers
    result = {
      "User-Agent" => @user_agent,
      "Accept" => "*/*",
      "Host" => @uri.hostname
    }

    result['Cookie'] = @cookie if @cookie

    result
  end

  # Issues a GET for @uri and returns [status_code, headers_hash] without
  # reading the response body.
  def small_get(request_headers)
    status_code, response_headers = nil

    catch(:done) do
      Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
        http.open_timeout = timeout
        http.read_timeout = timeout
        http.request_get(@uri.request_uri, request_headers) do |resp|
          status_code = resp.code.to_i
          response_headers = resp.to_hash

          # see: https://bugs.ruby-lang.org/issues/15624
          # if we allow response to return then body will be read
          # got to abort without reading body
          throw :done
        end
      end
    end

    [status_code, response_headers]
  end

  # this is a new interface for simply getting
  # N bytes accounting for all internal logic
  #
  # Streams the response for +uri+ to the block (which receives the response,
  # a body chunk and the URI) and follows up to +redirects+ redirects.
  # Returns the final URL as a String, or nil when the destination is
  # invalid, the redirect budget is exhausted or the request did not succeed.
  #
  # NOTE(review): +extra_headers+ is accepted and passed along on redirects
  # but is never added to the outgoing request - confirm whether the cookie
  # is meant to be forwarded.
  def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
    raise "Must specify block" unless block_given?

    if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
      # Upgrade a copy: changing the scheme in place would leave the shared
      # @uri as "https on port 80", which fails validation.
      upgraded = uri.dup
      upgraded.scheme = "https"
      uri = URI(upgraded.to_s)
    end

    # Validate the URI that is about to be fetched (redirect targets
    # included), not only the URL this instance was created with.
    return nil unless uri && valid_destination?(uri)

    result, (location, cookie) = safe_get(uri, &blk)

    case result
    when :redirect
      return nil if redirects == 0 || !location

      old_port = uri.port
      location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
      next_uri = uri(location)
      return nil unless next_uri

      # https redirect, so just cache that whole new domain is https
      if old_port == 80 && next_uri.port == 443 && (URI::HTTPS === next_uri)
        FinalDestination.cache_https_domain(next_uri.hostname)
      end

      extra = cookie ? { 'Cookie' => cookie } : nil
      get(next_uri, redirects - 1, extra_headers: extra, &blk)
    when :ok
      uri.to_s
    end
  end

  # Follows redirects starting at @uri and returns the final URI, or nil.
  # #status ends up as :resolved, :too_many_redirects, :invalid_address or
  # :failure (with #status_code set); it is left untouched on an Excon timeout.
  def resolve
    if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
      @uri.scheme = "https"
      @uri = URI(@uri.to_s)
    end

    if @limit < 0
      @status = :too_many_redirects
      log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose
      return nil
    end

    unless validate_uri
      @status = :invalid_address
      log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose
      return nil
    end

    # Ignored hosts are considered resolved without issuing a request.
    @ignored.each do |host|
      if @uri&.hostname&.match?(host)
        @status = :resolved
        return @uri
      end
    end

    headers = request_headers
    response = Excon.public_send(
      @http_verb,
      @uri.to_s,
      read_timeout: timeout,
      headers: headers
    )

    location = nil
    response_headers = nil
    response_status = response.status.to_i

    case response.status
    when 200
      @status = :resolved
      return @uri
    when 400, 405, 406, 409, 501
      # The server may simply dislike the verb used: retry with a body-less GET.
      response_status, small_headers = small_get(request_headers)

      if response_status == 200
        @status = :resolved
        return @uri
      end

      response_headers = {}
      if cookie_val = small_headers['set-cookie']
        response_headers[:cookies] = cookie_val
      end

      if location_val = small_headers['location']
        response_headers[:location] = location_val.join
      end
    end

    unless response_headers
      response_headers = {
        cookies: response.data[:cookies] || response.headers[:"set-cookie"],
        location: response.headers[:location]
      }
    end

    if (300..399).include?(response_status)
      location = response_headers[:location]
    end

    if cookies = response_headers[:cookies]
      # Keep only the name=value part of every Set-Cookie header.
      @cookie = Array.wrap(cookies).map { |c| c.split(';').first.strip }.join('; ')
    end

    if location
      redirect_uri = uri(location)

      # A same-host redirect to a login/session page means the current URL
      # is as far as an anonymous request can go.
      if redirect_uri && @uri.host == redirect_uri.host &&
          (redirect_uri.path =~ /\/login/ || redirect_uri.path =~ /\/session/)
        @status = :resolved
        return @uri
      end

      old_port = @uri.port
      location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
      location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
      # nil when the Location header cannot be parsed; the recursive call
      # below then reports :invalid_address (when validation is enabled).
      @uri = uri(location)
      @limit -= 1

      # https redirect, so just cache that whole new domain is https
      if old_port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
        FinalDestination.cache_https_domain(@uri.hostname)
      end

      return resolve
    end

    # this is weird an exception seems better
    @status = :failure
    @status_code = response.status
    log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}") if @verbose
    nil
  rescue Excon::Errors::Timeout
    log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}") if @verbose
    nil
  end

  # True when validation is disabled or @uri passes both the format check
  # and the destination check.
  def validate_uri
    !@validate_uri || (validate_uri_format && is_dest_valid?)
  end

  # True when @uri is http on port 80 or https on port 443 and its host is
  # not an IP literal.
  def validate_uri_format
    return false unless @uri
    return false unless ['https', 'http'].include?(@uri.scheme)
    return false if @uri.scheme == 'http' && @uri.port != 80
    return false if @uri.scheme == 'https' && @uri.port != 443

    # Disallow IP based crawling
    parse_ip(@uri.hostname).nil?
  end

  # Hostname of the current @uri.
  def hostname
    @uri.hostname
  end

  # Truthy when +url+ parses and has the same hostname as @uri.
  def hostname_matches?(url)
    other = uri(url)
    @uri && other && @uri.hostname == other.hostname
  end

  # Decides whether @uri's host may be contacted: allowlisted hosts pass
  # straight away; otherwise the host is looked up, rejected when the address
  # is in a blocked range, and counted against a per-IP rate limit.
  def is_dest_valid?
    return false unless @uri && @uri.host

    # Allowlisted hosts
    return true if hostname_matches?(SiteSetting.Upload.s3_cdn_url) ||
      hostname_matches?(GlobalSetting.try(:cdn_url)) ||
      hostname_matches?(Discourse.base_url_no_prefix)

    if SiteSetting.allowed_internal_hosts.present?
      return true if SiteSetting.allowed_internal_hosts.split("|").any? { |h| h.downcase == @uri.hostname.downcase }
    end

    address_s = @opts[:lookup_ip].call(@uri.hostname)
    return false unless address_s

    # IPv4-mapped IPv6 results (::ffff:a.b.c.d) are compared as plain IPv4
    # so they cannot slip past the IPv4 ranges.
    address = IPAddr.new(address_s).native

    if private_ranges.any? { |r| r === address }
      @status = :invalid_address
      return false
    end

    # Rate limit how often this IP can be crawled
    if !@opts[:skip_rate_limit] && !@limited_ips.include?(address)
      @limited_ips << address
      RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed!
    end

    true
  rescue RateLimiter::LimitExceeded
    false
  end

  # The original URL, escaped by UrlHelper.
  def escape_url
    UrlHelper.escape_uri(@url)
  end

  # Built-in blocked ranges plus the parsable entries of the
  # blocked_ip_blocks site setting.
  def private_ranges
    configured = SiteSetting.blocked_ip_blocks.split('|').map { |range| parse_ip(range) }
    FinalDestination.standard_private_ranges + configured.compact
  end

  # Writes +message+ to the Rails log at +log_level+, prefixed with the
  # current multisite database; nothing is logged after a 404.
  def log(log_level, message)
    return if @status_code == 404

    Rails.logger.public_send(
      log_level,
      "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
    )
  end

  # Address ranges that must never be crawled: "this network", loopback,
  # link-local and private ranges for both IPv4 and IPv6.
  def self.standard_private_ranges
    @private_ranges ||= [
      IPAddr.new('0.0.0.0/8'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('::1'),
      IPAddr.new('fc00::/7'),
      IPAddr.new('fe80::/10')
    ]
  end

  # Resolves +host+ to an IP address string; nil when the lookup fails.
  # Returns a fixed address in the test environment.
  def self.lookup_ip(host)
    if Rails.env.test?
      "1.1.1.1"
    else
      IPSocket::getaddress(host)
    end
  rescue SocketError
    nil
  end

  protected

  # Performs a GET for +uri+ and yields (response, chunk, uri) for every body
  # chunk of a successful response, or (response, nil, nil) otherwise.
  # Returns :ok, [:redirect, [location, set_cookie]] or nil. The block may
  # `throw :done` to stop reading early.
  def safe_get(uri)
    result = nil
    unsafe_close = false

    safe_session(uri) do |http|
      headers = request_headers.merge(
        'Accept-Encoding' => 'gzip',
        'Host' => uri.host
      )

      req = Net::HTTP::Get.new(uri.request_uri, headers)

      http.request(req) do |resp|
        if Net::HTTPRedirection === resp
          result = :redirect, [resp['location'], resp['Set-Cookie']]
        end

        if Net::HTTPSuccess === resp
          resp.decode_content = true
          resp.read_body do |chunk|
            read_next = true

            # read_next stays false when the block throws :done mid-chunk
            catch(:done) do
              if read_next
                read_next = false
                yield resp, chunk, uri
                read_next = true
              end
            end

            # no clean way of finishing abruptly cause
            # response likes reading till the end
            if !read_next
              unsafe_close = true
              http.finish
              raise StandardError
            end
          end
          result = :ok
        else
          catch(:done) do
            yield resp, nil, nil
          end
        end
      end
    end

    result
  rescue StandardError
    # The deliberate abort above counts as success; anything else propagates.
    unsafe_close ? :ok : raise
  end

  # Opens a Net::HTTP session to +uri+ with this instance's timeouts and
  # yields it.
  def safe_session(uri)
    Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
      http.read_timeout = timeout
      http.open_timeout = timeout
      yield http
    end
  end

  private

  # Parses +location+ into a URI; nil when it is not a valid URI.
  def uri(location)
    URI.parse(location)
  rescue URI::Error
    nil
  end

  # Parses +value+ as an IP address or range; nil when it is not one.
  def parse_ip(value)
    IPAddr.new(value)
  rescue StandardError
    nil
  end

  # Runs #validate_uri against +candidate+ instead of @uri, then restores @uri.
  def valid_destination?(candidate)
    original = @uri
    @uri = candidate
    validate_uri
  ensure
    @uri = original
  end
end