# frozen_string_literal: true require 'socket' require 'ipaddr' require 'excon' require 'rate_limiter' require 'url_helper' # Determine the final endpoint for a Web URI, following redirects class FinalDestination MAX_REQUEST_TIME_SECONDS = 10 MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5 def self.clear_https_cache!(domain) key = redis_https_key(domain) Discourse.redis.without_namespace.del(key) end def self.cache_https_domain(domain) key = redis_https_key(domain) Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1") end def self.is_https_domain?(domain) key = redis_https_key(domain) Discourse.redis.without_namespace.get(key).present? end def self.redis_https_key(domain) "HTTPS_DOMAIN_#{domain}" end DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15" attr_reader :status, :cookie, :status_code, :content_type, :ignored def initialize(url, opts = nil) @url = url @uri = uri(escape_url) if @url @opts = opts || {} @force_get_hosts = @opts[:force_get_hosts] || [] @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || [] @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || [] @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT @opts[:max_redirects] ||= 5 @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } @limit = @opts[:max_redirects] @ignored = [] if @limit > 0 ignore_redirects = [Discourse.base_url_no_prefix] if @opts[:ignore_redirects] ignore_redirects.concat(@opts[:ignore_redirects]) end ignore_redirects.each do |ignore_redirect| ignore_redirect = uri(ignore_redirect) if ignore_redirect.present? && ignore_redirect.hostname @ignored << ignore_redirect.hostname end end end @status = :ready @follow_canonical = @opts[:follow_canonical] @http_verb = http_verb(@force_get_hosts, @follow_canonical) @cookie = nil @limited_ips = [] @verbose = @opts[:verbose] || false @timeout = @opts[:timeout] || nil @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) } @validate_uri = @opts.fetch(:validate_uri) { true } @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent @stop_at_blocked_pages = @opts[:stop_at_blocked_pages] end def self.connection_timeout 20 end def http_verb(force_get_hosts, follow_canonical) if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) } :get else :head end end def timeout @timeout || FinalDestination.connection_timeout end def redirected? @limit < @opts[:max_redirects] end def request_headers result = { "User-Agent" => @user_agent, "Accept" => "*/*", "Accept-Language" => "*", "Host" => @uri.hostname } result['Cookie'] = @cookie if @cookie result end def small_get(request_headers) status_code, response_headers = nil catch(:done) do Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS), open_timeout: timeout) do |http| http.read_timeout = timeout http.request_get(@uri.request_uri, request_headers) do |resp| status_code = resp.code.to_i response_headers = resp.to_hash # see: https://bugs.ruby-lang.org/issues/15624 # if we allow response to return then body will be read # got to abort without reading body throw :done end end end [status_code, response_headers] end # this is a new interface for simply getting # N bytes accounting for all internal logic def get(redirects = @limit, extra_headers: {}, &blk) raise "Must specify block" unless block_given? if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname) @uri.scheme = "https" @uri = URI(@uri.to_s) end return if !validate_uri return if @stop_at_blocked_pages && blocked_domain?(@uri) result, headers_subset = safe_get(@uri, &blk) cookie = headers_subset.set_cookie location = headers_subset.location if result == :redirect && (redirects == 0 || !location) return nil end if result == :redirect old_port = @uri.port location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = uri(location) # https redirect, so just cache that whole new domain is https if old_port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri) FinalDestination.cache_https_domain(@uri.hostname) end return nil if !@uri extra = nil extra = { 'Cookie' => cookie } if cookie get(redirects - 1, extra_headers: extra, &blk) elsif result == :ok @uri.to_s else nil end end def resolve if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname) @uri.scheme = "https" @uri = URI(@uri.to_s) end if @limit < 0 @status = :too_many_redirects log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose return nil end unless validate_uri @status = :invalid_address log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose return nil end @ignored.each do |host| if @uri&.hostname&.match?(host) @status = :resolved return @uri end end if Oneboxer.cached_response_body_exists?(@uri.to_s) @status = :resolved return @uri end headers = request_headers middlewares = Excon.defaults[:middlewares].dup middlewares << Excon::Middleware::Decompress if @http_verb == :get request_start_time = Time.now response_body = +"" request_validator = lambda do |chunk, _remaining_bytes, _total_bytes| response_body << chunk raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri.to_s}") if response_body.bytesize > MAX_REQUEST_SIZE_BYTES raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri.to_s}") if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS end response = Excon.public_send(@http_verb, @uri.to_s, read_timeout: timeout, connect_timeout: timeout, headers: headers, middlewares: middlewares, response_block: request_validator ) if @stop_at_blocked_pages if blocked_domain?(@uri) || response.headers['Discourse-No-Onebox'] == "1" @status = :blocked_page return end end location = nil response_headers = nil response_status = response.status.to_i case response.status when 200 # Cache body of successful `get` requests if @http_verb == :get if Oneboxer.cache_response_body?(@uri) Oneboxer.cache_response_body(@uri.to_s, response_body) end end if @follow_canonical next_url = fetch_canonical_url(response_body) if next_url.to_s.present? && next_url != @uri @follow_canonical = false @uri = next_url @http_verb = http_verb(@force_get_hosts, @follow_canonical) return resolve end end @content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type') @status = :resolved return @uri when 103, 400, 405, 406, 409, 500, 501 response_status, small_headers = small_get(request_headers) if @stop_at_blocked_pages # this may seem weird, but the #to_hash method of the response object # of ruby's net/http lib returns a hash where each value is an array. # small_headers here is like that so our no onebox header value is an # array if it's set. Also the hash keys are always lower-cased. dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1" if dont_onebox || blocked_domain?(@uri) @status = :blocked_page return end end if response_status == 200 @status = :resolved return @uri end response_headers = {} if cookie_val = small_headers['set-cookie'] response_headers[:cookies] = cookie_val end if location_val = small_headers['location'] response_headers[:location] = location_val.join end end unless response_headers response_headers = { cookies: response.data[:cookies] || response.headers[:"set-cookie"], location: response.headers[:location] } end if (300..399).include?(response_status) location = response_headers[:location] end if cookies = response_headers[:cookies] @cookie = Array.wrap(cookies).map { |c| c.split(';').first.strip }.join('; ') end if location redirect_uri = uri(location) if @uri.host == redirect_uri.host && (redirect_uri.path =~ /\/login/ || redirect_uri.path =~ /\/session/) @status = :resolved return @uri end old_port = @uri.port location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present? location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = uri(location) @limit -= 1 # https redirect, so just cache that whole new domain is https if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri) FinalDestination.cache_https_domain(@uri.hostname) end return resolve end # this is weird an exception seems better @status = :failure @status_code = response.status log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}") if @verbose nil rescue Excon::Errors::Timeout log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}") if @verbose nil end def validate_uri !@validate_uri || (validate_uri_format && is_dest_valid?) end def validate_uri_format return false unless @uri return false unless ['https', 'http'].include?(@uri.scheme) return false if @uri.scheme == 'http' && @uri.port != 80 return false if @uri.scheme == 'https' && @uri.port != 443 # Disallow IP based crawling (IPAddr.new(@uri.hostname) rescue nil).nil? end def hostname @uri.hostname end def hostname_matches?(url) url = uri(url) if @uri&.hostname.present? && url&.hostname.present? hostname_parts = url.hostname.split('.') has_wildcard = hostname_parts.first == '*' if has_wildcard @uri.hostname.end_with?(hostname_parts[1..-1].join('.')) else @uri.hostname == url.hostname end end end def is_dest_valid? return false unless @uri && @uri.host # Allowlisted hosts return true if hostname_matches?(SiteSetting.Upload.s3_cdn_url) || hostname_matches?(GlobalSetting.try(:cdn_url)) || hostname_matches?(Discourse.base_url_no_prefix) if SiteSetting.allowed_internal_hosts.present? return true if SiteSetting.allowed_internal_hosts.split("|").any? { |h| h.downcase == @uri.hostname.downcase } end address_s = @opts[:lookup_ip].call(@uri.hostname) return false unless address_s address = IPAddr.new(address_s) if private_ranges.any? { |r| r === address } @status = :invalid_address return false end # Rate limit how often this IP can be crawled if !@opts[:skip_rate_limit] && !@limited_ips.include?(address) @limited_ips << address RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed! end true rescue RateLimiter::LimitExceeded false end def escape_url UrlHelper.escape_uri(@url) end def private_ranges FinalDestination.standard_private_ranges + SiteSetting.blocked_ip_blocks.split('|').map { |r| IPAddr.new(r) rescue nil }.compact end def log(log_level, message) return if @status_code == 404 Rails.logger.public_send( log_level, "#{RailsMultisite::ConnectionManagement.current_db}: #{message}" ) end def self.standard_private_ranges @private_ranges ||= [ IPAddr.new('0.0.0.0/8'), IPAddr.new('127.0.0.1'), IPAddr.new('172.16.0.0/12'), IPAddr.new('192.168.0.0/16'), IPAddr.new('10.0.0.0/8'), IPAddr.new('fc00::/7') ] end def self.lookup_ip(host) if Rails.env.test? "1.1.1.1" else IPSocket::getaddress(host) end rescue SocketError nil end protected def safe_get(uri) result = nil unsafe_close = false headers_subset = Struct.new(:location, :set_cookie).new safe_session(uri) do |http| headers = request_headers.merge( 'Accept-Encoding' => 'gzip', 'Host' => uri.host ) req = Net::HTTP::Get.new(uri.request_uri, headers) http.request(req) do |resp| headers_subset.set_cookie = resp['Set-Cookie'] if @stop_at_blocked_pages dont_onebox = resp["Discourse-No-Onebox"] == "1" if dont_onebox result = :blocked, headers_subset next end end if Net::HTTPRedirection === resp headers_subset.location = resp['location'] result = :redirect, headers_subset end if Net::HTTPSuccess === resp resp.decode_content = true resp.read_body do |chunk| read_next = true catch(:done) do if read_next read_next = false yield resp, chunk, uri read_next = true end end # no clean way of finishing abruptly cause # response likes reading till the end if !read_next unsafe_close = true http.finish raise StandardError end end result = :ok, headers_subset else catch(:done) do yield resp, nil, nil end end end end result rescue StandardError unsafe_close ? [:ok, headers_subset] : raise end def safe_session(uri) Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https"), open_timeout: timeout) do |http| http.read_timeout = timeout yield http end end private def uri(location) begin URI.parse(location) rescue URI::Error end end def fetch_canonical_url(body) return if body.blank? canonical_element = Nokogiri::HTML5(body).at("link[rel='canonical']") return if canonical_element.nil? canonical_uri = uri(canonical_element['href']) return if canonical_uri.blank? return canonical_uri if canonical_uri.host.present? parts = [@uri.host, canonical_uri.to_s] complete_url = canonical_uri.to_s.starts_with?('/') ? parts.join('') : parts.join('/') complete_url = "#{@uri.scheme}://#{complete_url}" if @uri.scheme uri(complete_url) end def blocked_domain?(uri) Onebox::DomainChecker.is_blocked?(uri.hostname) end end