diff --git a/config/site_settings.yml b/config/site_settings.yml index 33bd1327e4b..90cd1922ae9 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1638,6 +1638,16 @@ onebox: facebook_app_access_token: default: "" secret: true + cache_onebox_response_body: + default: false + hidden: true + cache_onebox_response_body_domains: + default: "" + type: list + hidden: true + cache_onebox_user_agent: + default: "" + hidden: true spam: add_rel_nofollow_to_user_content: true hide_post_sensitivity: diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 8bf06ee6e0f..0c0db2e6220 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -16,7 +16,7 @@ class FinalDestination def self.cache_https_domain(domain) key = redis_https_key(domain) - Discourse.redis.without_namespace.setex(key, "1", 1.day.to_i).present? + Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1") end def self.is_https_domain?(domain) @@ -28,6 +28,8 @@ class FinalDestination "HTTPS_DOMAIN_#{domain}" end + DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15" + attr_reader :status, :cookie, :status_code, :ignored def initialize(url, opts = nil) @@ -38,6 +40,7 @@ class FinalDestination @force_get_hosts = @opts[:force_get_hosts] || [] @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || [] @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || [] + @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT @opts[:max_redirects] ||= 5 @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } @@ -67,7 +70,7 @@ class FinalDestination @timeout = @opts[:timeout] || nil @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) } @validate_uri = @opts.fetch(:validate_uri) { true } - @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15" + @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent end def self.connection_timeout @@ -182,20 +185,35 @@ class FinalDestination end end + if Oneboxer.cached_response_body_exists?(@uri.to_s) + @status = :resolved + return @uri + end + headers = request_headers + middlewares = Excon.defaults[:middlewares] + middlewares << Excon::Middleware::Decompress if @http_verb == :get + response = Excon.public_send(@http_verb, @uri.to_s, read_timeout: timeout, - headers: headers + headers: headers, + middlewares: middlewares ) location = nil response_headers = nil - response_status = response.status.to_i case response.status when 200 + # Cache body of successful `get` requests + if @http_verb == :get + if Oneboxer.cache_response_body?(@uri) + Oneboxer.cache_response_body(@uri.to_s, response.body) + end + end + @status = :resolved return @uri when 400, 405, 406, 409, 501 diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 79f5070374f..c30c4e98dec 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -26,13 +26,17 @@ module Oneboxer @ignore_redirects ||= ['http://www.dropbox.com', 'http://store.steampowered.com', 'http://vimeo.com', Discourse.base_url] end - def self.force_get_hosts - @force_get_hosts ||= begin - hosts = ['http://us.battle.net', 'https://news.yahoo.com'] - amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk) + def self.amazon_domains + amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk) + amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" } + end - hosts + amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" } - end + def self.force_get_hosts + hosts = ['http://us.battle.net', 'https://news.yahoo.com'] + hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" } + hosts += amazon_domains + + hosts.uniq end def self.force_custom_user_agent_hosts @@ -80,6 +84,33 @@ module Oneboxer Discourse.cache.delete(onebox_failed_cache_key(url)) end + def self.cache_response_body?(uri) + uri = URI.parse(uri) if uri.is_a?(String) + + if SiteSetting.cache_onebox_response_body? + SiteSetting.cache_onebox_response_body_domains.split("|").any? { |domain| uri.hostname.ends_with?(domain) } + end + end + + def self.cache_response_body(uri, response) + key = redis_cached_response_body_key(uri) + Discourse.redis.without_namespace.setex(key, 1.minutes.to_i, response) + end + + def self.cached_response_body_exists?(uri) + key = redis_cached_response_body_key(uri) + Discourse.redis.without_namespace.exists(key).to_i > 0 + end + + def self.fetch_cached_response_body(uri) + key = redis_cached_response_body_key(uri) + Discourse.redis.without_namespace.get(key) + end + + def self.redis_cached_response_body_key(uri) + "CACHED_RESPONSE_#{uri}" + end + # Parse URLs out of HTML, returning the document when finished. def self.each_onebox_link(string_or_doc, extra_paths: []) doc = string_or_doc @@ -368,12 +399,18 @@ module Oneboxer def self.external_onebox(url) Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do - fd = FinalDestination.new(url, - ignore_redirects: ignore_redirects, - ignore_hostnames: blocked_domains, - force_get_hosts: force_get_hosts, - force_custom_user_agent_hosts: force_custom_user_agent_hosts, - preserve_fragment_url_hosts: preserve_fragment_url_hosts) + fd_options = { + ignore_redirects: ignore_redirects, + ignore_hostnames: blocked_domains, + force_get_hosts: force_get_hosts, + force_custom_user_agent_hosts: force_custom_user_agent_hosts, + preserve_fragment_url_hosts: preserve_fragment_url_hosts + } + + user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present? + fd_options[:default_user_agent] = user_agent_override if user_agent_override + + fd = FinalDestination.new(url, fd_options) uri = fd.resolve if fd.status != :resolved @@ -389,20 +426,22 @@ module Oneboxer return error_box end - return blank_onebox if uri.blank? || blocked_domains.map { |hostname| uri.hostname.match?(hostname) }.any? + return blank_onebox if uri.blank? || blocked_domains.any? { |hostname| uri.hostname.match?(hostname) } - options = { + onebox_options = { max_width: 695, sanitize_config: Onebox::DiscourseOneboxSanitizeConfig::Config::DISCOURSE_ONEBOX, allowed_iframe_origins: allowed_iframe_origins, hostname: GlobalSetting.hostname, facebook_app_access_token: SiteSetting.facebook_app_access_token, - disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls + disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls, + body_cacher: self } - options[:cookie] = fd.cookie if fd.cookie + onebox_options[:cookie] = fd.cookie if fd.cookie + onebox_options[:user_agent] = user_agent_override if user_agent_override - r = Onebox.preview(uri.to_s, options) + r = Onebox.preview(uri.to_s, onebox_options) result = { onebox: r.to_s, preview: r&.placeholder_html.to_s } # NOTE: Call r.errors after calling placeholder_html diff --git a/spec/components/oneboxer_spec.rb b/spec/components/oneboxer_spec.rb index 5d6681422b7..c2adaa818b9 100644 --- a/spec/components/oneboxer_spec.rb +++ b/spec/components/oneboxer_spec.rb @@ -330,9 +330,51 @@ describe Oneboxer do end describe '#force_get_hosts' do + before do + SiteSetting.cache_onebox_response_body_domains = "example.net|example.com|example.org" + end + it "includes Amazon sites" do expect(Oneboxer.force_get_hosts).to include('https://www.amazon.ca') end + + it "includes cache_onebox_response_body_domains" do + expect(Oneboxer.force_get_hosts).to include('https://www.example.com') + end + end + + describe 'cache_onebox_response_body' do + let(:html) do + <<~HTML + +
+cache me if you can
+ + + HTML + end + + let(:url) { "https://www.example.com/my/great/content" } + let(:url2) { "https://www.example2.com/my/great/content" } + + before do + stub_request(:any, url).to_return(status: 200, body: html) + stub_request(:any, url2).to_return(status: 200, body: html) + + SiteSetting.cache_onebox_response_body = true + SiteSetting.cache_onebox_response_body_domains = "example.net|example.com|example.org" + end + + it "caches when domain matches" do + preview = Oneboxer.preview(url, invalidate_oneboxes: true) + expect(Oneboxer.cached_response_body_exists?(url)).to eq(true) + expect(Oneboxer.fetch_cached_response_body(url)).to eq(html) + end + + it "ignores cache when domain not present" do + preview = Oneboxer.preview(url2, invalidate_oneboxes: true) + expect(Oneboxer.cached_response_body_exists?(url2)).to eq(false) + end end end