From e72fd7ae4eee160aa8aedb106b62ef95e82c8b9d Mon Sep 17 00:00:00 2001
From: Sam
Date: Wed, 4 Jul 2018 11:14:43 +1000
Subject: [PATCH] FIX: move crawler blocking into anon cache

This refinement of the previous fix moves the crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
---
 app/controllers/application_controller.rb    | 14 ---
 lib/middleware/anonymous_cache.rb             | 12 +++
 .../middleware/anonymous_cache_spec.rb        | 87 +++++++++++++++++++
 spec/requests/application_controller_spec.rb  | 67 --------------
 4 files changed, 99 insertions(+), 81 deletions(-)

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 4807405bfac..83f2feb4599 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
 
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end
diff --git a/lib/middleware/anonymous_cache.rb b/lib/middleware/anonymous_cache.rb
index 2a195c2f930..9e29f06854c 100644
--- a/lib/middleware/anonymous_cache.rb
+++ b/lib/middleware/anonymous_cache.rb
@@ -21,6 +21,13 @@ module Middleware
         @request = Rack::Request.new(@env)
       end
 
+      def blocked_crawler?
+        @request.get? &&
+        !@request.xhr? &&
+        !@request.path.ends_with?('robots.txt') &&
+        CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+      end
+
       def is_mobile=(val)
         @is_mobile = val ? :true : :false
       end
@@ -188,6 +195,11 @@ module Middleware
       helper = Helper.new(env)
       force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, ["Crawler is not allowed!"]]
+      end
+
       if helper.should_force_anonymous?
         force_anon = env["DISCOURSE_FORCE_ANON"] = true
         helper.force_anonymous!
diff --git a/spec/components/middleware/anonymous_cache_spec.rb b/spec/components/middleware/anonymous_cache_spec.rb
index ed4ad96c34e..0ee3bc46b77 100644
--- a/spec/components/middleware/anonymous_cache_spec.rb
+++ b/spec/components/middleware/anonymous_cache_spec.rb
@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+    end
+  end
+
 end
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 3d1f355be59..f498c807b56 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-    end
-  end
-
 end
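
For readers skimming the diff, the sketch below is an illustrative, self-contained Rack example of the pattern this patch adopts, not Discourse code: the CrawlerBlocker class, its BLOCKED_AGENTS list and the inline user-agent match are hypothetical stand-ins for Discourse's AnonymousCache helper, CrawlerDetection.is_blocked_crawler? and the crawler user-agent site settings.

# Illustrative sketch only -- not Discourse code. BLOCKED_AGENTS is a
# hypothetical stand-in for CrawlerDetection and the crawler site settings.
require 'rack'

class CrawlerBlocker
  BLOCKED_AGENTS = ['Badbot'].freeze # hypothetical block list

  def initialize(app)
    @app = app
  end

  def call(env)
    request = Rack::Request.new(env)

    if blocked_crawler?(request)
      # Mark the request so downstream trackers skip it, then short-circuit
      # before any cache lookup or cache write can happen.
      env["discourse.request_tracker.skip"] = true
      return [403, { "Content-Type" => "text/plain" }, ["Crawler is not allowed!"]]
    end

    @app.call(env) # only allowed traffic reaches the cache and the app below
  end

  private

  def blocked_crawler?(request)
    request.get? &&
      !request.xhr? &&
      !request.path.end_with?('robots.txt') &&
      BLOCKED_AGENTS.any? { |agent| request.user_agent.to_s.include?(agent) }
  end
end

# Usage in config.ru: mount the blocker ahead of any caching middleware.
#   use CrawlerBlocker
#   run ->(env) { [200, { "Content-Type" => "text/plain" }, ["ok"]] }

Because the crawler check now runs before the anonymous cache looks anything up or stores anything, blocking a crawler can no longer write a 403 response into the cache that regular anonymous visitors would then be served, which is the cache-poisoning failure mode the earlier controller-level block left open.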