FIX: move crawler blocking into anon cache

This refinement of the previous fix moves crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
Sam 2018-07-04 11:14:43 +10:00
parent 7f98ed69cd
commit e72fd7ae4e
4 changed files with 99 additions and 81 deletions
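
In rough terms, the ordering this relies on looks like the sketch below (simplified, not the literal Discourse code): the crawler check now happens inside the caching middleware itself, before any cache lookup or write, so a blocked 403 response can never end up in the anonymous cache. Previously the block lived in a controller before_action, which runs downstream of the caching middleware.

  # Simplified sketch of the new request flow; the real change is the
  # Helper#blocked_crawler? / AnonymousCache#call diff below.
  class AnonymousCacheSketch
    def initialize(app)
      @app = app
    end

    def call(env)
      helper = Middleware::AnonymousCache::Helper.new(env)

      # Blocked crawlers are rejected up front: no cache read, no cache
      # write, and the request tracker is told to skip this request.
      if helper.blocked_crawler?
        env["discourse.request_tracker.skip"] = true
        return [403, {}, ["Crawler is not allowed!"]]
      end

      # Everything else falls through to the anonymous cache and, on a
      # miss, to the Rails app itself.
      @app.call(env)
    end
  end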


@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
 
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end


@@ -21,6 +21,13 @@ module Middleware
      @request = Rack::Request.new(@env)
    end
 
+    def blocked_crawler?
+      @request.get? &&
+      !@request.xhr? &&
+      !@request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+    end
+
    def is_mobile=(val)
      @is_mobile = val ? :true : :false
    end
@@ -188,6 +195,11 @@ module Middleware
      helper = Helper.new(env)
      force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, "Crawler is not allowed!"]
+      end
+
      if helper.should_force_anonymous?
        force_anon = env["DISCOURSE_FORCE_ANON"] = true
        helper.force_anonymous!

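CrawlerDetection.is_blocked_crawler? itself is not part of this diff. As a rough behavioural sketch of what the specs below expect from the two site settings (looks_like_crawler? is a hypothetical helper, and the real implementation differs):

  # Behavioural sketch only: this mirrors what the specs assert, not
  # Discourse's actual CrawlerDetection code.
  def is_blocked_crawler?(user_agent)
    if SiteSetting.whitelisted_crawler_user_agents.present?
      # With a whitelist, crawler user agents are blocked unless they match
      # the whitelist; ordinary browser user agents are never blocked.
      looks_like_crawler?(user_agent) &&
        !user_agent.include?(SiteSetting.whitelisted_crawler_user_agents)
    elsif SiteSetting.blacklisted_crawler_user_agents.present?
      # With a blacklist, only user agents matching the blacklist are blocked.
      user_agent.include?(SiteSetting.blacklisted_crawler_user_agents)
    else
      false
    end
  end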

@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+    end
+  end
 end


@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-    end
-  end
 end