From e72fd7ae4eee160aa8aedb106b62ef95e82c8b9d Mon Sep 17 00:00:00 2001
From: Sam
Date: Wed, 4 Jul 2018 11:14:43 +1000
Subject: [PATCH] FIX: move crawler blocking into anon cache

This refinement of the previous fix moves the crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
---
 app/controllers/application_controller.rb    | 14 ---
 lib/middleware/anonymous_cache.rb             | 12 +++
 .../middleware/anonymous_cache_spec.rb        | 87 +++++++++++++++++++
 spec/requests/application_controller_spec.rb  | 67 --------------
 4 files changed, 99 insertions(+), 81 deletions(-)

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 4807405bfac..83f2feb4599 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
 
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end
diff --git a/lib/middleware/anonymous_cache.rb b/lib/middleware/anonymous_cache.rb
index 2a195c2f930..9e29f06854c 100644
--- a/lib/middleware/anonymous_cache.rb
+++ b/lib/middleware/anonymous_cache.rb
@@ -21,6 +21,13 @@ module Middleware
         @request = Rack::Request.new(@env)
       end
 
+      def blocked_crawler?
+        @request.get? &&
+        !@request.xhr? &&
+        !@request.path.ends_with?('robots.txt') &&
+        CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+      end
+
       def is_mobile=(val)
         @is_mobile = val ? :true : :false
       end
@@ -188,6 +195,11 @@ module Middleware
       helper = Helper.new(env)
       force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, ["Crawler is not allowed!"]]
+      end
+
       if helper.should_force_anonymous?
         force_anon = env["DISCOURSE_FORCE_ANON"] = true
         helper.force_anonymous!
diff --git a/spec/components/middleware/anonymous_cache_spec.rb b/spec/components/middleware/anonymous_cache_spec.rb
index ed4ad96c34e..0ee3bc46b77 100644
--- a/spec/components/middleware/anonymous_cache_spec.rb
+++ b/spec/components/middleware/anonymous_cache_spec.rb
@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+    end
+  end
+
 end
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 3d1f355be59..f498c807b56 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-    end
-  end
-
 end
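
For readers skimming the diff, the sketch below is an illustrative, self-contained Rack example of the pattern this patch adopts, not Discourse code: the CrawlerBlocker class, its BLOCKED_AGENTS list and the inline user-agent match are hypothetical stand-ins for Discourse's AnonymousCache helper, CrawlerDetection.is_blocked_crawler? and the crawler user-agent site settings.

# Illustrative sketch only -- not Discourse code. BLOCKED_AGENTS is a
# hypothetical stand-in for CrawlerDetection and the crawler site settings.
require 'rack'

class CrawlerBlocker
  BLOCKED_AGENTS = ['Badbot'].freeze # hypothetical block list

  def initialize(app)
    @app = app
  end

  def call(env)
    request = Rack::Request.new(env)

    if blocked_crawler?(request)
      # Mark the request so downstream trackers skip it, then short-circuit
      # before any cache lookup or cache write can happen.
      env["discourse.request_tracker.skip"] = true
      return [403, { "Content-Type" => "text/plain" }, ["Crawler is not allowed!"]]
    end

    @app.call(env) # only allowed traffic reaches the cache and the app below
  end

  private

  def blocked_crawler?(request)
    request.get? &&
      !request.xhr? &&
      !request.path.end_with?('robots.txt') &&
      BLOCKED_AGENTS.any? { |agent| request.user_agent.to_s.include?(agent) }
  end
end

# Usage in config.ru: mount the blocker ahead of any caching middleware.
#   use CrawlerBlocker
#   run ->(env) { [200, { "Content-Type" => "text/plain" }, ["ok"]] }

Because the crawler check now runs before the anonymous cache looks anything up or stores anything, blocking a crawler can no longer write a 403 response into the cache that regular anonymous visitors would then be served, which is the cache-poisoning failure mode the earlier controller-level block left open.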