FIX: move crawler blocking into anon cache

This refinement of the previous fix moves crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
Sam 2018-07-04 11:14:43 +10:00
parent 7f98ed69cd
commit e72fd7ae4e
4 changed files with 99 additions and 81 deletions
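
Why the middleware is the right place for this check: the anonymous cache sits in front of the application and stores responses for anonymous GET requests, so a crawler block raised later in ApplicationController could end up cached under a key that ordinary anonymous visitors also hit. Rejecting the request before any cache read or write removes that possibility. A minimal sketch of the idea (the class name and the in-memory store are hypothetical stand-ins, not the Discourse implementation):

require 'rack'

# Hypothetical sketch: reject blocked crawlers *before* touching the
# anonymous cache, so a 403 can never be stored where regular anonymous
# visitors would read it back.
class CrawlerAwareAnonymousCache
  def initialize(app, store: {})
    @app = app
    @store = store # stand-in for the real cache backend
  end

  def call(env)
    request = Rack::Request.new(env)

    if blocked_crawler?(request)
      env["discourse.request_tracker.skip"] = true
      return [403, {}, ["Crawler is not allowed!"]]
    end

    # Only requests that passed the crawler check ever reach the cache.
    @store[request.fullpath] ||= @app.call(env)
  end

  private

  def blocked_crawler?(request)
    request.get? &&
      !request.xhr? &&
      !request.path.end_with?('robots.txt') &&
      request.user_agent.to_s.match?(/googlebot/i) # stand-in for CrawlerDetection
  end
end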

View File

@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end

View File

@@ -21,6 +21,13 @@ module Middleware
         @request = Rack::Request.new(@env)
       end
 
+      def blocked_crawler?
+        @request.get? &&
+        !@request.xhr? &&
+        !@request.path.ends_with?('robots.txt') &&
+        CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+      end
+
       def is_mobile=(val)
         @is_mobile = val ? :true : :false
       end
@@ -188,6 +195,11 @@ module Middleware
       helper = Helper.new(env)
       force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, ["Crawler is not allowed!"]]
+      end
+
       if helper.should_force_anonymous?
         force_anon = env["DISCOURSE_FORCE_ANON"] = true
         helper.force_anonymous!
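
For orientation, the new path can be exercised directly against the middleware, which is essentially what the spec below does; a hedged sketch (SiteSetting and CrawlerDetection are Discourse internals assumed to be loaded):

app        = lambda { |_env| [200, { "Content-Type" => "text/html" }, ["ok"]] }
middleware = Middleware::AnonymousCache.new(app)

SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

env = Rack::MockRequest.env_for(
  "/srv/status",
  "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)"
)

status, _headers, _body = middleware.call(env)
# status is 403 and env["discourse.request_tracker.skip"] is true, so the
# request tracker will not record a page view for the blocked crawler.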

View File

@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+    end
+  end
 end
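
The env(...) call inside the get helper above refers to a spec helper that is not part of this diff; a minimal sketch of what it is assumed to provide (hypothetical, for orientation only):

# Hypothetical approximation of the spec's env helper: a baseline Rack env
# for an anonymous GET request, merged with per-test overrides.
def env(overrides = {})
  Rack::MockRequest.env_for("http://test.localhost/").merge(overrides)
end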

View File

@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-    end
-  end
 end