only applies to GET HTML requests

Neil Lalonde 2018-03-22 17:57:44 -04:00
parent ced7e9a691
commit a84bb81ab5
2 changed files with 13 additions and 1 deletion

lib/middleware/request_tracker.rb

@@ -277,7 +277,10 @@ class Middleware::RequestTracker
   end

   def block_crawler(request)
-    !request.path.ends_with?('robots.txt') &&
+    request.get? &&
+    !request.xhr? &&
+    request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
+    !request.path.ends_with?('robots.txt') &&
     CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
   end
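With the new guard, crawler blocking is only even considered for full-page HTML GETs. A minimal sketch (not part of the commit) of how the added checks behave on plain Rack requests; html_get? is a hypothetical stand-in for the predicate without the CrawlerDetection lookup, and it uses core String#end_with? in place of ActiveSupport's ends_with?:

require 'rack'

# Hypothetical helper mirroring the new guard, minus CrawlerDetection.
def html_get?(request)
  request.get? &&
    !request.xhr? &&
    request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
    !request.path.end_with?('robots.txt')
end

page = Rack::Request.new(Rack::MockRequest.env_for('/t/1', 'HTTP_ACCEPT' => 'text/html'))
json = Rack::Request.new(Rack::MockRequest.env_for('/t/1.json', 'HTTP_ACCEPT' => 'application/json'))
xhr  = Rack::Request.new(Rack::MockRequest.env_for('/t/1', 'HTTP_ACCEPT' => 'text/html',
                                                   'HTTP_X_REQUESTED_WITH' => 'XMLHttpRequest'))

html_get?(page) # => truthy: candidate for crawler blocking
html_get?(json) # => nil (falsy): JSON requests pass through untouched
html_get?(xhr)  # => false: XHRs pass through untouched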

spec/components/middleware/request_tracker_spec.rb

@@ -9,6 +9,7 @@ describe Middleware::RequestTracker do
       "HTTP_USER_AGENT" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
       "REQUEST_URI" => "/path?bla=1",
       "REQUEST_METHOD" => "GET",
+      "HTTP_ACCEPT" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
       "rack.input" => ""
     }.merge(opts)
   end
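The env helper above seeds every test env with a browser-like Accept header, and Hash#merge lets an individual test override a single key. A quick illustration of that merge behavior in plain Ruby, with shortened values for readability:

defaults = {
  "REQUEST_METHOD" => "GET",
  "HTTP_ACCEPT"    => "text/html,application/xhtml+xml,*/*;q=0.8"
}

# merge prefers the argument's values, so one test can flip one header:
defaults.merge("HTTP_ACCEPT" => "application/json")
# => { "REQUEST_METHOD" => "GET", "HTTP_ACCEPT" => "application/json" }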
@@ -317,6 +318,14 @@ describe Middleware::RequestTracker do
         ApplicationRequest.write_cache!
       }.to_not change { ApplicationRequest.count }
     end
+
+    it "allows json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env(
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+        'HTTP_ACCEPT' => 'application/json'
+      )))
+    end
   end
 end
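For contrast, a hypothetical companion spec (not part of this commit) for the case the middleware should still block: the same blacklisted user agent, but with the HTML Accept header the env defaults provide. It assumes the middleware answers blocked crawlers with a 403 status:

it "blocks html requests from blacklisted crawlers" do
  SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
  status, = middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))
  expect(status).to eq(403)
end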