diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 83f2feb4599..4807405bfac 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -40,6 +40,7 @@ class ApplicationController < ActionController::Base
     end
   end
 
+  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -464,6 +465,19 @@ class ApplicationController < ActionController::Base
 
   private
 
+  def block_crawlers
+    if (
+      request.get? &&
+      !request.xhr? &&
+      !request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
+    )
+
+      request.env["discourse.request_tracker.skip"] = true
+      raise Discourse::InvalidAccess, 'Crawler not allowed'
+    end
+  end
+
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end
diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb
index ce000deeb7d..0c766104642 100644
--- a/lib/middleware/request_tracker.rb
+++ b/lib/middleware/request_tracker.rb
@@ -175,12 +175,6 @@ class Middleware::RequestTracker
       return result
     end
 
-    if block_crawler(request)
-      log_request = false
-      result = [403, { 'Content-Type' => 'text/plain' }, ["Crawler is not allowed."]]
-      return result
-    end
-
     env["discourse.request_tracker"] = self
     MethodProfiler.start
     result = @app.call(env)
@@ -287,13 +281,6 @@ class Middleware::RequestTracker
     end
   end
 
-  def block_crawler(request)
-    request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-  end
-
   def log_later(data, host)
     Scheduler::Defer.later("Track view", _db = nil) do
       self.class.log_request_on_site(data, host)
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 3e19b5f5048..3d1f355be59 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -32,4 +32,72 @@ RSpec.describe ApplicationController do
       end
     end
   end
+
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(response.status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(response.status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(response.status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(response.status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(response.status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(response.status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      ApplicationRequest.clear_cache!
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      expect {
+        get '/srv/status', headers: {
+          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+        }
+        ApplicationRequest.write_cache!
+      }.to_not change { ApplicationRequest.count }
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(response.status).to eq(403)
+    end
+  end
 end
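
Note on the diff above: block_crawlers only raises Discourse::InvalidAccess; the 403s asserted in the specs come from ApplicationController's existing rescue handling for that exception, and the "discourse.request_tracker.skip" env flag is what tells the RequestTracker middleware not to count the blocked request as a page view (which the third spec pins down).

The behaviour the specs exercise also depends on CrawlerDetection.is_blocked_crawler?, which is not part of this diff. Below is a minimal, self-contained sketch of logic consistent with those specs — not the actual implementation in lib/crawler_detection.rb. The SETTINGS hash, the crawler? regex, and the '|'-separated list format are assumptions standing in for SiteSetting and Discourse's real user-agent matching.

# Sketch only: behaviour consistent with the specs above, under assumed
# settings plumbing. Not Discourse's actual CrawlerDetection implementation.
module CrawlerDetectionSketch
  # Stand-in for SiteSetting; assumes list settings are '|'-separated strings.
  SETTINGS = {
    whitelisted_crawler_user_agents: "",
    blacklisted_crawler_user_agents: ""
  }

  # Crude stand-in for real crawler detection. Ordinary browser UAs (like the
  # Chrome UA in the specs) must not count as crawlers, otherwise a configured
  # whitelist would block regular visitors.
  def self.crawler?(user_agent)
    user_agent.to_s.match?(/bot|crawler|spider/i)
  end

  def self.is_blocked_crawler?(user_agent)
    ua = user_agent.to_s
    whitelist = SETTINGS[:whitelisted_crawler_user_agents].split("|").reject(&:empty?)
    blacklist = SETTINGS[:blacklisted_crawler_user_agents].split("|").reject(&:empty?)

    if whitelist.any?
      # With a whitelist set, only crawler UAs matching an entry may pass.
      crawler?(ua) && whitelist.none? { |entry| ua.include?(entry) }
    else
      # Otherwise only blacklisted UAs are blocked.
      blacklist.any? { |entry| ua.include?(entry) }
    end
  end
end

CrawlerDetectionSketch::SETTINGS[:blacklisted_crawler_user_agents] = "Googlebot"
CrawlerDetectionSketch.is_blocked_crawler?("Googlebot/2.1 (+http://www.google.com/bot.html)") # => true
CrawlerDetectionSketch.is_blocked_crawler?("Twitterbot/2.1")                                  # => false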