FIX: move crawler blocking to app controller

We need access to site settings in multisite, and they are not yet
available when the request tracker middleware attempts to read them.
Sam 2018-07-04 10:07:14 +10:00
parent d1b21aa73b
commit 7f98ed69cd
3 changed files with 82 additions and 13 deletions
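
The ordering problem behind the fix: Rack middleware runs before the multisite database connection is established, while controller filters run after the whole stack, so SiteSetting is only safe to read from the controller. A minimal sketch of the two positions, assuming a simplified stack; the class below and the diagram are illustrative, not the actual Discourse stack definition.

    # Simplified view of the request path in a multisite Discourse, top to bottom.
    # Names here are illustrative; only the ordering matters.
    #
    #   Middleware::RequestTracker            <- too early: no site DB selected,
    #                                            so SiteSetting can't be read
    #   RailsMultisite::ConnectionManagement  <- picks the site's DB from the host
    #   ... rest of the middleware stack ...
    #   ApplicationController before_actions  <- :block_crawlers runs here, with
    #                                            SiteSetting backed by the right DB

    class EarlyMiddlewareSketch
      def initialize(app)
        @app = app
      end

      def call(env)
        # Reading SiteSetting.blacklisted_crawler_user_agents at this point
        # would consult the wrong (or no) database in a multisite setup.
        @app.call(env)
      end
    end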

app/controllers/application_controller.rb

@@ -40,6 +40,7 @@ class ApplicationController < ActionController::Base
     end
   end
+  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -464,6 +465,19 @@ class ApplicationController < ActionController::Base
   private

+  def block_crawlers
+    if (
+      request.get? &&
+      !request.xhr? &&
+      !request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
+    )
+      request.env["discourse.request_tracker.skip"] = true
+      raise Discourse::InvalidAccess, 'Crawler not allowed'
+    end
+  end
+
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end
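
CrawlerDetection.is_blocked_crawler? carries the actual policy; the specs added below pin down its observable behaviour. A rough sketch of those semantics, assuming pipe-delimited setting values; the module, constant, and crawler heuristic below are stand-ins, not Discourse's implementation.

    # Rough sketch of the semantics is_blocked_crawler? must satisfy according
    # to the specs below; the real implementation differs in detail.
    module CrawlerPolicySketch
      CRAWLER_HINT = /bot|crawler|spider/i  # crude stand-in for real detection

      def self.blocked?(user_agent, whitelist: '', blacklist: '')
        agent = user_agent.to_s
        if !whitelist.empty?
          # whitelist mode: a crawler is blocked unless it matches a listed name
          return agent.match?(CRAWLER_HINT) &&
                 whitelist.split('|').none? { |name| agent.include?(name) }
        end
        # blacklist mode: block only crawlers matching a listed name
        !blacklist.empty? && blacklist.split('|').any? { |name| agent.include?(name) }
      end
    end

    # Mirrors the spec expectations:
    # blocked?('Googlebot/2.1 ...',  whitelist: 'Googlebot') => false (200)
    # blocked?('Anotherbot/2.1 ...', whitelist: 'Googlebot') => true  (403)
    # blocked?('Googlebot/2.1 ...',  blacklist: 'Googlebot') => true  (403)
    # blocked?('Twitterbot/2.1 ...', blacklist: 'Googlebot') => false (200)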

lib/middleware/request_tracker.rb

@@ -175,12 +175,6 @@ class Middleware::RequestTracker
       return result
     end

-    if block_crawler(request)
-      log_request = false
-      result = [403, { 'Content-Type' => 'text/plain' }, ["Crawler is not allowed."]]
-      return result
-    end
-
     env["discourse.request_tracker"] = self
     MethodProfiler.start
     result = @app.call(env)
@@ -287,13 +281,6 @@ class Middleware::RequestTracker
     end
   end

-  def block_crawler(request)
-    request.get? &&
-    !request.xhr? &&
-    !request.path.ends_with?('robots.txt') &&
-    CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-  end
-
   def log_later(data, host)
     Scheduler::Defer.later("Track view", _db = nil) do
       self.class.log_request_on_site(data, host)
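
On the other side of the hand-off, the controller sets request.env["discourse.request_tracker.skip"] = true before raising, and the tracker consults that flag before logging a page view; the third spec below verifies this end to end. A minimal sketch of that contract, with illustrative method names rather than the tracker's real log path.

    class TrackerSkipSketch
      def initialize(app)
        @app = app
      end

      def call(env)
        @app.call(env)
      ensure
        # The controller marks blocked crawler requests via this env key before
        # raising Discourse::InvalidAccess, so they never count as page views.
        log_request(env) unless env["discourse.request_tracker.skip"]
      end

      def log_request(env)
        # stand-in for the real deferred logging in Middleware::RequestTracker
      end
    end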

spec/requests/application_controller_spec.rb

@@ -32,4 +32,72 @@ RSpec.describe ApplicationController do
       end
     end
   end
+
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(response.status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(response.status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(response.status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(response.status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(response.status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(response.status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      ApplicationRequest.clear_cache!
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      expect {
+        get '/srv/status', headers: {
+          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+        }
+        ApplicationRequest.write_cache!
+      }.to_not change { ApplicationRequest.count }
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(response.status).to eq(403)
+    end
+  end
 end