FIX: move crawler blocking into anon cache

This refinement of the previous fix moves crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
Sam 2018-07-04 11:14:43 +10:00
parent 7f98ed69cd
commit e72fd7ae4e
4 changed files with 99 additions and 81 deletions
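
In rough terms, the ordering this relies on looks like the sketch below (simplified, not the literal Discourse code): the crawler check now happens inside the caching middleware itself, before any cache lookup or write, so a blocked 403 response can never end up in the anonymous cache. Previously the block lived in a controller before_action, which runs downstream of the caching middleware.

  # Simplified sketch of the new request flow; the real change is the
  # Helper#blocked_crawler? / AnonymousCache#call diff below.
  class AnonymousCacheSketch
    def initialize(app)
      @app = app
    end

    def call(env)
      helper = Middleware::AnonymousCache::Helper.new(env)

      # Blocked crawlers are rejected up front: no cache read, no cache
      # write, and the request tracker is told to skip this request.
      if helper.blocked_crawler?
        env["discourse.request_tracker.skip"] = true
        return [403, {}, ["Crawler is not allowed!"]]
      end

      # Everything else falls through to the anonymous cache and, on a
      # miss, to the Rails app itself.
      @app.call(env)
    end
  end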


@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
 
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end


@@ -21,6 +21,13 @@ module Middleware
      @request = Rack::Request.new(@env)
    end
 
+    def blocked_crawler?
+      @request.get? &&
+      !@request.xhr? &&
+      !@request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+    end
+
    def is_mobile=(val)
      @is_mobile = val ? :true : :false
    end
@@ -188,6 +195,11 @@ module Middleware
      helper = Helper.new(env)
      force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, "Crawler is not allowed!"]
+      end
+
      if helper.should_force_anonymous?
        force_anon = env["DISCOURSE_FORCE_ANON"] = true
        helper.force_anonymous!

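CrawlerDetection.is_blocked_crawler? itself is not part of this diff. As a rough behavioural sketch of what the specs below expect from the two site settings (looks_like_crawler? is a hypothetical helper, and the real implementation differs):

  # Behavioural sketch only: this mirrors what the specs assert, not
  # Discourse's actual CrawlerDetection code.
  def is_blocked_crawler?(user_agent)
    if SiteSetting.whitelisted_crawler_user_agents.present?
      # With a whitelist, crawler user agents are blocked unless they match
      # the whitelist; ordinary browser user agents are never blocked.
      looks_like_crawler?(user_agent) &&
        !user_agent.include?(SiteSetting.whitelisted_crawler_user_agents)
    elsif SiteSetting.blacklisted_crawler_user_agents.present?
      # With a blacklist, only user agents matching the blacklist are blocked.
      user_agent.include?(SiteSetting.blacklisted_crawler_user_agents)
    else
      false
    end
  end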

@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+    end
+  end
 end


@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-    end
-  end
 end