FIX: move crawler blocking into anon cache

This refinement of the previous fix moves crawler blocking into the
anonymous cache middleware.

This ensures we never incorrectly poison the cache when blocking crawlers.
Sam 2018-07-04 11:14:43 +10:00
parent 7f98ed69cd
commit e72fd7ae4e
4 changed files with 99 additions and 81 deletions
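
Why the middleware is the right place for this check: the anonymous cache sits in front of the application and stores responses for anonymous GET requests, so a crawler block raised later in ApplicationController could end up cached under a key that ordinary anonymous visitors also hit. Rejecting the request before any cache read or write removes that possibility. A minimal sketch of the idea (the class name and the in-memory store are hypothetical stand-ins, not the Discourse implementation):

require 'rack'

# Hypothetical sketch: reject blocked crawlers *before* touching the
# anonymous cache, so a 403 can never be stored where regular anonymous
# visitors would read it back.
class CrawlerAwareAnonymousCache
  def initialize(app, store: {})
    @app = app
    @store = store # stand-in for the real cache backend
  end

  def call(env)
    request = Rack::Request.new(env)

    if blocked_crawler?(request)
      env["discourse.request_tracker.skip"] = true
      return [403, {}, ["Crawler is not allowed!"]]
    end

    # Only requests that passed the crawler check ever reach the cache.
    @store[request.fullpath] ||= @app.call(env)
  end

  private

  def blocked_crawler?(request)
    request.get? &&
      !request.xhr? &&
      !request.path.end_with?('robots.txt') &&
      request.user_agent.to_s.match?(/googlebot/i) # stand-in for CrawlerDetection
  end
end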

View File

@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end

View File

@@ -21,6 +21,13 @@ module Middleware
         @request = Rack::Request.new(@env)
       end
 
+      def blocked_crawler?
+        @request.get? &&
+        !@request.xhr? &&
+        !@request.path.ends_with?('robots.txt') &&
+        CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+      end
+
       def is_mobile=(val)
         @is_mobile = val ? :true : :false
       end
@@ -188,6 +195,11 @@ module Middleware
       helper = Helper.new(env)
       force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, ["Crawler is not allowed!"]]
+      end
+
       if helper.should_force_anonymous?
         force_anon = env["DISCOURSE_FORCE_ANON"] = true
         helper.force_anonymous!
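
For orientation, the new path can be exercised directly against the middleware, which is essentially what the spec below does; a hedged sketch (SiteSetting and CrawlerDetection are Discourse internals assumed to be loaded):

app        = lambda { |_env| [200, { "Content-Type" => "text/html" }, ["ok"]] }
middleware = Middleware::AnonymousCache.new(app)

SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

env = Rack::MockRequest.env_for(
  "/srv/status",
  "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)"
)

status, _headers, _body = middleware.call(env)
# status is 403 and env["discourse.request_tracker.skip"] is true, so the
# request tracker will not record a page view for the blocked crawler.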

View File

@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+      expect(@status).to eq(403)
+    end
+  end
 end
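
The env(...) call inside the get helper above refers to a spec helper that is not part of this diff; a minimal sketch of what it is assumed to provide (hypothetical, for orientation only):

# Hypothetical approximation of the spec's env helper: a baseline Rack env
# for an anonymous GET request, merged with per-test overrides.
def env(overrides = {})
  Rack::MockRequest.env_for("http://test.localhost/").merge(overrides)
end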

View File

@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-      expect(response.status).to eq(403)
-    end
-  end
 end