FIX: move crawler blocking into anon cache
This refinement of the previous fix moves crawler blocking into the anonymous cache middleware. This ensures we never poison the cache when blocking crawlers.
parent 7f98ed69cd
commit e72fd7ae4e
@@ -40,7 +40,6 @@ class ApplicationController < ActionController::Base
     end
   end
 
-  before_action :block_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -465,19 +464,6 @@ class ApplicationController < ActionController::Base
 
   private
 
-  def block_crawlers
-    if (
-      request.get? &&
-      !request.xhr? &&
-      !request.path.ends_with?('robots.txt') &&
-      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
-    )
-
-      request.env["discourse.request_tracker.skip"] = true
-      raise Discourse::InvalidAccess, 'Crawler not allowed'
-    end
-  end
-
   def check_readonly_mode
     @readonly_mode = Discourse.readonly_mode?
   end
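Note: the controller filter removed above ran downstream of Middleware::AnonymousCache, i.e. after the cache had already decided whether to serve or store a response, so the error page rendered for a blocked crawler risked being stored and replayed to anonymous users. A minimal sketch of the hazard, using a deliberately naive stand-in (not Discourse's actual classes):

    # Illustrative only: a simplified Rack-style cache middleware. Anything
    # the downstream app returns -- including a crawler rejection page --
    # gets stored and replayed to every later anonymous visitor of the path.
    class NaiveAnonCache
      def initialize(app)
        @app = app
        @cache = {}
      end

      def call(env)
        @cache[env["PATH_INFO"]] ||= @app.call(env)
      end
    end

Moving the check into the cache middleware itself (next hunks) lets it reject the crawler before any cache lookup or store happens.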
@@ -21,6 +21,13 @@ module Middleware
       @request = Rack::Request.new(@env)
     end
 
+    def blocked_crawler?
+      @request.get? &&
+      !@request.xhr? &&
+      !@request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(@request.env['HTTP_USER_AGENT'])
+    end
+
     def is_mobile=(val)
       @is_mobile = val ? :true : :false
     end
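CrawlerDetection.is_blocked_crawler? is defined elsewhere in Discourse and is not part of this diff. Going by the site settings the specs below exercise, its behaviour is roughly the following; this is a hedged stand-in, and the /bot/i heuristic in particular is an assumption, not the real pattern:

    # Stand-in sketch of CrawlerDetection.is_blocked_crawler?, inferred from
    # the whitelisted/blacklisted_crawler_user_agents specs in this commit.
    module CrawlerDetectionSketch
      def self.is_blocked_crawler?(user_agent, whitelist: [], blacklist: [])
        return false if user_agent.nil?
        if whitelist.any?
          # Whitelist mode: block agents that look like crawlers but are
          # not listed (ordinary browser UAs still pass).
          user_agent.match?(/bot/i) && whitelist.none? { |w| user_agent.include?(w) }
        else
          # Blacklist mode: block only the listed agents.
          blacklist.any? { |b| user_agent.include?(b) }
        end
      end
    end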
@@ -188,6 +195,11 @@ module Middleware
       helper = Helper.new(env)
       force_anon = false
 
+      if helper.blocked_crawler?
+        env["discourse.request_tracker.skip"] = true
+        return [403, {}, "Crawler is not allowed!"]
+      end
+
       if helper.should_force_anonymous?
         force_anon = env["DISCOURSE_FORCE_ANON"] = true
         helper.force_anonymous!
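Two details here: the early return short-circuits before any cache lookup or store, so the 403 can never be cached (note the bare String body; a strictly Rack-compliant body would be an Array like ["Crawler is not allowed!"], since Rack expects a body that responds to #each). And env["discourse.request_tracker.skip"] = true tells the request tracker wrapping this middleware not to count the hit as a page view. A hedged sketch of how such an outer tracker might honor the flag (illustrative, not Middleware::RequestTracker's real code):

    # Illustrative outer middleware: skips page-view accounting when the
    # inner anonymous cache flagged the request as a blocked crawler.
    class RequestTrackerSketch
      def initialize(app)
        @app = app
      end

      def call(env)
        result = @app.call(env)
        track_view(env) unless env["discourse.request_tracker.skip"]
        result
      end

      def track_view(env)
        # e.g. bump ApplicationRequest counters for the day
      end
    end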
@@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
     end
   end
 
+  context "crawler blocking" do
+
+    let :non_crawler do
+      {
+        "HTTP_USER_AGENT" =>
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+      }
+    end
+
+    def get(path, options)
+      middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
+      @env = env({
+        "REQUEST_URI" => path,
+        "PATH_INFO" => path,
+        "REQUEST_PATH" => path
+      }.merge(options[:headers]))
+      @status = middleware.call(@env).first
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: non_crawler
+      expect(@status).to eq(200)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "should never block robots.txt" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/robots.txt', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(200)
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@env["discourse.request_tracker.skip"]).to eq(true)
+    end
+
+    it "blocks json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+
+      get '/srv/status.json', headers: {
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
+      }
+
+      expect(@status).to eq(403)
+    end
+  end
 end
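The get helper in these specs builds a raw Rack env and calls the middleware directly, bypassing the Rails request cycle, which is why they assert on @status rather than response.status. The env helper it relies on is defined earlier in the spec file and is not shown in the diff; presumably it builds a minimal Rack env along these lines (assumed shape, not the actual helper):

    require 'stringio'

    # Assumed shape of the spec file's env helper: a minimal GET Rack env
    # with caller-supplied overrides merged in.
    def env(overrides = {})
      {
        "REQUEST_METHOD" => "GET",
        "HTTP_HOST" => "test.localhost",
        "REQUEST_URI" => "/",
        "PATH_INFO" => "/",
        "rack.input" => StringIO.new
      }.merge(overrides)
    end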
@@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
     end
   end
 
-  context "crawler blocking" do
-    let :non_crawler do
-      {
-        "HTTP_USER_AGENT" =>
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
-      }
-    end
-    it "applies whitelisted_crawler_user_agents correctly" do
-      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-    end
-
-    it "applies blacklisted_crawler_user_agents correctly" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status', headers: non_crawler
-      expect(response.status).to eq(200)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-
-      get '/srv/status', headers: {
-        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
-      }
-
-      expect(response.status).to eq(200)
-    end
-
-    it "blocked crawlers shouldn't log page views" do
-      ApplicationRequest.clear_cache!
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-      expect {
-        get '/srv/status', headers: {
-          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-        }
-        ApplicationRequest.write_cache!
-      }.to_not change { ApplicationRequest.count }
-    end
-
-    it "blocks json requests" do
-      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
-
-      get '/srv/status.json', headers: {
-        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
-      }
-
-      expect(response.status).to eq(403)
-    end
-  end
-
 end