FIX: Apply crawler rate limits to cached requests (#27174)

This commit moves the crawler rate-limiting logic out of the application controller and into the request tracker middleware, so that rate limits apply to all crawler requests instead of only those that reach the application controller. Some requests are served early from the middleware stack without ever reaching the Rails app for performance reasons (e.g. by `AnonymousCache`), so crawlers were getting 200 responses even after exceeding their limits, when they should have been getting 429 responses.

Internal topic: t/128810.
Osama Sayegh 2024-05-27 16:26:35 +03:00 committed by GitHub
parent 7992d7a65a
commit 361992bb74
4 changed files with 87 additions and 26 deletions
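
For context on why the move matters: `Middleware::AnonymousCache` sits in the Rack stack and can answer a request before it ever reaches the Rails router, so a controller `before_action` never runs on a cache hit. Below is a minimal sketch of that effect, with an illustrative cache class (not Discourse's actual implementation):

    # Simplified Rack middleware: on a cache hit it short-circuits the
    # request, so nothing below it in the stack (including controller
    # before_action filters) ever executes.
    class TinyAnonymousCache
      def initialize(app, store = {})
        @app = app
        @store = store
      end

      def call(env)
        key = env["PATH_INFO"]
        if (cached = @store[key])
          return [200, { "X-Cache" => "HIT" }, [cached]] # early 200, app untouched
        end
        status, headers, body = @app.call(env)
        @store[key] = body.join if status == 200 # populate cache on a miss
        [status, headers, body]
      end
    end

Discourse runs `Middleware::RequestTracker` above the anonymous cache, so the stack is roughly `RequestTracker -> AnonymousCache -> Rails app`; enforcing crawler limits in the tracker therefore covers cached responses, while a controller filter cannot.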

app/controllers/application_controller.rb

@@ -29,7 +29,6 @@ class ApplicationController < ActionController::Base
     end
   end

-  before_action :rate_limit_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -1053,28 +1052,6 @@ class ApplicationController < ActionController::Base
     Theme.where(id: ids).pluck(:id, :name).to_h.to_json
   end

-  def rate_limit_crawlers
-    return if current_user.present?
-    return if SiteSetting.slow_down_crawler_user_agents.blank?
-
-    user_agent = request.user_agent&.downcase
-    return if user_agent.blank?
-
-    SiteSetting
-      .slow_down_crawler_user_agents
-      .downcase
-      .split("|")
-      .each do |crawler|
-        if user_agent.include?(crawler)
-          key = "#{crawler}_crawler_rate_limit"
-          limiter =
-            RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key)
-          limiter.performed!
-          break
-        end
-      end
-  end
-
   def run_second_factor!(action_class, action_data: nil, target_user: current_user)
     if current_user && target_user != current_user
       # Anon can run 2fa against another target, but logged-in users should not.

lib/middleware/request_tracker.rb

@@ -267,6 +267,20 @@ class Middleware::RequestTracker
       end
       return 429, headers, [message]
     end
+
+    if !cookie
+      if error_details = check_crawler_limits(env)
+        available_in, error_code = error_details
+        message = "Too many crawling requests. Error code: #{error_code}."
+        headers = {
+          "Content-Type" => "text/plain",
+          "Retry-After" => available_in.to_s,
+          "Discourse-Rate-Limit-Error-Code" => error_code,
+        }
+        return 429, headers, [message]
+      end
+    end

     env["discourse.request_tracker"] = self
     MethodProfiler.start
@@ -443,4 +457,30 @@ class Middleware::RequestTracker
       end
     end
  end
+
+  def check_crawler_limits(env)
+    slow_down_agents = SiteSetting.slow_down_crawler_user_agents
+    return if slow_down_agents.blank?
+
+    user_agent = env["HTTP_USER_AGENT"]&.downcase
+    return if user_agent.blank?
+
+    return if !CrawlerDetection.crawler?(user_agent)
+
+    slow_down_agents
+      .downcase
+      .split("|")
+      .each do |crawler|
+        if user_agent.include?(crawler)
+          key = "#{crawler}_crawler_rate_limit"
+          limiter =
+            RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key)
+          limiter.performed!
+          break
+        end
+      end
+    nil
+  rescue RateLimiter::LimitExceeded => e
+    [e.available_in, e.error_code]
+  end
 end
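
Taken together, the two hunks above mean a crawler matching an entry in `slow_down_crawler_user_agents` is allowed one request per `slow_down_crawler_rate` seconds (the `RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key)` call); when `performed!` raises `RateLimiter::LimitExceeded`, the rescue converts it to `[available_in, error_code]` and the middleware answers directly. A hypothetical wire-level exchange, built only from the headers and message in the diff (the `Retry-After` value is illustrative):

    GET / HTTP/1.1
    User-Agent: iam badcrawler

    HTTP/1.1 429 Too Many Requests
    Content-Type: text/plain
    Retry-After: 100
    Discourse-Rate-Limit-Error-Code: badcrawler_crawler_rate_limit

    Too many crawling requests. Error code: badcrawler_crawler_rate_limit.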

spec/lib/middleware/request_tracker_spec.rb

@@ -369,6 +369,31 @@ RSpec.describe Middleware::RequestTracker do
     end
   end

+  describe "crawler rate limits" do
+    context "when there are multiple matching crawlers" do
+      before { SiteSetting.slow_down_crawler_user_agents = "badcrawler2|badcrawler22" }
+
+      it "only checks limits for the first match" do
+        env = env("HTTP_USER_AGENT" => "badcrawler")
+        status, _ = middleware.call(env)
+        expect(status).to eq(200)
+      end
+    end
+
+    it "compares user agents in a case-insensitive manner" do
+      SiteSetting.slow_down_crawler_user_agents = "BaDCRawLer"
+
+      env1 = env("HTTP_USER_AGENT" => "bADcrAWLer")
+      env2 = env("HTTP_USER_AGENT" => "bADcrAWLer")
+
+      status, _ = middleware.call(env1)
+      expect(status).to eq(200)
+
+      status, _ = middleware.call(env2)
+      expect(status).to eq(429)
+    end
+  end
+
   describe "register_ip_skipper" do
     before do
       Middleware::RequestTracker.register_ip_skipper { |ip| ip == "1.1.1.2" }

spec/requests/application_controller_spec.rb

@@ -1103,13 +1103,17 @@ RSpec.describe ApplicationController do
   end

   describe "crawlers in slow_down_crawler_user_agents site setting" do
-    before { RateLimiter.enable }
+    before do
+      Fabricate(:admin) # to prevent redirect to the wizard
+      RateLimiter.enable
+      SiteSetting.slow_down_crawler_rate = 128
+      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
+    end

     use_redis_snapshotting

     it "are rate limited" do
-      SiteSetting.slow_down_crawler_rate = 128
-      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
-
       now = Time.zone.now
       freeze_time now
@@ -1141,6 +1145,21 @@ RSpec.describe ApplicationController do
       get "/", headers: { "HTTP_USER_AGENT" => "iam problematiccrawler" }
       expect(response.status).to eq(200)
     end
+
+    context "with anonymous caching" do
+      before do
+        global_setting :anon_cache_store_threshold, 1
+        Middleware::AnonymousCache.enable_anon_cache
+      end
+
+      it "doesn't bypass crawler rate limits" do
+        get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" }
+        expect(response.status).to eq(200)
+
+        get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" }
+        expect(response.status).to eq(429)
+      end
+    end
   end

   describe "#banner_json" do