FEATURE: Replace `Crawl-delay` directive with proper rate limiting (#15131)
We have a couple of site settings, `slow_down_crawler_user_agents` and `slow_down_crawler_rate`, that are meant to allow site owners to signal to specific crawlers that they're crawling the site too aggressively and should slow down.

When a crawler is added to the `slow_down_crawler_user_agents` setting, Discourse currently adds a `Crawl-delay` directive for that crawler in `/robots.txt`. Unfortunately, many crawlers don't support the `Crawl-delay` directive in `/robots.txt`, which leaves site owners with no options when a crawler crawls the site too aggressively.

This PR replaces the `Crawl-delay` directive with proper rate limiting for crawlers added to the `slow_down_crawler_user_agents` list. On every request made by a non-logged-in user, Discourse checks the User Agent string; if it contains one of the values in the `slow_down_crawler_user_agents` list, Discourse allows only 1 request every N seconds for that User Agent (N being the value of the `slow_down_crawler_rate` setting), and any further requests made within the same interval get a 429 response.

The `slow_down_crawler_user_agents` setting becomes quite dangerous with this PR, since it could rate limit much, if not all, anonymous traffic if it is not used appropriately. To protect against this scenario, we've added a couple of new validations that run when the setting is changed: 1) each value added to the setting must be at least 3 characters long, and 2) no value may be a substring of a token found in popular browser User Agents. The current list of prohibited values is: apple, windows, linux, ubuntu, gecko, firefox, chrome, safari, applewebkit, webkit, mozilla, macintosh, khtml, intel, osx, os x, iphone, ipad and mac.
parent 9896fc7d33
commit 7bd3986b21
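As a quick illustration of the intended end state, here is a minimal sketch of how a site owner would use the two settings together once this change lands (the crawler names and the 60-second rate are example values, not defaults):

```ruby
# Example configuration, e.g. from a Rails console; values are illustrative.
SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
SiteSetting.slow_down_crawler_rate = 60 # allow 1 request per 60 seconds per matching crawler

# From here on, any anonymous request whose User-Agent string contains
# "badcrawler" or "problematiccrawler" (matched case-insensitively) is allowed
# once per 60-second window; further requests in that window get a 429
# response with a Retry-After header.
```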
```diff
@@ -29,6 +29,7 @@ class ApplicationController < ActionController::Base
     end
   end
 
+  before_action :rate_limit_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -958,4 +959,27 @@ class ApplicationController < ActionController::Base
     ids = Theme.transform_ids(id)
     Theme.where(id: ids).pluck(:id, :name).to_h.to_json
   end
+
+  def rate_limit_crawlers
+    return if current_user.present?
+    return if SiteSetting.slow_down_crawler_user_agents.blank?
+
+    user_agent = request.user_agent&.downcase
+    return if user_agent.blank?
+
+    SiteSetting.slow_down_crawler_user_agents.downcase.split("|").each do |crawler|
+      if user_agent.include?(crawler)
+        key = "#{crawler}_crawler_rate_limit"
+        limiter = RateLimiter.new(
+          nil,
+          key,
+          1,
+          SiteSetting.slow_down_crawler_rate,
+          error_code: key
+        )
+        limiter.performed!
+        break
+      end
+    end
+  end
 end
```
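For context, a hedged sketch of the limiter behaviour the new `rate_limit_crawlers` before_action relies on, assuming Discourse's `RateLimiter` semantics as exercised by the request specs further down: `performed!` consumes the single allowed request per window and raises `RateLimiter::LimitExceeded` once the allowance is used up, which the controller surfaces as a 429 response with a `Retry-After` header.

```ruby
# Hypothetical standalone usage; "badcrawler" is an example crawler value.
key = "badcrawler_crawler_rate_limit"
limiter = RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key)

limiter.performed! # first matching request in the window is allowed
limiter.performed! # second request raises RateLimiter::LimitExceeded -> rendered as 429
```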
```diff
@@ -84,16 +84,6 @@ class RobotsTxtController < ApplicationController
       result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
-    if SiteSetting.slow_down_crawler_user_agents.present?
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << {
-          name: agent,
-          delay: SiteSetting.slow_down_crawler_rate,
-          disallow: deny_paths
-        }
-      end
-    end
-
     result
   end
 end
```
```diff
@@ -5,9 +5,6 @@
 #
 <% @robots_info[:agents].each do |agent| %>
 User-agent: <%= agent[:name] %>
-<%- if agent[:delay] -%>
-Crawl-delay: <%= agent[:delay] %>
-<%- end -%>
 <% agent[:disallow].each do |path| %>
 Disallow: <%= path %>
 <% end %>
```
```diff
@@ -216,6 +216,8 @@ en:
       local_login_cannot_be_disabled_if_second_factor_enforced: "You cannot disable local login if 2FA is enforced. Disable enforced 2FA before disabling local logins."
       cannot_enable_s3_uploads_when_s3_enabled_globally: "You cannot enable S3 uploads because S3 uploads are already globally enabled, and enabling this site-level could cause critical issues with uploads"
       cors_origins_should_not_have_trailing_slash: "You should not add the trailing slash (/) to CORS origins."
+      slow_down_crawler_user_agent_must_be_at_least_3_characters: "User agents must be at least 3 characters long to avoid incorrectly rate limiting human users."
+      slow_down_crawler_user_agent_cannot_be_popular_browsers: "You cannot add any of the following values to the setting: %{values}."
       conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to <br><a href="https://meta.discourse.org/t/76575">https://meta.discourse.org/t/76575</a>'
     onebox:
       invalid_address: "Sorry, we were unable to generate a preview for this web page, because the server '%{hostname}' could not be found. Instead of a preview, only a link will appear in your post. :cry:"
@@ -1611,7 +1613,7 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     allowed_crawler_user_agents: "User agents of web crawlers that should be allowed to access the site. WARNING! SETTING THIS WILL DISALLOW ALL CRAWLERS NOT LISTED HERE!"
     blocked_crawler_user_agents: "Unique case insensitive word in the user agent string identifying web crawlers that should not be allowed to access the site. Does not apply if allowlist is defined."
-    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive"
+    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited as configured in the \"slow down crawler rate\" setting. Each value must be at least 3 characters long."
     slow_down_crawler_rate: "If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)"
     content_security_policy: "Enable Content-Security-Policy"
     content_security_policy_report_only: "Enable Content-Security-Policy-Report-Only"
```
```diff
@@ -3,6 +3,28 @@
 module SiteSettings; end
 
 module SiteSettings::Validations
+  PROHIBITED_USER_AGENT_STRINGS = %w[
+    apple
+    windows
+    linux
+    ubuntu
+    gecko
+    firefox
+    chrome
+    safari
+    applewebkit
+    webkit
+    mozilla
+    macintosh
+    khtml
+    intel
+    osx
+    os\ x
+    iphone
+    ipad
+    mac
+  ]
+
   def validate_error(key, opts = {})
     raise Discourse::InvalidParameters.new(I18n.t("errors.site_settings.#{key}", opts))
   end
@@ -201,6 +223,22 @@ module SiteSettings::Validations
     validate_error :cors_origins_should_not_have_trailing_slash
   end
 
+  def validate_slow_down_crawler_user_agents(new_val)
+    return if new_val.blank?
+
+    new_val.downcase.split("|").each do |crawler|
+      if crawler.size < 3
+        validate_error(:slow_down_crawler_user_agent_must_be_at_least_3_characters)
+      end
+      if PROHIBITED_USER_AGENT_STRINGS.any? { |c| c.include?(crawler) }
+        validate_error(
+          :slow_down_crawler_user_agent_cannot_be_popular_browsers,
+          values: PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+        )
+      end
+    end
+  end
+
   private
 
   def validate_bucket_setting(setting_name, upload_bucket, backup_bucket)
```
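One detail worth calling out in `validate_slow_down_crawler_user_agents`: a value is rejected when any prohibited browser token *contains* it, not the other way around. A small illustration using the values exercised by the specs below:

```ruby
# "html" is rejected because it is a substring of the prohibited token "khtml",
# while a value like "badcrawler" is not contained in any prohibited token.
prohibited = SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS

prohibited.any? { |token| token.include?("html") }       # => true  -> validation error
prohibited.any? { |token| token.include?("badcrawler") } # => false -> allowed
```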
```diff
@@ -308,4 +308,65 @@ describe SiteSettings::Validations do
       end
     end
   end
+
+  context "slow_down_crawler_user_agents" do
+    let(:too_short_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_must_be_at_least_3_characters"
+      )
+    end
+    let(:popular_browser_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_cannot_be_popular_browsers",
+        values: SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+      )
+    end
+
+    it "cannot contain a user agent that's shorter than 3 characters" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("ao|acsw")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("up")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("a|")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("|a")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+    end
+
+    it "allows user agents that are 3 characters or longer" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("aoc")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("anuq")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("pupsc|kcx")
+      }.not_to raise_error
+    end
+
+    it "allows the setting to be empty" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("")
+      }.not_to raise_error
+    end
+
+    it "cannot contain a token of a popular browser user agent" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("mOzilla")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("chRome|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("html|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+    end
+  end
 end
```
```diff
@@ -941,4 +941,62 @@ RSpec.describe ApplicationController do
       ).to eq("user_api_key_limiter_1_day")
     end
   end
+
+  describe "crawlers in slow_down_crawler_user_agents site setting" do
+    before do
+      RateLimiter.enable
+      RateLimiter.clear_all!
+    end
+
+    it "are rate limited" do
+      SiteSetting.slow_down_crawler_rate = 128
+      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
+      now = Time.zone.now
+      freeze_time now
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      freeze_time now + 100.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      freeze_time now + 150.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+    end
+  end
 end
```
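The `Retry-After` values asserted above follow from simple window arithmetic: with `slow_down_crawler_rate` set to 128, a request rejected 100 seconds into the window still has to wait out the remainder, and by 150 seconds the window has expired and requests are allowed again.

```ruby
# Remaining wait when a request is rejected 100 seconds into a 128-second window.
128 - 100 # => 28, the value the spec expects in the Retry-After header
```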
```diff
@@ -56,17 +56,6 @@ RSpec.describe RobotsTxtController do
     end
   end
 
-  context 'crawl delay' do
-    it 'allows you to set crawl delay on particular bots' do
-      SiteSetting.allow_index_in_robots_txt = true
-      SiteSetting.slow_down_crawler_rate = 17
-      SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
-      get '/robots.txt'
-      expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
-      expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
-    end
-  end
-
   context 'allow_index_in_robots_txt is true' do
 
     def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
```