diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 8d1494136ea..1351d760378 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -29,6 +29,7 @@ class ApplicationController < ActionController::Base
    end
  end

+  before_action :rate_limit_crawlers
  before_action :check_readonly_mode
  before_action :handle_theme
  before_action :set_current_user_for_logs
@@ -958,4 +959,27 @@ class ApplicationController < ActionController::Base
    ids = Theme.transform_ids(id)
    Theme.where(id: ids).pluck(:id, :name).to_h.to_json
  end
+
+  def rate_limit_crawlers
+    return if current_user.present?
+    return if SiteSetting.slow_down_crawler_user_agents.blank?
+
+    user_agent = request.user_agent&.downcase
+    return if user_agent.blank?
+
+    SiteSetting.slow_down_crawler_user_agents.downcase.split("|").each do |crawler|
+      if user_agent.include?(crawler)
+        key = "#{crawler}_crawler_rate_limit"
+        limiter = RateLimiter.new(
+          nil,
+          key,
+          1,
+          SiteSetting.slow_down_crawler_rate,
+          error_code: key
+        )
+        limiter.performed!
+        break
+      end
+    end
+  end
end
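
In effect, each configured crawler token gets its own one-request-per-window bucket keyed by `"#{crawler}_crawler_rate_limit"`, and `RateLimiter#performed!` raises when the bucket is full, which surfaces as a 429 response with a `Retry-After` header (see the request specs below). A minimal, self-contained sketch of that observable behaviour — Discourse's real `RateLimiter` is Redis-backed, so this in-memory class is an approximation, not the actual implementation:

```ruby
# In-memory sketch of the per-crawler limiter behaviour (assumption: only the
# observable 200/429 + Retry-After behaviour is modeled, not the Redis backend).
# Each crawler token gets its own bucket allowing 1 request per `seconds` window.
class CrawlerLimiterSketch
  def initialize(seconds)
    @seconds = seconds
    @last_hit = {}
  end

  # Returns nil when the request is allowed, otherwise the number of
  # seconds the crawler should wait (what Retry-After would report).
  def hit(crawler, now = Time.now)
    last = @last_hit[crawler]
    return (@seconds - (now - last)).ceil if last && now - last < @seconds
    @last_hit[crawler] = now
    nil
  end
end

limiter = CrawlerLimiterSketch.new(128)
limiter.hit("badcrawler") # => nil (allowed, responds 200)
limiter.hit("badcrawler") # => 128 (limited, responds 429 with Retry-After: 128)
```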
diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index ad9b8e5e8c2..4bc2c449d34 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -84,16 +84,6 @@ class RobotsTxtController < ApplicationController
      result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
    end

-    if SiteSetting.slow_down_crawler_user_agents.present?
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << {
-          name: agent,
-          delay: SiteSetting.slow_down_crawler_rate,
-          disallow: deny_paths
-        }
-      end
-    end
-
    result
  end
end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index f1a8db2c48d..cb007ff69cd 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -5,9 +5,6 @@
#
<% @robots_info[:agents].each do |agent| %>
User-agent: <%= agent[:name] %>
-<%- if agent[:delay] -%>
-Crawl-delay: <%= agent[:delay] %>
-<%- end -%>
<% agent[:disallow].each do |path| %>
Disallow: <%= path %>
<% end %>
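
Together with the controller change above, this drops the non-standard `Crawl-delay` directive (ignored by some major crawlers) from the generated robots.txt; matched crawlers are now throttled server-side instead. A rough illustration of what the trimmed template emits per agent — the agent name and path here are made-up placeholders:

```ruby
# Illustrative only: how the trimmed template renders one agent entry.
agent = { name: "badcrawler", disallow: ["/auth/"] }
puts "User-agent: #{agent[:name]}"
# The removed branch used to emit "Crawl-delay: #{agent[:delay]}" at this point.
agent[:disallow].each { |path| puts "Disallow: #{path}" }
# Output:
#   User-agent: badcrawler
#   Disallow: /auth/
```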
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 6f8f2f95771..14301fe56a7 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -216,6 +216,8 @@ en:
      local_login_cannot_be_disabled_if_second_factor_enforced: "You cannot disable local login if 2FA is enforced. Disable enforced 2FA before disabling local logins."
      cannot_enable_s3_uploads_when_s3_enabled_globally: "You cannot enable S3 uploads because S3 uploads are already globally enabled, and enabling this site-level could cause critical issues with uploads"
      cors_origins_should_not_have_trailing_slash: "You should not add the trailing slash (/) to CORS origins."
+      slow_down_crawler_user_agent_must_be_at_least_3_characters: "User agents must be at least 3 characters long to avoid incorrectly rate limiting human users."
+      slow_down_crawler_user_agent_cannot_be_popular_browsers: "You cannot add any of the following values to the setting: %{values}."
      conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to https://meta.discourse.org/t/76575'
    onebox:
      invalid_address: "Sorry, we were unable to generate a preview for this web page, because the server '%{hostname}' could not be found. Instead of a preview, only a link will appear in your post. :cry:"
@@ -1611,7 +1613,7 @@ en:
    allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
    allowed_crawler_user_agents: "User agents of web crawlers that should be allowed to access the site. WARNING! SETTING THIS WILL DISALLOW ALL CRAWLERS NOT LISTED HERE!"
    blocked_crawler_user_agents: "Unique case insensitive word in the user agent string identifying web crawlers that should not be allowed to access the site. Does not apply if allowlist is defined."
-    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive"
+    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited as configured in the \"slow down crawler rate\" setting. Each value must be at least 3 characters long."
    slow_down_crawler_rate: "If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)"
    content_security_policy: "Enable Content-Security-Policy"
    content_security_policy_report_only: "Enable Content-Security-Policy-Report-Only"
diff --git a/lib/site_settings/validations.rb b/lib/site_settings/validations.rb
index ebfcbfcfe14..2c574bf251d 100644
--- a/lib/site_settings/validations.rb
+++ b/lib/site_settings/validations.rb
@@ -3,6 +3,28 @@
module SiteSettings; end

module SiteSettings::Validations
+  PROHIBITED_USER_AGENT_STRINGS = %w[
+    apple
+    windows
+    linux
+    ubuntu
+    gecko
+    firefox
+    chrome
+    safari
+    applewebkit
+    webkit
+    mozilla
+    macintosh
+    khtml
+    intel
+    osx
+    os\ x
+    iphone
+    ipad
+    mac
+  ]
+
  def validate_error(key, opts = {})
    raise Discourse::InvalidParameters.new(I18n.t("errors.site_settings.#{key}", opts))
  end
@@ -201,6 +223,22 @@ module SiteSettings::Validations
    validate_error :cors_origins_should_not_have_trailing_slash
  end

+  def validate_slow_down_crawler_user_agents(new_val)
+    return if new_val.blank?
+
+    new_val.downcase.split("|").each do |crawler|
+      if crawler.size < 3
+        validate_error(:slow_down_crawler_user_agent_must_be_at_least_3_characters)
+      end
+      if PROHIBITED_USER_AGENT_STRINGS.any? { |c| c.include?(crawler) }
+        validate_error(
+          :slow_down_crawler_user_agent_cannot_be_popular_browsers,
+          values: PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+        )
+      end
+    end
+  end
+
  private

  def validate_bucket_setting(setting_name, upload_bucket, backup_bucket)
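
Note that the popular-browser check runs `c.include?(crawler)`, i.e. it rejects any value that is a *substring* of a prohibited token, not only exact matches; that is why the specs below expect `"html"` to fail (it is contained in `"khtml"`). A standalone sketch of that logic, with a trimmed token list for brevity:

```ruby
# Standalone sketch of the substring check used by the validator above.
prohibited = %w[mozilla chrome khtml]
%w[html chRome badcrawler].each do |value|
  rejected = prohibited.any? { |token| token.include?(value.downcase) }
  puts "#{value.inspect}: #{rejected ? 'rejected' : 'allowed'}"
end
# => "html": rejected ("khtml" contains "html")
# => "chRome": rejected (case-insensitive match against "chrome")
# => "badcrawler": allowed
```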
diff --git a/spec/lib/site_settings/validations_spec.rb b/spec/lib/site_settings/validations_spec.rb
index 747099f39aa..6b6999cf349 100644
--- a/spec/lib/site_settings/validations_spec.rb
+++ b/spec/lib/site_settings/validations_spec.rb
@@ -308,4 +308,65 @@ describe SiteSettings::Validations do
      end
    end
  end
+
+  context "slow_down_crawler_user_agents" do
+    let(:too_short_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_must_be_at_least_3_characters"
+      )
+    end
+    let(:popular_browser_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_cannot_be_popular_browsers",
+        values: SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+      )
+    end
+
+    it "cannot contain a user agent that's shorter than 3 characters" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("ao|acsw")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("up")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("a|")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("|a")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+    end
+
+    it "allows user agents that are 3 characters or longer" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("aoc")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("anuq")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("pupsc|kcx")
+      }.not_to raise_error
+    end
+
+    it "allows the setting to be empty" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("")
+      }.not_to raise_error
+    end
+
+    it "cannot contain a token of a popular browser user agent" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("mOzilla")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("chRome|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("html|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+    end
+  end
end
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 34d9ecb07fc..112e637271d 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -941,4 +941,62 @@ RSpec.describe ApplicationController do
      ).to eq("user_api_key_limiter_1_day")
    end
  end
+
+  describe "crawlers in slow_down_crawler_user_agents site setting" do
+    before do
+      RateLimiter.enable
+      RateLimiter.clear_all!
+    end
+
+    it "are rate limited" do
+      SiteSetting.slow_down_crawler_rate = 128
+      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
+      now = Time.zone.now
+      freeze_time now
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      freeze_time now + 100.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      freeze_time now + 150.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+    end
+  end
end
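
The `Retry-After` expectations follow from the 128-second window: the bucket is consumed at t = 0, so a retry at t = 100 must wait another 128 - 100 = 28 seconds, and by t = 150 the window has fully elapsed. In sketch form:

```ruby
# Sketch of the Retry-After arithmetic exercised by the spec above.
window = 128 # SiteSetting.slow_down_crawler_rate, in seconds
[0, 100, 150].each do |t|
  remaining = window - t
  if remaining > 0
    puts "retry at t=#{t}s: 429, Retry-After: #{remaining}"
  else
    puts "retry at t=#{t}s: 200 (window elapsed)"
  end
end
```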
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index b20627c0d8b..0672612c624 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -56,17 +56,6 @@ RSpec.describe RobotsTxtController do
    end
  end

-  context 'crawl delay' do
-    it 'allows you to set crawl delay on particular bots' do
-      SiteSetting.allow_index_in_robots_txt = true
-      SiteSetting.slow_down_crawler_rate = 17
-      SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
-      get '/robots.txt'
-      expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
-      expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
-    end
-  end
-
  context 'allow_index_in_robots_txt is true' do
    def expect_allowed_and_disallowed_sections(allow_index, disallow_index)