diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 8d1494136ea..1351d760378 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -29,6 +29,7 @@ class ApplicationController < ActionController::Base
     end
   end
 
+  before_action :rate_limit_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -958,4 +959,27 @@ class ApplicationController < ActionController::Base
     ids = Theme.transform_ids(id)
     Theme.where(id: ids).pluck(:id, :name).to_h.to_json
   end
+
+  def rate_limit_crawlers
+    return if current_user.present?
+    return if SiteSetting.slow_down_crawler_user_agents.blank?
+
+    user_agent = request.user_agent&.downcase
+    return if user_agent.blank?
+
+    SiteSetting.slow_down_crawler_user_agents.downcase.split("|").each do |crawler|
+      if user_agent.include?(crawler)
+        key = "#{crawler}_crawler_rate_limit"
+        limiter = RateLimiter.new(
+          nil,
+          key,
+          1,
+          SiteSetting.slow_down_crawler_rate,
+          error_code: key
+        )
+        limiter.performed!
+        break
+      end
+    end
+  end
 end
diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index ad9b8e5e8c2..4bc2c449d34 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -84,16 +84,6 @@ class RobotsTxtController < ApplicationController
       result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
-    if SiteSetting.slow_down_crawler_user_agents.present?
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << {
-          name: agent,
-          delay: SiteSetting.slow_down_crawler_rate,
-          disallow: deny_paths
-        }
-      end
-    end
-
     result
   end
 end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index f1a8db2c48d..cb007ff69cd 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -5,9 +5,6 @@
 #
 <% @robots_info[:agents].each do |agent| %>
 User-agent: <%= agent[:name] %>
-<%- if agent[:delay] -%>
-Crawl-delay: <%= agent[:delay] %>
-<%- end -%>
 <% agent[:disallow].each do |path| %>
 Disallow: <%= path %>
 <% end %>
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 6f8f2f95771..14301fe56a7 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -216,6 +216,8 @@ en:
       local_login_cannot_be_disabled_if_second_factor_enforced: "You cannot disable local login if 2FA is enforced. Disable enforced 2FA before disabling local logins."
       cannot_enable_s3_uploads_when_s3_enabled_globally: "You cannot enable S3 uploads because S3 uploads are already globally enabled, and enabling this site-level could cause critical issues with uploads"
       cors_origins_should_not_have_trailing_slash: "You should not add the trailing slash (/) to CORS origins."
+      slow_down_crawler_user_agent_must_be_at_least_3_characters: "User agents must be at least 3 characters long to avoid incorrectly rate limiting human users."
+      slow_down_crawler_user_agent_cannot_be_popular_browsers: "You cannot add any of the following values to the setting: %{values}."
       conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to https://meta.discourse.org/t/76575'
     onebox:
       invalid_address: "Sorry, we were unable to generate a preview for this web page, because the server '%{hostname}' could not be found. Instead of a preview, only a link will appear in your post. :cry:"
@@ -1611,7 +1613,7 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     allowed_crawler_user_agents: "User agents of web crawlers that should be allowed to access the site. WARNING! SETTING THIS WILL DISALLOW ALL CRAWLERS NOT LISTED HERE!"
     blocked_crawler_user_agents: "Unique case insensitive word in the user agent string identifying web crawlers that should not be allowed to access the site. Does not apply if allowlist is defined."
-    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive"
+    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited as configured in the \"slow down crawler rate\" setting. Each value must be at least 3 characters long."
     slow_down_crawler_rate: "If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)"
     content_security_policy: "Enable Content-Security-Policy"
     content_security_policy_report_only: "Enable Content-Security-Policy-Report-Only"
diff --git a/lib/site_settings/validations.rb b/lib/site_settings/validations.rb
index ebfcbfcfe14..2c574bf251d 100644
--- a/lib/site_settings/validations.rb
+++ b/lib/site_settings/validations.rb
@@ -3,6 +3,28 @@
 module SiteSettings; end
 
 module SiteSettings::Validations
+  PROHIBITED_USER_AGENT_STRINGS = %w[
+    apple
+    windows
+    linux
+    ubuntu
+    gecko
+    firefox
+    chrome
+    safari
+    applewebkit
+    webkit
+    mozilla
+    macintosh
+    khtml
+    intel
+    osx
+    os\ x
+    iphone
+    ipad
+    mac
+  ]
+
   def validate_error(key, opts = {})
     raise Discourse::InvalidParameters.new(I18n.t("errors.site_settings.#{key}", opts))
   end
@@ -201,6 +223,22 @@ module SiteSettings::Validations
     validate_error :cors_origins_should_not_have_trailing_slash
   end
 
+  def validate_slow_down_crawler_user_agents(new_val)
+    return if new_val.blank?
+
+    new_val.downcase.split("|").each do |crawler|
+      if crawler.size < 3
+        validate_error(:slow_down_crawler_user_agent_must_be_at_least_3_characters)
+      end
+      if PROHIBITED_USER_AGENT_STRINGS.any? { |c| c.include?(crawler) }
+        validate_error(
+          :slow_down_crawler_user_agent_cannot_be_popular_browsers,
+          values: PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+        )
+      end
+    end
+  end
+
   private
 
   def validate_bucket_setting(setting_name, upload_bucket, backup_bucket)
diff --git a/spec/lib/site_settings/validations_spec.rb b/spec/lib/site_settings/validations_spec.rb
index 747099f39aa..6b6999cf349 100644
--- a/spec/lib/site_settings/validations_spec.rb
+++ b/spec/lib/site_settings/validations_spec.rb
@@ -308,4 +308,65 @@ describe SiteSettings::Validations do
       end
     end
   end
+
+  context "slow_down_crawler_user_agents" do
+    let(:too_short_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_must_be_at_least_3_characters"
+      )
+    end
+    let(:popular_browser_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_cannot_be_popular_browsers",
+        values: SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+      )
+    end
+
+    it "cannot contain a user agent that's shorter than 3 characters" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("ao|acsw")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("up")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("a|")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("|a")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+    end
+
+    it "allows user agents that are 3 characters or longer" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("aoc")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("anuq")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("pupsc|kcx")
+      }.not_to raise_error
+    end
+
+    it "allows the setting to be empty" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("")
+      }.not_to raise_error
+    end
+
+    it "cannot contain a token of a popular browser user agent" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("mOzilla")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("chRome|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("html|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+    end
+  end
 end
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 34d9ecb07fc..112e637271d 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -941,4 +941,62 @@ RSpec.describe ApplicationController do
       ).to eq("user_api_key_limiter_1_day")
     end
   end
+
+  describe "crawlers in slow_down_crawler_user_agents site setting" do
+    before do
+      RateLimiter.enable
+      RateLimiter.clear_all!
+    end
+
+    it "are rate limited" do
+      SiteSetting.slow_down_crawler_rate = 128
+      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
+      now = Time.zone.now
+      freeze_time now
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      freeze_time now + 100.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      freeze_time now + 150.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+    end
+  end
 end
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index b20627c0d8b..0672612c624 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -56,17 +56,6 @@ RSpec.describe RobotsTxtController do
     end
   end
 
-  context 'crawl delay' do
-    it 'allows you to set crawl delay on particular bots' do
-      SiteSetting.allow_index_in_robots_txt = true
-      SiteSetting.slow_down_crawler_rate = 17
-      SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
-      get '/robots.txt'
-      expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
-      expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
-    end
-  end
-
   context 'allow_index_in_robots_txt is true' do
     def expect_allowed_and_disallowed_sections(allow_index, disallow_index)