From 7bd3986b215c033f989e3bc8aa76328b42dcb821 Mon Sep 17 00:00:00 2001
From: Osama Sayegh
Date: Tue, 30 Nov 2021 12:55:25 +0300
Subject: [PATCH] FEATURE: Replace `Crawl-delay` directive with proper rate limiting (#15131)

We have a couple of site settings, `slow_down_crawler_user_agents` and
`slow_down_crawler_rate`, that are meant to allow site owners to signal to
specific crawlers that they're crawling the site too aggressively and that
they should slow down.

When a crawler is added to the `slow_down_crawler_user_agents` setting,
Discourse currently adds a `Crawl-delay` directive for that crawler in
`/robots.txt`. Unfortunately, many crawlers don't support the `Crawl-delay`
directive in `/robots.txt`, which leaves site owners with no options if a
crawler is crawling the site too aggressively.

This PR replaces the `Crawl-delay` directive with proper rate limiting for
crawlers added to the `slow_down_crawler_user_agents` list. On every request
made by a non-logged-in user, Discourse will check the User Agent string, and
if it contains one of the values of the `slow_down_crawler_user_agents` list,
Discourse will only allow 1 request every N seconds for that User Agent
(N is the value of the `slow_down_crawler_rate` setting) and the rest of the
requests made within the same interval will get a 429 response.

The `slow_down_crawler_user_agents` setting becomes quite dangerous with this
PR since it could rate limit much, if not all, of the anonymous traffic if
the setting is not used appropriately. So to protect against this scenario,
we've added a couple of new validations to the setting when it's changed:

1) each value added to the setting must be 3 characters or longer
2) each value cannot be a substring of tokens found in popular browser User
   Agents

The current list of prohibited values is: apple, windows, linux, ubuntu,
gecko, firefox, chrome, safari, applewebkit, webkit, mozilla, macintosh,
khtml, intel, osx, os x, iphone, ipad and mac.
---
 app/controllers/application_controller.rb    | 24 ++++++++
 app/controllers/robots_txt_controller.rb     | 10 ----
 app/views/robots_txt/index.erb               |  3 -
 config/locales/server.en.yml                 |  4 +-
 lib/site_settings/validations.rb             | 38 ++++++++++++
 spec/lib/site_settings/validations_spec.rb   | 61 ++++++++++++++++++++
 spec/requests/application_controller_spec.rb | 58 +++++++++++++++++++
 spec/requests/robots_txt_controller_spec.rb  | 11 ----
 8 files changed, 184 insertions(+), 25 deletions(-)

diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 8d1494136ea..1351d760378 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -29,6 +29,7 @@ class ApplicationController < ActionController::Base
     end
   end
 
+  before_action :rate_limit_crawlers
   before_action :check_readonly_mode
   before_action :handle_theme
   before_action :set_current_user_for_logs
@@ -958,4 +959,27 @@ class ApplicationController < ActionController::Base
     ids = Theme.transform_ids(id)
     Theme.where(id: ids).pluck(:id, :name).to_h.to_json
   end
+
+  def rate_limit_crawlers
+    return if current_user.present?
+    return if SiteSetting.slow_down_crawler_user_agents.blank?
+
+    user_agent = request.user_agent&.downcase
+    return if user_agent.blank?
+
+    SiteSetting.slow_down_crawler_user_agents.downcase.split("|").each do |crawler|
+      if user_agent.include?(crawler)
+        key = "#{crawler}_crawler_rate_limit"
+        limiter = RateLimiter.new(
+          nil,
+          key,
+          1,
+          SiteSetting.slow_down_crawler_rate,
+          error_code: key
+        )
+        limiter.performed!
+        break
+      end
+    end
+  end
 end
diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index ad9b8e5e8c2..4bc2c449d34 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -84,16 +84,6 @@ class RobotsTxtController < ApplicationController
       result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
-    if SiteSetting.slow_down_crawler_user_agents.present?
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << {
-          name: agent,
-          delay: SiteSetting.slow_down_crawler_rate,
-          disallow: deny_paths
-        }
-      end
-    end
-
     result
   end
 end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index f1a8db2c48d..cb007ff69cd 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -5,9 +5,6 @@
 #
 <% @robots_info[:agents].each do |agent| %>
 User-agent: <%= agent[:name] %>
-<%- if agent[:delay] -%>
-Crawl-delay: <%= agent[:delay] %>
-<%- end -%>
 <% agent[:disallow].each do |path| %>
 Disallow: <%= path %>
 <% end %>
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 6f8f2f95771..14301fe56a7 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -216,6 +216,8 @@ en:
       local_login_cannot_be_disabled_if_second_factor_enforced: "You cannot disable local login if 2FA is enforced. Disable enforced 2FA before disabling local logins."
       cannot_enable_s3_uploads_when_s3_enabled_globally: "You cannot enable S3 uploads because S3 uploads are already globally enabled, and enabling this site-level could cause critical issues with uploads"
       cors_origins_should_not_have_trailing_slash: "You should not add the trailing slash (/) to CORS origins."
+      slow_down_crawler_user_agent_must_be_at_least_3_characters: "User agents must be at least 3 characters long to avoid incorrectly rate limiting human users."
+      slow_down_crawler_user_agent_cannot_be_popular_browsers: "You cannot add any of the following values to the setting: %{values}."
     conflicting_google_user_id: 'The Google Account ID for this account has changed; staff intervention is required for security reasons. Please contact staff and point them to https://meta.discourse.org/t/76575'
     onebox:
       invalid_address: "Sorry, we were unable to generate a preview for this web page, because the server '%{hostname}' could not be found. Instead of a preview, only a link will appear in your post. :cry:"
@@ -1611,7 +1613,7 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     allowed_crawler_user_agents: "User agents of web crawlers that should be allowed to access the site. WARNING! SETTING THIS WILL DISALLOW ALL CRAWLERS NOT LISTED HERE!"
     blocked_crawler_user_agents: "Unique case insensitive word in the user agent string identifying web crawlers that should not be allowed to access the site. Does not apply if allowlist is defined."
-    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive"
+    slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited as configured in the \"slow down crawler rate\" setting. Each value must be at least 3 characters long."
     slow_down_crawler_rate: "If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)"
     content_security_policy: "Enable Content-Security-Policy"
     content_security_policy_report_only: "Enable Content-Security-Policy-Report-Only"
diff --git a/lib/site_settings/validations.rb b/lib/site_settings/validations.rb
index ebfcbfcfe14..2c574bf251d 100644
--- a/lib/site_settings/validations.rb
+++ b/lib/site_settings/validations.rb
@@ -3,6 +3,28 @@ module SiteSettings; end
 
 module SiteSettings::Validations
+  PROHIBITED_USER_AGENT_STRINGS = %w[
+    apple
+    windows
+    linux
+    ubuntu
+    gecko
+    firefox
+    chrome
+    safari
+    applewebkit
+    webkit
+    mozilla
+    macintosh
+    khtml
+    intel
+    osx
+    os\ x
+    iphone
+    ipad
+    mac
+  ]
+
   def validate_error(key, opts = {})
     raise Discourse::InvalidParameters.new(I18n.t("errors.site_settings.#{key}", opts))
   end
@@ -201,6 +223,22 @@ module SiteSettings::Validations
     validate_error :cors_origins_should_not_have_trailing_slash
   end
 
+  def validate_slow_down_crawler_user_agents(new_val)
+    return if new_val.blank?
+
+    new_val.downcase.split("|").each do |crawler|
+      if crawler.size < 3
+        validate_error(:slow_down_crawler_user_agent_must_be_at_least_3_characters)
+      end
+      if PROHIBITED_USER_AGENT_STRINGS.any? { |c| c.include?(crawler) }
+        validate_error(
+          :slow_down_crawler_user_agent_cannot_be_popular_browsers,
+          values: PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+        )
+      end
+    end
+  end
+
   private
 
   def validate_bucket_setting(setting_name, upload_bucket, backup_bucket)
diff --git a/spec/lib/site_settings/validations_spec.rb b/spec/lib/site_settings/validations_spec.rb
index 747099f39aa..6b6999cf349 100644
--- a/spec/lib/site_settings/validations_spec.rb
+++ b/spec/lib/site_settings/validations_spec.rb
@@ -308,4 +308,65 @@ describe SiteSettings::Validations do
       end
     end
   end
+
+  context "slow_down_crawler_user_agents" do
+    let(:too_short_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_must_be_at_least_3_characters"
+      )
+    end
+    let(:popular_browser_message) do
+      I18n.t(
+        "errors.site_settings.slow_down_crawler_user_agent_cannot_be_popular_browsers",
+        values: SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS.join(I18n.t("word_connector.comma"))
+      )
+    end
+
+    it "cannot contain a user agent that's shorter than 3 characters" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("ao|acsw")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("up")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("a|")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+      expect {
+        subject.validate_slow_down_crawler_user_agents("|a")
+      }.to raise_error(Discourse::InvalidParameters, too_short_message)
+    end
+
+    it "allows user agents that are 3 characters or longer" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("aoc")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("anuq")
+      }.not_to raise_error
+      expect {
+        subject.validate_slow_down_crawler_user_agents("pupsc|kcx")
+      }.not_to raise_error
+    end
+
+    it "allows the setting to be empty" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("")
+      }.not_to raise_error
+    end
+
+    it "cannot contain a token of a popular browser user agent" do
+      expect {
+        subject.validate_slow_down_crawler_user_agents("mOzilla")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("chRome|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+
+      expect {
+        subject.validate_slow_down_crawler_user_agents("html|badcrawler")
+      }.to raise_error(Discourse::InvalidParameters, popular_browser_message)
+    end
+  end
 end
diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb
index 34d9ecb07fc..112e637271d 100644
--- a/spec/requests/application_controller_spec.rb
+++ b/spec/requests/application_controller_spec.rb
@@ -941,4 +941,62 @@ RSpec.describe ApplicationController do
       ).to eq("user_api_key_limiter_1_day")
     end
   end
+
+  describe "crawlers in slow_down_crawler_user_agents site setting" do
+    before do
+      RateLimiter.enable
+      RateLimiter.clear_all!
+    end
+
+    it "are rate limited" do
+      SiteSetting.slow_down_crawler_rate = 128
+      SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
+      now = Time.zone.now
+      freeze_time now
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("128")
+
+      freeze_time now + 100.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(429)
+      expect(response.headers["Retry-After"]).to eq("28")
+
+      freeze_time now + 150.seconds
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam badcrawler"
+      }
+      expect(response.status).to eq(200)
+
+      get "/", headers: {
+        "HTTP_USER_AGENT" => "iam problematiccrawler"
+      }
+      expect(response.status).to eq(200)
+    end
+  end
 end
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index b20627c0d8b..0672612c624 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -56,17 +56,6 @@ RSpec.describe RobotsTxtController do
     end
   end
 
-  context 'crawl delay' do
-    it 'allows you to set crawl delay on particular bots' do
-      SiteSetting.allow_index_in_robots_txt = true
-      SiteSetting.slow_down_crawler_rate = 17
-      SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
-      get '/robots.txt'
-      expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
-      expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
-    end
-  end
-
   context 'allow_index_in_robots_txt is true' do
     def expect_allowed_and_disallowed_sections(allow_index, disallow_index)