diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb index 2315120935f..fe838b885b4 100644 --- a/app/controllers/robots_txt_controller.rb +++ b/app/controllers/robots_txt_controller.rb @@ -5,6 +5,12 @@ class RobotsTxtController < ApplicationController def index if SiteSetting.allow_index_in_robots_txt path = :index + @crawler_delayed_agents = [] + + SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent| + @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate] + end + if SiteSetting.whitelisted_crawler_user_agents.present? @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|') @disallowed_user_agents = ['*'] diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb index 5ec74e08a89..fbc77e88bd7 100644 --- a/app/views/robots_txt/index.erb +++ b/app/views/robots_txt/index.erb @@ -39,3 +39,8 @@ Disallow: / <% end %> <%= server_plugin_outlet "robots_txt_index" %> + +<% @crawler_delayed_agents.each do |agent, delay| %> +User-agent: <%= agent %> +Crawl-delay: <%= delay %> +<% end %> diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index e0ceef9c830..ab60927d016 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1125,6 +1125,8 @@ en: allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts" whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.' blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.' 
+ slow_down_crawler_user_agents: 'User agents of web crawlers that should be rate limited in robots.txt using the crawl-delay directive, respected by Yandex, Bing and Yahoo' + slow_down_crawler_rate: 'If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)' top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks" post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply" post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on." diff --git a/config/site_settings.yml b/config/site_settings.yml index a2bc8e0df60..2328621fff4 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1022,8 +1022,12 @@ security: type: list default: '' blacklisted_crawler_user_agents: + type: list + default: '' + slow_down_crawler_user_agents: type: list default: 'bingbot' + slow_down_crawler_rate: 5 onebox: enable_flash_video_onebox: false diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index be3590e8d8f..fa5c62e3205 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -2,6 +2,18 @@ require 'rails_helper' RSpec.describe RobotsTxtController do describe '#index' do + + context 'crawl delay' do + it 'allows you to set crawl delay on particular bots' do + SiteSetting.allow_index_in_robots_txt = true + SiteSetting.slow_down_crawler_rate = 17 + SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot' + get '/robots.txt' + expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17") + expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17") + end + end + context 'allow_index_in_robots_txt is true' do def
expect_allowed_and_disallowed_sections(allow_index, disallow_index)