FEATURE: allow for setting crawl delay per user agent

Also, Bing is now crawl delayed by default, so no more than one request every 5 seconds is allowed.

New site settings:

"slow_down_crawler_user_agents" - list of crawlers that will be slowed down
"slow_down_crawler_rate" - how many seconds to wait between requests

Not enforced server side yet; the delay is only advertised to crawlers via the Crawl-delay directive in robots.txt.
Sam 2018-04-06 10:15:23 +10:00
parent 17f9c5494d
commit 3a7b696703
5 changed files with 29 additions and 0 deletions
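
For illustration, a minimal sketch of how the two new settings are meant to be used together, e.g. from a Rails console. The setting names come from this commit; the agent names and the rate value here are only example inputs:

    # Example only: throttle Bing and Yandex to one request every 10 seconds.
    # robots.txt will then advertise a Crawl-delay of 10 for each agent.
    SiteSetting.slow_down_crawler_user_agents = 'bingbot|YandexBot'
    SiteSetting.slow_down_crawler_rate = 10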


@@ -5,6 +5,12 @@ class RobotsTxtController < ApplicationController
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
+
+      @crawler_delayed_agents = []
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
+      end
+
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
         @disallowed_user_agents = ['*']
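
The controller change above just turns the pipe-delimited site setting into an array of [agent, delay] pairs for the robots.txt template (next hunk) to iterate over. A rough sketch of the transformation, using the same sample values as the spec further down:

    # assuming slow_down_crawler_user_agents = 'bingbot|googlebot'
    # and slow_down_crawler_rate = 17
    'bingbot|googlebot'.split('|').map { |agent| [agent, 17] }
    # => [["bingbot", 17], ["googlebot", 17]]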


@@ -39,3 +39,8 @@ Disallow: /
 <% end %>
 <%= server_plugin_outlet "robots_txt_index" %>
+
+<% @crawler_delayed_agents.each do |agent, delay| %>
+User-agent: <%= agent %>
+Crawl-delay: <%= delay %>
+<% end %>
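
With the defaults registered further down ('bingbot' with a rate of 5), the loop above should append a section along these lines to the generated robots.txt (illustrative output, not captured from a running site):

    User-agent: bingbot
    Crawl-delay: 5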


@@ -1125,6 +1125,8 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
     blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
+    slow_down_crawler_user_agents: 'User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive, respected by Yandex, Bing and Yahoo'
+    slow_down_crawler_rate: 'If slow_down_crawler_user_agents is specified, this rate will apply to all the crawlers (number of seconds delay between requests)'
     top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
     post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
     post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."


@@ -1022,8 +1022,12 @@ security:
     type: list
     default: ''
   blacklisted_crawler_user_agents:
     type: list
     default: ''
+  slow_down_crawler_user_agents:
+    type: list
+    default: 'bingbot'
+  slow_down_crawler_rate: 5
 onebox:
   enable_flash_video_onebox: false
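
These YAML entries only register the defaults; at runtime they surface through SiteSetting, roughly as sketched below (illustrative; exact type coercion depends on the site settings machinery):

    SiteSetting.slow_down_crawler_user_agents # => "bingbot" (pipe-delimited list setting)
    SiteSetting.slow_down_crawler_rate        # => 5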


@@ -2,6 +2,18 @@ require 'rails_helper'
 RSpec.describe RobotsTxtController do
   describe '#index' do
+    context 'crawl delay' do
+      it 'allows you to set crawl delay on particular bots' do
+        SiteSetting.allow_index_in_robots_txt = true
+        SiteSetting.slow_down_crawler_rate = 17
+        SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
+        get '/robots.txt'
+        expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
+        expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
+      end
+    end
+
     context 'allow_index_in_robots_txt is true' do
       def expect_allowed_and_disallowed_sections(allow_index, disallow_index)