FEATURE: allow for setting crawl delay per user agent
Also gave bingbot a default crawl delay, so no more than one request every 5 seconds is allowed.

New site settings:
"slow_down_crawler_user_agents" - list of crawlers that will be slowed down
"slow_down_crawler_rate" - how many seconds to wait between requests

Not enforced server side yet.
parent 17f9c5494d
commit 3a7b696703
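With the shipped defaults (slow_down_crawler_user_agents = 'bingbot', slow_down_crawler_rate = 5), the generated robots.txt should gain a block along these lines (an output sketch inferred from the diff below, not copied from the commit):

    User-agent: bingbot
    Crawl-delay: 5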
@@ -5,6 +5,12 @@ class RobotsTxtController < ApplicationController
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
+      @crawler_delayed_agents = []
+
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
+      end
+
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
         @disallowed_user_agents = ['*']
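Aside: a functionally equivalent and slightly more idiomatic form of the new controller code would build the list with map rather than each plus <<; a sketch, not part of the commit:

    @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents
      .split('|')
      .map { |agent| [agent, SiteSetting.slow_down_crawler_rate] }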
@@ -39,3 +39,8 @@ Disallow: /
 <% end %>
 
 <%= server_plugin_outlet "robots_txt_index" %>
+
+<% @crawler_delayed_agents.each do |agent, delay| %>
+User-agent: <%= agent %>
+Crawl-delay: <%= delay %>
+<% end %>
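With, say, slow_down_crawler_user_agents = 'bingbot|googlebot' and slow_down_crawler_rate = 17 (the values exercised in the spec further down), this template would render roughly the following, modulo ERB whitespace:

    User-agent: bingbot
    Crawl-delay: 17

    User-agent: googlebot
    Crawl-delay: 17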
@@ -1125,6 +1125,8 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
     blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
+    slow_down_crawler_user_agents: 'User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive, respected by Yandex, Bing and Yahoo'
+    slow_down_crawler_rate: 'If slow_down_crawler_user_agents is specified, this rate will apply to all of those crawlers (number of seconds delay between requests)'
     top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
     post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
     post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."
@@ -1022,8 +1022,12 @@ security:
     type: list
     default: ''
   blacklisted_crawler_user_agents:
     type: list
     default: ''
+  slow_down_crawler_user_agents:
+    type: list
+    default: 'bingbot'
+  slow_down_crawler_rate: 5
 
 onebox:
   enable_flash_video_onebox: false
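As a usage sketch (assuming a Rails console on a Discourse instance; the agent list and rate here are illustrative, not defaults from the commit), an admin could widen the slowdown list beyond bingbot:

    SiteSetting.slow_down_crawler_user_agents = 'bingbot|YandexBot'
    SiteSetting.slow_down_crawler_rate = 10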
@@ -2,6 +2,18 @@ require 'rails_helper'
 
 RSpec.describe RobotsTxtController do
   describe '#index' do
+
+    context 'crawl delay' do
+      it 'allows you to set crawl delay on particular bots' do
+        SiteSetting.allow_index_in_robots_txt = true
+        SiteSetting.slow_down_crawler_rate = 17
+        SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
+        get '/robots.txt'
+        expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
+        expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
+      end
+    end
+
     context 'allow_index_in_robots_txt is true' do
 
       def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
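A further spec could pin down the shipped defaults (bingbot at 5 seconds); a hypothetical sketch in the same style as the commit's test, not part of the diff:

    it 'applies the default crawl delay to bingbot' do
      SiteSetting.allow_index_in_robots_txt = true
      get '/robots.txt'
      expect(response.body).to include("User-agent: bingbot\nCrawl-delay: 5")
    end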