FEATURE: allow setting a crawl delay per user agent

Bing is also moved to a default crawl delay, so no more than one request every 5 seconds is allowed.

New site settings:
"slow_down_crawler_user_agents" - list of crawlers that will be slowed down
"slow_down_crawler_rate" - how many seconds to wait between requests

Not enforced server side yet.
commit 3a7b696703 (parent 17f9c5494d)
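For context, with the shipped defaults ("bingbot" slowed to one request every 5 seconds) the generated robots.txt gains a section along these lines. This is an illustrative sketch of the rendered output, not part of the diff:

    User-agent: bingbot
    Crawl-delay: 5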
@@ -5,6 +5,12 @@ class RobotsTxtController < ApplicationController
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
+
+      @crawler_delayed_agents = []
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
+      end
+
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
         @disallowed_user_agents = ['*']
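A minimal sketch of what the controller change does with the pipe-delimited setting, using made-up example values (the identifiers mirror the diff above):

    # With slow_down_crawler_user_agents = 'bingbot|googlebot'
    # and slow_down_crawler_rate = 5, the loop builds:
    crawler_delayed_agents = 'bingbot|googlebot'.split('|').map do |agent|
      [agent, 5]
    end
    # => [["bingbot", 5], ["googlebot", 5]]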
@@ -39,3 +39,8 @@ Disallow: /
 <% end %>

 <%= server_plugin_outlet "robots_txt_index" %>
+
+<% @crawler_delayed_agents.each do |agent, delay| %>
+User-agent: <%= agent %>
+Crawl-delay: <%= delay %>
+<% end %>
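The template change renders one User-agent/Crawl-delay block per [agent, delay] pair. Assuming the same hypothetical two-agent configuration, the appended output would look roughly like this (exact blank-line spacing depends on ERB trimming):

    User-agent: bingbot
    Crawl-delay: 5
    User-agent: googlebot
    Crawl-delay: 5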
@@ -1125,6 +1125,8 @@ en:
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
     whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
     blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
+    slow_down_crawler_user_agents: 'User agents of web crawlers that should be rate limited in robots.txt using the Crawl-delay directive, respected by Yandex, Bing, and Yahoo'
+    slow_down_crawler_rate: 'If slow_down_crawler_user_agents is specified, this rate will apply to all of those crawlers (number of seconds delay between requests)'
     top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
     post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
     post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."
@@ -1022,8 +1022,12 @@ security:
     type: list
     default: ''
   blacklisted_crawler_user_agents:
     type: list
     default: ''
+  slow_down_crawler_user_agents:
+    type: list
+    default: 'bingbot'
+  slow_down_crawler_rate: 5

 onebox:
   enable_flash_video_onebox: false
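Since both knobs are ordinary site settings, they can be overridden like any other setting, for example from the Rails console. A hypothetical usage sketch (the agent list and rate here are made-up values):

    SiteSetting.slow_down_crawler_user_agents = 'bingbot|YandexBot'
    SiteSetting.slow_down_crawler_rate = 10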
@@ -2,6 +2,18 @@ require 'rails_helper'

 RSpec.describe RobotsTxtController do
   describe '#index' do
+
+    context 'crawl delay' do
+      it 'allows you to set crawl delay on particular bots' do
+        SiteSetting.allow_index_in_robots_txt = true
+        SiteSetting.slow_down_crawler_rate = 17
+        SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'
+        get '/robots.txt'
+        expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
+        expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
+      end
+    end
+
     context 'allow_index_in_robots_txt is true' do

     def expect_allowed_and_disallowed_sections(allow_index, disallow_index)