FEATURE: explicitly ban outlier traffic sources in robots.txt (#11553)
Googlebot handles noindex headers very elegantly. Google's advice is to leave as many routes as possible open to it and to use headers for high-fidelity rules about what gets indexed.
Discourse adds special `X-Robots-Tag: noindex` headers to the user, badge, group, search and tag routes.
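As a rough illustration of that header-based approach, the pattern in a Rails controller looks like the sketch below; the controller and filter names are placeholders, not Discourse's actual wiring.

# Illustrative sketch only: the general Rails pattern for serving a route
# while telling crawlers not to index it. Names here are placeholders.
class ExampleTagsController < ApplicationController
  before_action :add_noindex_header

  private

  # The page stays crawlable, but the X-Robots-Tag header asks search
  # engines (notably Googlebot) not to index the response.
  def add_noindex_header
    response.headers['X-Robots-Tag'] = 'noindex'
  end
end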
Following up on b52143feff, Googlebot now gets special handling: it keeps the minimal disallow list, while the rest of the crawlers get a far more aggressive disallow list to protect against excessive crawling.
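As a sketch of the net effect under default settings (paths abridged, and this small renderer stands in for the real robots.txt template), the generated file now distinguishes the two kinds of agents:

# Minimal sketch, not the actual template: render a per-agent structure like
# the one fetch_default_robots_info builds. The paths are an abridged sample
# of the DISALLOWED_PATHS and DISALLOWED_WITH_HEADER_PATHS constants below.
agents = [
  { name: '*',         disallow: %w{/admin/ /auth/ /email/ /badges /u /search /tag /g} },
  { name: 'Googlebot', disallow: %w{/admin/ /auth/ /email/} }
]

puts agents.map { |agent|
  (["User-agent: #{agent[:name]}"] + agent[:disallow].map { |p| "Disallow: #{p}" }).join("\n")
}.join("\n\n")

# Prints two stanzas: a long disallow list for "User-agent: *" and a shorter
# one for "User-agent: Googlebot", which keeps the header-covered routes open.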
parent 53f1811757
commit 758e160862
@@ -8,17 +8,27 @@ class RobotsTxtController < ApplicationController
   # NOTE: order is important!
   DISALLOWED_PATHS ||= %w{
     /admin/
     /auth/
     /assets/browser-update*.js
     /email/
     /session
     /session/
     /user-api-key
     /user-api-key/
     /*?api_key*
     /*?*api_key*
   }
 
+  DISALLOWED_WITH_HEADER_PATHS ||= %w{
+    /badges
+    /u
+    /my
+    /search
+    /tag
+    /g
+    /t/*/*.rss
+    /c/*.rss
+  }
+
   def index
     if (overridden = SiteSetting.overridden_robots_txt.dup).present?
       overridden.prepend(OVERRIDDEN_HEADER) if guardian.is_admin? && !is_api?
@@ -45,7 +55,8 @@ class RobotsTxtController < ApplicationController
   end
 
   def self.fetch_default_robots_info
-    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths_googlebot = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths = deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS.map { |p| Discourse.base_path + p }
     deny_all = [ "#{Discourse.base_path}/" ]
 
     result = {
@@ -55,17 +66,22 @@ class RobotsTxtController < ApplicationController
 
     if SiteSetting.allowed_crawler_user_agents.present?
       SiteSetting.allowed_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_paths }
+        paths = agent == "Googlebot" ? deny_paths_googlebot : deny_paths
+        result[:agents] << { name: agent, disallow: paths }
       end
 
       result[:agents] << { name: '*', disallow: deny_all }
-    elsif SiteSetting.blocked_crawler_user_agents.present?
-      result[:agents] << { name: '*', disallow: deny_paths }
-      SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_all }
-      end
     else
+
+      if SiteSetting.blocked_crawler_user_agents.present?
+        SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
+          result[:agents] << { name: agent, disallow: deny_all }
+        end
+      end
+
       result[:agents] << { name: '*', disallow: deny_paths }
+
+      result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
     if SiteSetting.slow_down_crawler_user_agents.present?
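For anyone verifying the change locally, a hedged usage sketch follows (run in a Rails console; only the :agents key is visible in this diff, so the sketch relies on nothing else in the returned hash):

# Sketch: inspect the per-agent disallow lists with default crawler settings.
info = RobotsTxtController.fetch_default_robots_info

info[:agents].each do |agent|
  puts "#{agent[:name]}: #{agent[:disallow].length} disallowed path(s)"
end
# With neither allowed_crawler_user_agents nor blocked_crawler_user_agents set,
# this now yields a '*' entry carrying the full list and a 'Googlebot' entry
# carrying only the shorter DISALLOWED_PATHS-based list.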
@@ -91,6 +91,8 @@ RSpec.describe RobotsTxtController do
       i = response.body.index('User-agent: *')
       expect(i).to be_present
       expect(response.body[i..-1]).to include("Disallow: /auth/")
+      # we have to insert Googlebot for special handling
+      expect(response.body[i..-1]).to include("User-agent: Googlebot")
     end
 
     it "can allowlist user agents" do