diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index 7de07939a06..9004715b80a 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -8,17 +8,27 @@ class RobotsTxtController < ApplicationController
 
   # NOTE: order is important!
   DISALLOWED_PATHS ||= %w{
+    /admin/
     /auth/
     /assets/browser-update*.js
     /email/
     /session
-    /session/
     /user-api-key
-    /user-api-key/
     /*?api_key*
     /*?*api_key*
   }
 
+  DISALLOWED_WITH_HEADER_PATHS ||= %w{
+    /badges
+    /u
+    /my
+    /search
+    /tag
+    /g
+    /t/*/*.rss
+    /c/*.rss
+  }
+
   def index
     if (overridden = SiteSetting.overridden_robots_txt.dup).present?
       overridden.prepend(OVERRIDDEN_HEADER) if guardian.is_admin? && !is_api?
@@ -45,7 +55,8 @@ class RobotsTxtController < ApplicationController
   end
 
   def self.fetch_default_robots_info
-    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths_googlebot = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths = deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS.map { |p| Discourse.base_path + p }
     deny_all = [ "#{Discourse.base_path}/" ]
 
     result = {
@@ -55,17 +66,22 @@ class RobotsTxtController < ApplicationController
 
     if SiteSetting.allowed_crawler_user_agents.present?
       SiteSetting.allowed_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_paths }
+        paths = agent == "Googlebot" ? deny_paths_googlebot : deny_paths
+        result[:agents] << { name: agent, disallow: paths }
       end
 
       result[:agents] << { name: '*', disallow: deny_all }
-    elsif SiteSetting.blocked_crawler_user_agents.present?
-      result[:agents] << { name: '*', disallow: deny_paths }
-      SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_all }
-      end
     else
+
+      if SiteSetting.blocked_crawler_user_agents.present?
+        SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
+          result[:agents] << { name: agent, disallow: deny_all }
+        end
+      end
+
       result[:agents] << { name: '*', disallow: deny_paths }
+
+      result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
     if SiteSetting.slow_down_crawler_user_agents.present?
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index ffaaa22192b..b20627c0d8b 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -91,6 +91,8 @@ RSpec.describe RobotsTxtController do
       i = response.body.index('User-agent: *')
      expect(i).to be_present
      expect(response.body[i..-1]).to include("Disallow: /auth/")
+      # we have to insert Googlebot for special handling
+      expect(response.body[i..-1]).to include("User-agent: Googlebot")
     end
 
     it "can allowlist user agents" do
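
For review purposes, here is a minimal standalone Ruby sketch (not part of the patch) of what the default branch of `fetch_default_robots_info` now produces, assuming an empty `Discourse.base_path` and no `allowed_crawler_user_agents` / `blocked_crawler_user_agents` configured. The `*` agent is disallowed from both path lists, while Googlebot is only disallowed from `DISALLOWED_PATHS`; the constant name suggests the `DISALLOWED_WITH_HEADER_PATHS` routes are instead covered by noindex headers, which Googlebot honours when it is allowed to crawl them.

```ruby
# Sketch mirroring the else-branch of fetch_default_robots_info after this
# change; it is an illustration, not the Discourse code itself.
DISALLOWED_PATHS = %w{
  /admin/ /auth/ /assets/browser-update*.js /email/ /session
  /user-api-key /*?api_key* /*?*api_key*
}

DISALLOWED_WITH_HEADER_PATHS = %w{
  /badges /u /my /search /tag /g /t/*/*.rss /c/*.rss
}

# base_path assumed empty, so no prefixing here
deny_paths_googlebot = DISALLOWED_PATHS
deny_paths = deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS

agents = [
  { name: '*', disallow: deny_paths },                  # blocked from everything listed
  { name: 'Googlebot', disallow: deny_paths_googlebot } # may still crawl the header-protected paths
]

# Print a robots.txt-style rendering of the agents list
agents.each do |agent|
  puts "User-agent: #{agent[:name]}"
  agent[:disallow].each { |path| puts "Disallow: #{path}" }
  puts
end
```

Running this prints both a `User-agent: *` section and a separate `User-agent: Googlebot` section, which is essentially what the updated spec expectation asserts against the rendered robots.txt body.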