FEATURE: let Google index pages so it can remove them

Google insists on being able to crawl pages so it can work out
whether they should be removed from its index.

see: https://support.google.com/webmasters/answer/6332384?hl=en

This change ensures that we have special behavior for Googlebot:
robots.txt allows it to crawl, but the X-Robots-Tag response header
still blocks the actual indexing.
Sam Saffron 2020-05-11 12:14:21 +10:00
parent 4a74f18e95
commit bb4e8899c4
4 changed files with 18 additions and 2 deletions


@@ -806,7 +806,13 @@ class ApplicationController < ActionController::Base
   end

   def add_noindex_header
-    response.headers['X-Robots-Tag'] = 'noindex' if request.get?
+    if request.get?
+      if SiteSetting.allow_index_in_robots_txt
+        response.headers['X-Robots-Tag'] = 'noindex'
+      else
+        response.headers['X-Robots-Tag'] = 'noindex, nofollow'
+      end
+    end
   end

   protected


@@ -1,4 +1,13 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
+# Googlebot must be allowed to crawl so it can remove items from the index;
+# we return the X-Robots-Tag header with noindex, nofollow, which ensures
+# indexing is minimized and nothing shows up in Google search results
+User-agent: googlebot
+Allow: <%= Discourse.base_uri + "/" %>
+Disallow: <%= Discourse.base_uri + "/uploads/*" %>
+
 User-agent: *
 Disallow: <%= Discourse.base_uri + "/" %>
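For reference, assuming a standard install where Discourse.base_uri is empty (no subfolder setup), the ERB above renders to roughly the following (comment lines omitted):

User-agent: googlebot
Allow: /
Disallow: /uploads/*

User-agent: *
Disallow: /

So Googlebot may fetch any page except uploads, and therefore sees the noindex, nofollow header, while every other crawler is told not to fetch anything at all.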


@@ -132,6 +132,7 @@ RSpec.describe RobotsTxtController do
       get '/robots.txt'

       expect(response.body).to_not include("Disallow: /u/")
+      expect(response.body).to include("User-agent: googlebot\nAllow")
     end

     it "returns overridden robots.txt if the file is overridden" do


@@ -1853,7 +1853,7 @@ RSpec.describe TopicsController do
       get "/t/#{topic.slug}/#{topic.id}.json"

-      expect(response.headers['X-Robots-Tag']).to eq('noindex')
+      expect(response.headers['X-Robots-Tag']).to eq('noindex, nofollow')
     end

     it "doesn't store an incoming link when there's no referer" do