FEATURE: use source tags for crawler detection

Source tags are an additional header, optionally provided by the hosting
platform, that carries semantic information about the source of the request.
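
As a hedged sketch of the mechanism (the header name is deployment-specific and is read from the DISCOURSE_HTTP_SRC_TAG_HEADER environment variable, as the diff below shows), a platform that has verified the requester's address might attach:

    Src-Tag: crawler-googlebot

Rack exposes this as HTTP_SRC_TAG in the request env; the new SrcTagInfo class splits the space-separated tag list and strips the "crawler-" prefix to recover the crawler name, here "googlebot".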
Michael Brown 2024-11-28 16:19:44 -05:00
parent e1e25a11a4
commit 19319ceadb
3 changed files with 116 additions and 1 deletion


@@ -79,12 +79,20 @@ module Middleware
end
def blocked_crawler?
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known crawler.
verified_crawler = SrcTagInfo.new(@request).verified_crawler
# If so, we'll use *only* that crawler name
# for determining the block/allow decision.
crawler_identifier = verified_crawler || @user_agent
@request.get? && !@request.xhr? && !@request.path.ends_with?("robots.txt") &&
!@request.path.ends_with?("srv/status") &&
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
- CrawlerDetection.is_blocked_crawler?(@user_agent)
+ CrawlerDetection.is_blocked_crawler?(crawler_identifier)
end
# rubocop:disable Lint/BooleanSymbol
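
The net effect of the hunk above: when the platform supplies a verified crawler tag, that name alone is fed to the block/allow check and the client-supplied user agent is ignored, so a bot can neither dodge a block by sending an innocent UA nor trigger one by impersonating a blocked crawler. A minimal sketch of the precedence, using the same names as the diff (`request` and `user_agent` stand in for the middleware's internal state):

    # Illustration only, not code from this commit. A verified source tag,
    # when present, wins over whatever user agent the client sent.
    verified_crawler = SrcTagInfo.new(request).verified_crawler # "googlebot" or nil
    crawler_identifier = verified_crawler || user_agent
    CrawlerDetection.is_blocked_crawler?(crawler_identifier)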

lib/src_tag_info.rb (new file, 45 lines)

@@ -0,0 +1,45 @@
# frozen_string_literal: true
class SrcTagInfo
def initialize(request)
@request = request
end
def src_tags
@src_tags ||=
if src_tag_header = ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"]
@request.env.fetch("HTTP_#{src_tag_header.upcase.gsub("-", "_")}", "").split()
else
[]
end
end
def src_tags_supported
@src_tags_supported ||=
if src_tag_supported_header = ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"]
@request.env.fetch("HTTP_#{src_tag_supported_header.upcase.gsub("-", "_")}", "").split()
else
[]
end
end
def verified_crawler
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known crawler. Note that `select` returns an
# array, which is truthy even when empty, so guard on the first matching
# tag rather than on the array itself.
src_tags.find { _1.start_with?("crawler-") }&.delete_prefix("crawler-")
end
def verified_cloud
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known cloud provider.
src_tags.find { _1.start_with?("cloud-") }&.delete_prefix("cloud-")
end
end
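
As a usage sketch (hypothetical REPL session, not part of the commit; it assumes the platform header is named "src-tag", as the specs below configure — Rack maps that header to the env key HTTP_SRC_TAG, which is exactly what the upcase/gsub above rebuilds):

    require "rack"
    ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
    env = Rack::MockRequest.env_for("/", "HTTP_SRC_TAG" => "crawler-googlebot cloud-gcp")
    info = SrcTagInfo.new(Rack::Request.new(env))
    info.src_tags         # => ["crawler-googlebot", "cloud-gcp"]
    info.verified_crawler # => "googlebot"
    info.verified_cloud   # => "gcp"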


@@ -418,5 +418,67 @@ RSpec.describe Middleware::AnonymousCache do
expect(@status).to eq(403)
end
context "with src-tag" do
ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"] = "src-tag-lists"
context "when src is googlebot" do
headers = { "REMOTE_ADDR" => "1.1.1.1", "HTTP_SRC_TAG" => "crawler-googlebot" }
context "when googlebot is blocked" do
before { SiteSetting.blocked_crawler_user_agents = "Googlebot" }
it "blocks googlebot" do
get "/",
headers:
headers.merge(
{ "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
)
expect(@status).to eq(403)
end
it "blocks apparent non-googlebot requests" do
get "/", headers: headers.merge({ "HTTP_USER_AGENT" => "Innocentbot/42" })
expect(@status).to eq(403)
end
end
context "when googlebot is not blocked" do
before { SiteSetting.blocked_crawler_user_agents = "Nexus 5X Build|AppleWebKit" }
it "does not block googlebot" do
get "/",
headers:
headers.merge(
{ "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
)
expect(@status).to eq(200)
end
it "does not block googlebot UAs including a blocked string" do
get "/",
headers:
headers.merge(
{
"HTTP_USER_AGENT" =>
"Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
},
)
expect(@status).to eq(200)
end
it "does not block non-googlebot UAs including a blocked string" do
get "/",
headers:
headers.merge(
{
"HTTP_USER_AGENT" =>
"Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69",
},
)
expect(@status).to eq(200)
end
end
end
end
end
end
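
The new class also exposes verified_cloud, which the specs above do not exercise. A hedged sketch of what such a spec might look like (hypothetical, not part of this commit; same header configuration as above):

    # Hypothetical: verified_cloud strips the "cloud-" prefix the same way
    # verified_crawler strips "crawler-".
    it "extracts the cloud provider from the source tag" do
      env = Rack::MockRequest.env_for("/", "HTTP_SRC_TAG" => "cloud-aws")
      expect(SrcTagInfo.new(Rack::Request.new(env)).verified_cloud).to eq("aws")
    end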