FEATURE: use source tags for crawler detection

Source tags are an additional header, optionally provided by the hosting
platform, that carries semantic information about the source of the request.
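
As a hedged sketch of the mechanism (the header name is deployment-specific and is read from the DISCOURSE_HTTP_SRC_TAG_HEADER environment variable, as the diff below shows), a platform that has verified the requester's address might attach:

    Src-Tag: crawler-googlebot

Rack exposes this as HTTP_SRC_TAG in the request env; the new SrcTagInfo class splits the space-separated tag list and strips the "crawler-" prefix to recover the crawler name, here "googlebot".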
Michael Brown 2024-11-28 16:19:44 -05:00
parent e1e25a11a4
commit 19319ceadb
3 changed files with 116 additions and 1 deletion


@@ -79,12 +79,20 @@ module Middleware
end
def blocked_crawler?
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known crawler.
verified_crawler = SrcTagInfo.new(@request).verified_crawler
# If so, we'll use *only* that crawler name
# for determining the block/allow decision.
crawler_identifier = verified_crawler || @user_agent
@request.get? && !@request.xhr? && !@request.path.ends_with?("robots.txt") &&
!@request.path.ends_with?("srv/status") &&
@request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
@env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
- CrawlerDetection.is_blocked_crawler?(@user_agent)
+ CrawlerDetection.is_blocked_crawler?(crawler_identifier)
end
# rubocop:disable Lint/BooleanSymbol
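
The net effect of the hunk above: when the platform supplies a verified crawler tag, that name alone is fed to the block/allow check and the client-supplied user agent is ignored, so a bot can neither dodge a block by sending an innocent UA nor trigger one by impersonating a blocked crawler. A minimal sketch of the precedence, using the same names as the diff (`request` and `user_agent` stand in for the middleware's internal state):

    # Illustration only, not code from this commit. A verified source tag,
    # when present, wins over whatever user agent the client sent.
    verified_crawler = SrcTagInfo.new(request).verified_crawler # "googlebot" or nil
    crawler_identifier = verified_crawler || user_agent
    CrawlerDetection.is_blocked_crawler?(crawler_identifier)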

lib/src_tag_info.rb (new file, 45 lines)

@@ -0,0 +1,45 @@
# frozen_string_literal: true
class SrcTagInfo
def initialize(request)
@request = request
end
def src_tags
@src_tags ||=
if src_tag_header = ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"]
@request.env.fetch("HTTP_#{src_tag_header.upcase.gsub("-", "_")}", "").split()
else
[]
end
end
def src_tags_supported
@src_tags_supported ||=
if src_tag_supported_header = ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"]
@request.env.fetch("HTTP_#{src_tag_supported_header.upcase.gsub("-", "_")}", "").split()
else
[]
end
end
def verified_crawler
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known crawler. Note that `select` returns an
# array, which is truthy even when empty, so guard on the first matching
# tag rather than on the array itself.
src_tags.find { _1.start_with?("crawler-") }&.delete_prefix("crawler-")
end
def verified_cloud
# Use the source tag metadata to detect whether a request is coming from
# an address belonging to a known cloud provider.
src_tags.find { _1.start_with?("cloud-") }&.delete_prefix("cloud-")
end
end
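
As a usage sketch (hypothetical REPL session, not part of the commit; it assumes the platform header is named "src-tag", as the specs below configure — Rack maps that header to the env key HTTP_SRC_TAG, which is exactly what the upcase/gsub above rebuilds):

    require "rack"
    ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
    env = Rack::MockRequest.env_for("/", "HTTP_SRC_TAG" => "crawler-googlebot cloud-gcp")
    info = SrcTagInfo.new(Rack::Request.new(env))
    info.src_tags         # => ["crawler-googlebot", "cloud-gcp"]
    info.verified_crawler # => "googlebot"
    info.verified_cloud   # => "gcp"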


@@ -418,5 +418,67 @@ RSpec.describe Middleware::AnonymousCache do
expect(@status).to eq(403)
end
context "with src-tag" do
ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"] = "src-tag-lists"
context "when src is googlebot" do
headers = { "REMOTE_ADDR" => "1.1.1.1", "HTTP_SRC_TAG" => "crawler-googlebot" }
context "when googlebot is blocked" do
before { SiteSetting.blocked_crawler_user_agents = "Googlebot" }
it "blocks googlebot" do
get "/",
headers:
headers.merge(
{ "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
)
expect(@status).to eq(403)
end
it "blocks apparent non-googlebot requests" do
get "/", headers: headers.merge({ "HTTP_USER_AGENT" => "Innocentbot/42" })
expect(@status).to eq(403)
end
end
context "when googlebot is not blocked" do
before { SiteSetting.blocked_crawler_user_agents = "Nexus 5X Build|AppleWebKit" }
it "does not block googlebot" do
get "/",
headers:
headers.merge(
{ "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
)
expect(@status).to eq(200)
end
it "does not block googlebot UAs including a blocked string" do
get "/",
headers:
headers.merge(
{
"HTTP_USER_AGENT" =>
"Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
},
)
expect(@status).to eq(200)
end
it "does not block non-googlebot UAs including a blocked string" do
get "/",
headers:
headers.merge(
{
"HTTP_USER_AGENT" =>
"Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69",
},
)
expect(@status).to eq(200)
end
end
end
end
end
end
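
The new class also exposes verified_cloud, which the specs above do not exercise. A hedged sketch of what such a spec might look like (hypothetical, not part of this commit; same header configuration as above):

    # Hypothetical: verified_cloud strips the "cloud-" prefix the same way
    # verified_crawler strips "crawler-".
    it "extracts the cloud provider from the source tag" do
      env = Rack::MockRequest.env_for("/", "HTTP_SRC_TAG" => "cloud-aws")
      expect(SrcTagInfo.new(Rack::Request.new(env)).verified_cloud).to eq("aws")
    end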