FEATURE: use source tags for crawler detection
Source tags are an optional, additional header provided by the hosting platform that carries semantic information about the source of a request.
parent e1e25a11a4
commit 19319ceadb
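For illustration, supposing the platform is configured to send its tags in a "Src-Tag" header (the header name and tag values here are hypothetical, chosen to match the specs below), the tags surface in the Rack env like any other request header:

    # Hypothetical: the platform appends "Src-Tag: crawler-googlebot cloud-aws"
    # to each proxied request; Rack exposes it under the HTTP_ prefix.
    rack_env = { "HTTP_SRC_TAG" => "crawler-googlebot cloud-aws" }
    rack_env.fetch("HTTP_SRC_TAG", "").split
    # => ["crawler-googlebot", "cloud-aws"]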
@@ -79,12 +79,20 @@ module Middleware
     end
 
     def blocked_crawler?
+      # Use the source tag metadata to detect whether a request is coming from
+      # an address belonging to a known crawler.
+      verified_crawler = SrcTagInfo.new(@request).verified_crawler
+
+      # If so, we'll use *only* that crawler name
+      # for determining a block/allow.
+      crawler_identifier = verified_crawler || @user_agent
+
       @request.get? && !@request.xhr? && !@request.path.ends_with?("robots.txt") &&
         !@request.path.ends_with?("srv/status") &&
         @request[Auth::DefaultCurrentUserProvider::API_KEY].nil? &&
         @env[Auth::DefaultCurrentUserProvider::USER_API_KEY].nil? &&
         @env[Auth::DefaultCurrentUserProvider::HEADER_API_KEY].nil? &&
-        CrawlerDetection.is_blocked_crawler?(@user_agent)
+        CrawlerDetection.is_blocked_crawler?(crawler_identifier)
     end
 
     # rubocop:disable Lint/BooleanSymbol
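A rough sketch of what this change buys (the matcher below is an assumption inferred from the specs at the end of this commit, not Discourse's actual CrawlerDetection code): when a source tag verifies the crawler, only the verified name is matched against the pipe-delimited blocked list, so a spoofed or merely coincidental User-Agent string no longer decides the outcome.

    # Assumed matching semantics, inferred from the specs below:
    # case-insensitive substring match against a pipe-delimited blocked list.
    def blocked?(identifier, blocked_list)
      blocked_list.split("|").any? { |pat| identifier.downcase.include?(pat.downcase) }
    end

    blocked?("googlebot", "Googlebot")                  # => true: verified tag is matched
    blocked?("googlebot", "Nexus 5X Build|AppleWebKit") # => false: the UA is never consulted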
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+class SrcTagInfo
+  def initialize(request)
+    @request = request
+  end
+
+  def src_tags
+    @src_tags ||=
+      if src_tag_header = ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"]
+        @request.env.fetch("HTTP_#{src_tag_header.upcase.gsub("-", "_")}", "").split()
+      else
+        []
+      end
+  end
+
+  def src_tags_supported
+    @src_tags_supported ||=
+      if src_tag_supported_header = ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"]
+        @request.env.fetch("HTTP_#{src_tag_supported_header.upcase.gsub("-", "_")}", "").split()
+      else
+        []
+      end
+  end
+
+  def verified_crawler
+    # Use the source tag metadata to detect whether a request is coming from
+    # an address belonging to a known crawler.
+    if verified_crawler_src = src_tags&.select { _1.start_with? "crawler-" }
+      verified_crawler_src.first&.[](8..)
+    else
+      nil
+    end
+  end
+
+  def verified_cloud
+    # Use the source tag metadata to detect whether a request is coming from
+    # an address belonging to a known cloud provider.
+    if verified_cloud_src = src_tags&.select { _1.start_with? "cloud-" }
+      verified_cloud_src.first&.[](6..)
+    else
+      nil
+    end
+  end
+end
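A minimal usage sketch of the new class (the request double and tag values are hypothetical): "crawler-" is 8 characters and "cloud-" is 6, which is what the `[](8..)` and `[](6..)` slices strip.

    require "ostruct"

    # Hypothetical setup: the platform sends tags in a "Src-Tag" header,
    # which Rack exposes as HTTP_SRC_TAG.
    ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
    request = OpenStruct.new(env: { "HTTP_SRC_TAG" => "crawler-googlebot cloud-aws" })

    info = SrcTagInfo.new(request)
    info.src_tags         # => ["crawler-googlebot", "cloud-aws"]
    info.verified_crawler # => "googlebot"
    info.verified_cloud   # => "aws"

Note that `select` always returns an array, so the `else nil` branches above are effectively unreachable; the no-tag case is instead handled by `first&.`, which yields nil when nothing matched.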
@@ -418,5 +418,67 @@ RSpec.describe Middleware::AnonymousCache do
 
       expect(@status).to eq(403)
     end
+
+    context "with src-tag" do
+      ENV["DISCOURSE_HTTP_SRC_TAG_HEADER"] = "src-tag"
+      ENV["DISCOURSE_HTTP_SRC_TAG_SUPPORTED_HEADER"] = "src-tag-lists"
+
+      context "when src is googlebot" do
+        headers = { "REMOTE_ADDR" => "1.1.1.1", "HTTP_SRC_TAG" => "crawler-googlebot" }
+
+        context "when googlebot is blocked" do
+          before { SiteSetting.blocked_crawler_user_agents = "Googlebot" }
+
+          it "blocks googlebot" do
+            get "/",
+                headers:
+                  headers.merge(
+                    { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
+                  )
+            expect(@status).to eq(403)
+          end
+
+          it "blocks apparent non-googlebot requests" do
+            get "/", headers: headers.merge({ "HTTP_USER_AGENT" => "Innocentbot/42" })
+            expect(@status).to eq(403)
+          end
+        end
+
+        context "when googlebot is not blocked" do
+          before { SiteSetting.blocked_crawler_user_agents = "Nexus 5X Build|AppleWebKit" }
+
+          it "does not block googlebot" do
+            get "/",
+                headers:
+                  headers.merge(
+                    { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" },
+                  )
+            expect(@status).to eq(200)
+          end
+          it "does not block googlebot UAs including a blocked string" do
+            get "/",
+                headers:
+                  headers.merge(
+                    {
+                      "HTTP_USER_AGENT" =>
+                        "Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+                    },
+                  )
+            expect(@status).to eq(200)
+          end
+          it "does not block non-googlebot UAs including a blocked string" do
+            get "/",
+                headers:
+                  headers.merge(
+                    {
+                      "HTTP_USER_AGENT" =>
+                        "Mozilla/5.0 (Nexus 5X Build/MMB29P) AppleWebKit/537.36 Chrome/130.0.6723.69",
+                    },
+                  )
+            expect(@status).to eq(200)
+          end
+        end
+      end
+    end
   end
 end