FIX: use crawler layout when saving url in Wayback Machine (#7667)
This commit is contained in:
parent
28dcf445b7
commit
42809f4d69
|
@ -79,7 +79,9 @@ class ApplicationController < ActionController::Base
|
||||||
request.user_agent &&
|
request.user_agent &&
|
||||||
(request.content_type.blank? || request.content_type.include?('html')) &&
|
(request.content_type.blank? || request.content_type.include?('html')) &&
|
||||||
!['json', 'rss'].include?(params[:format]) &&
|
!['json', 'rss'].include?(params[:format]) &&
|
||||||
(has_escaped_fragment? || CrawlerDetection.crawler?(request.user_agent) || params.key?("print"))
|
(has_escaped_fragment? || params.key?("print") ||
|
||||||
|
CrawlerDetection.crawler?(request.user_agent, request.headers["HTTP_VIA"])
|
||||||
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
def perform_refresh_session
|
def perform_refresh_session
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# frozen_string_literal: true
|
# frozen_string_literal: true
|
||||||
|
|
||||||
module CrawlerDetection
|
module CrawlerDetection
|
||||||
|
WAYBACK_MACHINE_URL = "web.archive.org"
|
||||||
|
|
||||||
def self.to_matcher(string, type: nil)
|
def self.to_matcher(string, type: nil)
|
||||||
escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
|
escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
|
||||||
|
@ -13,8 +14,8 @@ module CrawlerDetection
|
||||||
Regexp.new(escaped, Regexp::IGNORECASE)
|
Regexp.new(escaped, Regexp::IGNORECASE)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.crawler?(user_agent)
|
def self.crawler?(user_agent, via_header = nil)
|
||||||
return true if user_agent.nil?
|
return true if user_agent.nil? || via_header&.include?(WAYBACK_MACHINE_URL)
|
||||||
|
|
||||||
# this is done to avoid regenerating regexes
|
# this is done to avoid regenerating regexes
|
||||||
@non_crawler_matchers ||= {}
|
@non_crawler_matchers ||= {}
|
||||||
|
|
|
@ -62,7 +62,7 @@ module Middleware
|
||||||
@is_crawler ||=
|
@is_crawler ||=
|
||||||
begin
|
begin
|
||||||
user_agent = @env[USER_AGENT]
|
user_agent = @env[USER_AGENT]
|
||||||
if CrawlerDetection.crawler?(user_agent)
|
if CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
|
||||||
:true
|
:true
|
||||||
else
|
else
|
||||||
user_agent.downcase.include?("discourse") ? :true : :false
|
user_agent.downcase.include?("discourse") ? :true : :false
|
||||||
|
|
|
@ -5,9 +5,9 @@ require_dependency 'crawler_detection'
|
||||||
|
|
||||||
describe CrawlerDetection do
|
describe CrawlerDetection do
|
||||||
|
|
||||||
def crawler!(s)
|
def crawler!(user_agent, via = nil)
|
||||||
if (!CrawlerDetection.crawler?(s))
|
if (!CrawlerDetection.crawler?(user_agent, via))
|
||||||
raise "#{s} should be a crawler!"
|
raise "#{user_agent} should be a crawler!"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -50,6 +50,10 @@ describe CrawlerDetection do
|
||||||
crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
|
crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "returns true when VIA header contains 'web.archive.org'" do
|
||||||
|
crawler!("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)")
|
||||||
|
end
|
||||||
|
|
||||||
it "returns false for non-crawler user agents" do
|
it "returns false for non-crawler user agents" do
|
||||||
not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
|
not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
|
||||||
not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
|
not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
|
||||||
|
|
|
@ -2690,15 +2690,14 @@ RSpec.describe TopicsController do
|
||||||
end
|
end
|
||||||
|
|
||||||
context "when a crawler" do
|
context "when a crawler" do
|
||||||
it "renders with the crawler layout, and handles proper pagination" do
|
let(:topic) { Fabricate(:topic) }
|
||||||
|
let(:page1_time) { 3.months.ago }
|
||||||
page1_time = 3.months.ago
|
let(:page2_time) { 2.months.ago }
|
||||||
page2_time = 2.months.ago
|
let(:page3_time) { 1.month.ago }
|
||||||
page3_time = 1.month.ago
|
|
||||||
|
|
||||||
|
before do
|
||||||
freeze_time page1_time
|
freeze_time page1_time
|
||||||
|
|
||||||
topic = Fabricate(:topic)
|
|
||||||
Fabricate(:post, topic: topic)
|
Fabricate(:post, topic: topic)
|
||||||
Fabricate(:post, topic: topic)
|
Fabricate(:post, topic: topic)
|
||||||
|
|
||||||
|
@ -2712,10 +2711,11 @@ RSpec.describe TopicsController do
|
||||||
# ugly, but no interface to set this and we don't want to create
|
# ugly, but no interface to set this and we don't want to create
|
||||||
# 100 posts to test this thing
|
# 100 posts to test this thing
|
||||||
TopicView.stubs(:chunk_size).returns(2)
|
TopicView.stubs(:chunk_size).returns(2)
|
||||||
|
end
|
||||||
|
|
||||||
user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
shared_examples "crawler layout" do |user_agent, via|
|
||||||
|
it "renders with the crawler layout, and handles proper pagination" do
|
||||||
get topic.url, env: { "HTTP_USER_AGENT" => user_agent }
|
get topic.url, env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
|
||||||
|
|
||||||
body = response.body
|
body = response.body
|
||||||
|
|
||||||
|
@ -2725,7 +2725,7 @@ RSpec.describe TopicsController do
|
||||||
|
|
||||||
expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
|
expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
|
||||||
|
|
||||||
get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent }
|
get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
|
||||||
body = response.body
|
body = response.body
|
||||||
|
|
||||||
expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
|
expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
|
||||||
|
@ -2733,7 +2733,7 @@ RSpec.describe TopicsController do
|
||||||
expect(body).to include('<link rel="prev" href="' + topic.relative_url)
|
expect(body).to include('<link rel="prev" href="' + topic.relative_url)
|
||||||
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
|
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
|
||||||
|
|
||||||
get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent }
|
get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
|
||||||
body = response.body
|
body = response.body
|
||||||
|
|
||||||
expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
|
expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
|
||||||
|
@ -2741,6 +2741,10 @@ RSpec.describe TopicsController do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
include_examples "crawler layout", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", nil
|
||||||
|
include_examples "crawler layout", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)"
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "#reset_bump_date" do
|
describe "#reset_bump_date" do
|
||||||
|
|
Loading…
Reference in New Issue