Merge pull request #5700 from discourse/crawl-block
FEATURE: control web crawlers access with white/blacklist
Commit: 7311023a52
@@ -2,6 +2,7 @@ import { ajax } from 'discourse/lib/ajax';
 import round from "discourse/lib/round";
 import { fmt } from 'discourse/lib/computed';
 import { fillMissingDates } from 'discourse/lib/utilities';
+import computed from 'ember-addons/ember-computed-decorators';
 
 const Report = Discourse.Model.extend({
   reportUrl: fmt("type", "/admin/reports/%@"),
@@ -42,7 +43,8 @@ const Report = Discourse.Model.extend({
   lastSevenDaysCount: function() { return this.valueFor(1, 7); }.property("data"),
   lastThirtyDaysCount: function() { return this.valueFor(1, 30); }.property("data"),
 
-  yesterdayTrend: function() {
+  @computed('data')
+  yesterdayTrend() {
     const yesterdayVal = this.valueAt(1);
     const twoDaysAgoVal = this.valueAt(2);
     if (yesterdayVal > twoDaysAgoVal) {
@@ -52,9 +54,10 @@ const Report = Discourse.Model.extend({
     } else {
       return "no-change";
     }
-  }.property("data"),
+  },
 
-  sevenDayTrend: function() {
+  @computed('data')
+  sevenDayTrend() {
     const currentPeriod = this.valueFor(1, 7);
     const prevPeriod = this.valueFor(8, 14);
     if (currentPeriod > prevPeriod) {
@@ -64,36 +67,39 @@ const Report = Discourse.Model.extend({
     } else {
       return "no-change";
     }
-  }.property("data"),
+  },
 
-  thirtyDayTrend: function() {
-    if (this.get("prev30Days")) {
+  @computed('prev30Days', 'data')
+  thirtyDayTrend(prev30Days) {
+    if (prev30Days) {
       const currentPeriod = this.valueFor(1, 30);
       if (currentPeriod > this.get("prev30Days")) {
         return "trending-up";
-      } else if (currentPeriod < this.get("prev30Days")) {
+      } else if (currentPeriod < prev30Days) {
         return "trending-down";
       }
     }
     return "no-change";
-  }.property("data", "prev30Days"),
+  },
 
-  icon: function() {
-    switch (this.get("type")) {
+  @computed('type')
+  icon(type) {
+    switch (type) {
       case "flags": return "flag";
       case "likes": return "heart";
       case "bookmarks": return "bookmark";
       default: return null;
     }
-  }.property("type"),
+  },
 
-  method: function() {
-    if (this.get("type") === "time_to_first_response") {
+  @computed('type')
+  method(type) {
+    if (type === "time_to_first_response") {
       return "average";
     } else {
       return "sum";
     }
-  }.property("type"),
+  },
 
   percentChangeString(val1, val2) {
     const val = ((val1 - val2) / val2) * 100;
@@ -114,21 +120,31 @@ const Report = Discourse.Model.extend({
     return title;
   },
 
-  yesterdayCountTitle: function() {
+  @computed('data')
+  yesterdayCountTitle() {
     return this.changeTitle(this.valueAt(1), this.valueAt(2), "two days ago");
-  }.property("data"),
+  },
 
-  sevenDayCountTitle: function() {
+  @computed('data')
+  sevenDayCountTitle() {
     return this.changeTitle(this.valueFor(1, 7), this.valueFor(8, 14), "two weeks ago");
-  }.property("data"),
+  },
 
-  thirtyDayCountTitle: function() {
-    return this.changeTitle(this.valueFor(1, 30), this.get("prev30Days"), "in the previous 30 day period");
-  }.property("data"),
+  @computed('prev30Days', 'data')
+  thirtyDayCountTitle(prev30Days) {
+    return this.changeTitle(this.valueFor(1, 30), prev30Days, "in the previous 30 day period");
+  },
 
-  dataReversed: function() {
-    return this.get("data").toArray().reverse();
-  }.property("data")
+  @computed('data')
+  sortedData(data) {
+    return this.get('xAxisIsDate') ? data.toArray().reverse() : data.toArray();
+  },
+
+  @computed('data')
+  xAxisIsDate() {
+    if (!this.data[0]) return false;
+    return this.data && this.data[0].x.match(/\d{4}-\d{1,2}-\d{1,2}/);
+  }
 
 });
 
@@ -152,6 +168,14 @@ Report.reopenClass({
 
     const model = Report.create({ type: type });
     model.setProperties(json.report);
+
+    if (json.report.related_report) {
+      // TODO: fillMissingDates if xaxis is date
+      const related = Report.create({ type: json.report.related_report.type });
+      related.setProperties(json.report.related_report);
+      model.set('relatedReport', related);
+    }
+
     return model;
   });
 }
@@ -0,0 +1,17 @@
+{{#if model.sortedData}}
+  <table class="table report {{model.type}}">
+    <tr>
+      <th>{{model.xaxis}}</th>
+      <th>{{model.yaxis}}</th>
+    </tr>
+
+    {{#each model.sortedData as |row|}}
+      <tr>
+        <td class="x-value">{{row.x}}</td>
+        <td>
+          {{row.y}}
+        </td>
+      </tr>
+    {{/each}}
+  </table>
+{{/if}}
@@ -31,20 +31,10 @@
 {{#if viewingGraph}}
   {{admin-graph model=model}}
 {{else}}
-  <table class='table report'>
-    <tr>
-      <th>{{model.xaxis}}</th>
-      <th>{{model.yaxis}}</th>
-    </tr>
-
-    {{#each model.dataReversed as |row|}}
-      <tr>
-        <td>{{row.x}}</td>
-        <td>
-          {{row.y}}
-        </td>
-      </tr>
-    {{/each}}
-  </table>
+  {{admin-table-report model=model}}
+{{/if}}
+
+{{#if model.relatedReport}}
+  {{admin-table-report model=model.relatedReport}}
 {{/if}}
 {{/conditional-loading-spinner}}
@@ -167,6 +167,20 @@ $mobile-breakpoint: 700px;
     }
   }
 
+  &.web_crawlers {
+    tr {
+      th:nth-of-type(1) {
+        width: 60%;
+      }
+    }
+    td.x-value {
+      max-width: 0;
+      overflow: hidden;
+      text-overflow: ellipsis;
+      white-space: nowrap;
+    }
+  }
+
   .bar-container {
     float: left;
     width: 300px;
@@ -3,7 +3,21 @@ class RobotsTxtController < ApplicationController
   skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
 
   def index
-    path = SiteSetting.allow_index_in_robots_txt ? :index : :no_index
+    if SiteSetting.allow_index_in_robots_txt
+      path = :index
+      if SiteSetting.whitelisted_crawler_user_agents.present?
+        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
+        @disallowed_user_agents = ['*']
+      elsif SiteSetting.blacklisted_crawler_user_agents.present?
+        @allowed_user_agents = ['*']
+        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
+      else
+        @allowed_user_agents = ['*']
+      end
+    else
+      path = :no_index
+    end
 
     render path, content_type: 'text/plain'
   end
 end
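For illustration only (not part of the diff): a rough sketch of how the two crawler settings map onto the instance variables this action hands to the robots.txt template; the setting values below are made-up examples.

    # assuming whitelisted_crawler_user_agents = 'Googlebot|Twitterbot':
    #   @allowed_user_agents    = ['Googlebot', 'Twitterbot']
    #   @disallowed_user_agents = ['*']
    #
    # assuming only blacklisted_crawler_user_agents = 'Bananabot' is set:
    #   @allowed_user_agents    = ['*']
    #   @disallowed_user_agents = ['Bananabot']
    #
    # with neither setting present, @allowed_user_agents = ['*'] and no disallowed list is rendered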
@@ -0,0 +1,26 @@
+module Jobs
+
+  class CleanUpCrawlerStats < Jobs::Scheduled
+    every 1.day
+
+    def execute(args)
+      WebCrawlerRequest.where('date < ?', WebCrawlerRequest.max_record_age.ago).delete_all
+
+      # keep count of only the top user agents
+      WebCrawlerRequest.exec_sql <<~SQL
+        WITH ranked_requests AS (
+          SELECT row_number() OVER (ORDER BY count DESC) as row_number, id
+          FROM web_crawler_requests
+          WHERE date = '#{1.day.ago.strftime("%Y-%m-%d")}'
+        )
+        DELETE FROM web_crawler_requests
+        WHERE id IN (
+          SELECT ranked_requests.id
+          FROM ranked_requests
+          WHERE row_number > #{WebCrawlerRequest.max_records_per_day}
+        )
+      SQL
+    end
+  end
+
+end
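A quick usage note, mirroring the job spec added later in this commit: the scheduled job can also be driven directly, which is how the spec exercises it.

    # drops rows older than max_record_age (30 days) and keeps only the
    # top max_records_per_day (200) user agents for the previous day
    Jobs::CleanUpCrawlerStats.new.execute({})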
@@ -1,5 +1,6 @@
 # frozen_string_literal: true
 class ApplicationRequest < ActiveRecord::Base
 
   enum req_type: %i(http_total
                     http_2xx
                     http_background
@@ -12,41 +13,12 @@ class ApplicationRequest < ActiveRecord::Base
                     page_view_logged_in_mobile
                     page_view_anon_mobile)
 
-  cattr_accessor :autoflush, :autoflush_seconds, :last_flush
-  # auto flush if backlog is larger than this
-  self.autoflush = 2000
-
-  # auto flush if older than this
-  self.autoflush_seconds = 5.minutes
-  self.last_flush = Time.now.utc
+  include CachedCounting
 
   def self.increment!(type, opts = nil)
-    key = redis_key(type)
-    val = $redis.incr(key).to_i
-
-    # readonly mode it is going to be 0, skip
-    return if val == 0
-
-    # 3.days, see: https://github.com/rails/rails/issues/21296
-    $redis.expire(key, 259200)
-
-    autoflush = (opts && opts[:autoflush]) || self.autoflush
-    if autoflush > 0 && val >= autoflush
-      write_cache!
-      return
-    end
-
-    if (Time.now.utc - last_flush).to_i > autoflush_seconds
-      write_cache!
-    end
+    perform_increment!(redis_key(type), opts)
   end
 
-  GET_AND_RESET = <<~LUA
-    local val = redis.call('get', KEYS[1])
-    redis.call('set', KEYS[1], '0')
-    return val
-  LUA
-
   def self.write_cache!(date = nil)
     if date.nil?
       write_cache!(Time.now.utc)
@@ -58,13 +30,9 @@ class ApplicationRequest < ActiveRecord::Base
 
     date = date.to_date
 
-    # this may seem a bit fancy but in so it allows
-    # for concurrent calls without double counting
     req_types.each do |req_type, _|
-      key = redis_key(req_type, date)
-
-      namespaced_key = $redis.namespace_key(key)
-      val = $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
+      val = get_and_reset(redis_key(req_type, date))
       next if val == 0
 
       id = req_id(date, req_type)
@@ -0,0 +1,67 @@
+module CachedCounting
+  extend ActiveSupport::Concern
+
+  included do
+    class << self
+      attr_accessor :autoflush, :autoflush_seconds, :last_flush
+    end
+
+    # auto flush if backlog is larger than this
+    self.autoflush = 2000
+
+    # auto flush if older than this
+    self.autoflush_seconds = 5.minutes
+
+    self.last_flush = Time.now.utc
+  end
+
+  class_methods do
+    def perform_increment!(key, opts = nil)
+      val = $redis.incr(key).to_i
+
+      # readonly mode it is going to be 0, skip
+      return if val == 0
+
+      # 3.days, see: https://github.com/rails/rails/issues/21296
+      $redis.expire(key, 259200)
+
+      autoflush = (opts && opts[:autoflush]) || self.autoflush
+      if autoflush > 0 && val >= autoflush
+        write_cache!
+        return
+      end
+
+      if (Time.now.utc - last_flush).to_i > autoflush_seconds
+        write_cache!
+      end
+    end
+
+    def write_cache!(date = nil)
+      raise NotImplementedError
+    end
+
+    GET_AND_RESET = <<~LUA
+      local val = redis.call('get', KEYS[1])
+      redis.call('set', KEYS[1], '0')
+      return val
+    LUA
+
+    # this may seem a bit fancy but in so it allows
+    # for concurrent calls without double counting
+    def get_and_reset(key)
+      namespaced_key = $redis.namespace_key(key)
+      $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
+    end
+
+    def request_id(query_params, retries = 0)
+      id = where(query_params).pluck(:id).first
+      id ||= create!(query_params.merge(count: 0)).id
+    rescue # primary key violation
+      if retries == 0
+        request_id(query_params, 1)
+      else
+        raise
+      end
+    end
+  end
+end
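For context, a minimal sketch (not part of the commit) of what an including model is expected to provide; ApplicationRequest above and WebCrawlerRequest below are the two real includers. The class name, key format, and columns here are hypothetical, assuming a table with date and count columns.

    # Hypothetical includer of CachedCounting
    class ExampleDailyCounter < ActiveRecord::Base
      include CachedCounting

      def self.increment!(opts = nil)
        # bumps a per-day counter in redis; rows are only written on flush
        perform_increment!(redis_key, opts)
      end

      def self.write_cache!(date = nil)
        date ||= Time.now.utc.to_date
        self.last_flush = Time.now.utc
        val = get_and_reset(redis_key(date)) # atomic read-and-zero via the Lua script
        return if val == 0
        where(id: request_id(date: date)).update_all(["count = count + ?", val])
      end

      def self.redis_key(date = Time.now.utc.to_date)
        "example_counter:#{date.strftime('%Y%m%d')}"
      end
    end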
@@ -27,7 +27,11 @@ class Report
       category_id: category_id,
       group_id: group_id,
       prev30Days: self.prev30Days
-    }
+    }.tap do |json|
+      if type == 'page_view_crawler_reqs'
+        json[:related_report] = Report.find('web_crawlers', start_date: start_date, end_date: end_date)&.as_json
+      end
+    end
   end
 
   def Report.add_report(name, &block)
@@ -231,4 +235,12 @@ class Report
   def self.report_notify_user_private_messages(report)
     private_messages_report report, TopicSubtype.notify_user
   end
+
+  def self.report_web_crawlers(report)
+    report.data = WebCrawlerRequest.where('date >= ? and date <= ?', report.start_date, report.end_date)
+      .limit(200)
+      .order('sum_count DESC')
+      .group(:user_agent).sum(:count)
+      .map { |ua, count| { x: ua, y: count } }
+  end
 end
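As a rough illustration (values invented), report_web_crawlers fills report.data with one point per user agent, which is the shape the new admin-table-report template iterates over.

    # e.g. report.data =>
    #   [{ x: "Googlebot/2.1 (+http://www.google.com/bot.html)", y: 1283 },
    #    { x: "Twitterbot/1.0", y: 211 }]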
@@ -0,0 +1,76 @@
+class WebCrawlerRequest < ActiveRecord::Base
+  include CachedCounting
+
+  # auto flush if older than this
+  self.autoflush_seconds = 1.hour
+
+  cattr_accessor :max_record_age, :max_records_per_day
+
+  # only keep the top records based on request count
+  self.max_records_per_day = 200
+
+  # delete records older than this
+  self.max_record_age = 30.days
+
+  def self.increment!(user_agent, opts = nil)
+    ua_list_key = user_agent_list_key
+    $redis.sadd(ua_list_key, user_agent)
+    $redis.expire(ua_list_key, 259200) # 3.days
+
+    perform_increment!(redis_key(user_agent), opts)
+  end
+
+  def self.write_cache!(date = nil)
+    if date.nil?
+      write_cache!(Time.now.utc)
+      write_cache!(Time.now.utc.yesterday)
+      return
+    end
+
+    self.last_flush = Time.now.utc
+
+    date = date.to_date
+    ua_list_key = user_agent_list_key(date)
+
+    while user_agent = $redis.spop(ua_list_key)
+      val = get_and_reset(redis_key(user_agent, date))
+
+      next if val == 0
+
+      self.where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
+    end
+  rescue Redis::CommandError => e
+    raise unless e.message =~ /READONLY/
+    nil
+  end
+
+  def self.clear_cache!(date = nil)
+    if date.nil?
+      clear_cache!(Time.now.utc)
+      clear_cache!(Time.now.utc.yesterday)
+      return
+    end
+
+    list_key = user_agent_list_key(date)
+
+    $redis.smembers(list_key).each do |user_agent, _|
+      $redis.del redis_key(user_agent, date)
+    end
+
+    $redis.del(list_key)
+  end
+
+  protected
+
+  def self.user_agent_list_key(time = Time.now.utc)
+    "crawl_ua_list:#{time.strftime('%Y%m%d')}"
+  end
+
+  def self.redis_key(user_agent, time = Time.now.utc)
+    "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
+  end
+
+  def self.req_id(date, user_agent)
+    request_id(date: date, user_agent: user_agent)
+  end
+end
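A short usage sketch, consistent with the model spec later in this commit: counts accumulate in redis and only reach the table once write_cache! flushes them (automatically via autoflush/autoflush_seconds, or explicitly as below).

    WebCrawlerRequest.increment!('Googlebot')   # counted in redis only
    WebCrawlerRequest.increment!('Googlebot')
    WebCrawlerRequest.count                     # => 0, nothing flushed yet

    WebCrawlerRequest.write_cache!              # drain redis into the table
    WebCrawlerRequest.where(user_agent: 'Googlebot').first.count # => 2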
@@ -1,6 +1,8 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
-User-agent: *
+<% @allowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+<% end %>
 Disallow: /auth/cas
 Disallow: /auth/facebook/callback
 Disallow: /auth/twitter/callback
@@ -29,4 +31,12 @@ Disallow: /groups
 Disallow: /groups/
 Disallow: /uploads/
 
+<% if @disallowed_user_agents %>
+<% @disallowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+Disallow: /
+
+<% end %>
+<% end %>
+
 <%= server_plugin_outlet "robots_txt_index" %>
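For illustration (not part of the diff): with whitelisted_crawler_user_agents set to 'Googlebot|Twitterbot', the rendered robots.txt lists the allowed agents above the usual Disallow rules and ends with a blanket block for every other agent, roughly:

    User-agent: Googlebot
    User-agent: Twitterbot
    Disallow: /auth/cas
    (the existing Disallow rules continue unchanged)

    User-agent: *
    Disallow: /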
@@ -968,6 +968,10 @@ en:
       title: "User Visits"
       xaxis: "Day"
       yaxis: "Number of visits"
+    web_crawlers:
+      title: "Web Crawler Requests"
+      xaxis: "User Agent"
+      yaxis: "Pageviews"
 
   dashboard:
     rails_env_warning: "Your server is running in %{env} mode."
@@ -1119,6 +1123,8 @@ en:
     blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse"
     whitelist_internal_hosts: "A list of internal hosts that discourse can safely crawl for oneboxing and other purposes"
     allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
+    whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
+    blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
     top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
     post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
     post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."
@@ -1018,6 +1018,12 @@ security:
     default: 'https://www.google.com/maps/embed?|https://www.openstreetmap.org/export/embed.html?|https://calendar.google.com/calendar/embed?'
     type: list
     client: true
+  whitelisted_crawler_user_agents:
+    type: list
+    default: ''
+  blacklisted_crawler_user_agents:
+    type: list
+    default: ''
 
 onebox:
   enable_flash_video_onebox: false
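Both settings are pipe-delimited lists of user agent fragments, turned into matchers by CrawlerDetection (see the lib change below). A minimal sketch of how they are set, mirroring the specs added by this commit:

    SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot'
    # or, when no whitelist is defined:
    SiteSetting.blacklisted_crawler_user_agents = 'Bananabot'
    # when both are present the whitelist wins and the blacklist is ignored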
@@ -0,0 +1,11 @@
+class CreateWebCrawlerRequests < ActiveRecord::Migration[5.1]
+  def change
+    create_table :web_crawler_requests do |t|
+      t.date :date, null: false
+      t.string :user_agent, null: false
+      t.integer :count, null: false, default: 0
+    end
+
+    add_index :web_crawler_requests, [:date, :user_agent], unique: true
+  end
+end
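The unique (date, user_agent) index appears to be what lets request_id in CachedCounting resolve concurrent flushes: one insert trips the constraint, the rescue fires, and the single retry then finds the row the other process created. A rough sketch of that flow (hypothetical values, interpreting the rescue in the concern):

    # A and B both flush 'Googlebot' for the same date
    # A: create!(date: date, user_agent: 'Googlebot', count: 0)  -> row created
    # B: create!(...)                                            -> raises on the unique index, rescued
    # B, retry: where(date: date, user_agent: 'Googlebot')       -> finds A's row; both then update count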
@@ -28,4 +28,25 @@ module CrawlerDetection
     end
   end
 
+  # Given a user_agent that returns true from crawler?, should its request be allowed?
+  def self.allow_crawler?(user_agent)
+    return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
+      SiteSetting.blacklisted_crawler_user_agents.blank?
+
+    @whitelisted_matchers ||= {}
+    @blacklisted_matchers ||= {}
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
+      !user_agent.nil? && user_agent.match?(whitelisted)
+    else
+      blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
+      user_agent.nil? || !user_agent.match?(blacklisted)
+    end
+  end
+
+  def self.is_blocked_crawler?(user_agent)
+    crawler?(user_agent) && !allow_crawler?(user_agent)
+  end
 end
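A brief behavior sketch, matching the specs added further down:

    SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot'
    CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')
    # => true, matches the whitelist
    CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')
    # => false, not whitelisted
    CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')
    # => true, a crawler that is not allowed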
@@ -83,6 +83,7 @@ class Middleware::RequestTracker
     if track_view
       if data[:is_crawler]
         ApplicationRequest.increment!(:page_view_crawler)
+        WebCrawlerRequest.increment!(data[:user_agent])
       elsif data[:has_auth_cookie]
         ApplicationRequest.increment!(:page_view_logged_in)
         ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
@@ -129,8 +130,9 @@ class Middleware::RequestTracker
       is_mobile: helper.is_mobile?,
       track_view: track_view,
       timing: timing
-    }
+    }.tap do |h|
+      h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler]
+    end
   end
 
   def log_request_info(env, result, info)
@@ -155,12 +157,20 @@ class Middleware::RequestTracker
 
   def call(env)
     result = nil
+    log_request = true
+    request = Rack::Request.new(env)
 
-    if rate_limit(env)
+    if rate_limit(request)
       result = [429, {}, ["Slow down, too Many Requests from this IP Address"]]
       return result
     end
 
+    if block_crawler(request)
+      log_request = false
+      result = [403, { 'Content-Type' => 'text/plain' }, ['Crawler is not allowed']]
+      return result
+    end
+
     env["discourse.request_tracker"] = self
     MethodProfiler.start
     result = @app.call(env)
@@ -186,7 +196,7 @@ class Middleware::RequestTracker
         end
       end
     end
-    log_request_info(env, result, info) unless env["discourse.request_tracker.skip"]
+    log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"]
   end
 
   PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/
@@ -196,7 +206,7 @@ class Middleware::RequestTracker
     !!(ip && ip.to_s.match?(PRIVATE_IP))
   end
 
-  def rate_limit(env)
+  def rate_limit(request)
 
     if (
       GlobalSetting.max_reqs_per_ip_mode == "block" ||
@@ -204,7 +214,7 @@ class Middleware::RequestTracker
       GlobalSetting.max_reqs_per_ip_mode == "warn+block"
     )
 
-      ip = Rack::Request.new(env).ip
+      ip = request.ip
 
       if !GlobalSetting.max_reqs_rate_limit_on_private
         return false if is_private_ip?(ip)
@@ -236,15 +246,15 @@ class Middleware::RequestTracker
         global: true
       )
 
-      env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
-      env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
+      request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
+      request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
 
       warn = GlobalSetting.max_reqs_per_ip_mode == "warn" ||
         GlobalSetting.max_reqs_per_ip_mode == "warn+block"
 
       if !limiter_assets10.can_perform?
         if warn
-          Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}")
+          Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}")
         end
 
         return !(GlobalSetting.max_reqs_per_ip_mode == "warn")
@@ -257,7 +267,7 @@ class Middleware::RequestTracker
         limiter60.performed!
       rescue RateLimiter::LimitExceeded
         if warn
-          Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}")
+          Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}")
           !(GlobalSetting.max_reqs_per_ip_mode == "warn")
         else
           true
@@ -266,6 +276,14 @@ class Middleware::RequestTracker
       end
     end
   end
 
+  def block_crawler(request)
+    request.get? &&
+      !request.xhr? &&
+      request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
+      !request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
+  end
+
   def log_later(data, host)
     Scheduler::Defer.later("Track view", _db = nil) do
       self.class.log_request_on_site(data, host)
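A condensed behavior sketch, using the middleware and env helpers from the request-tracker spec below: only non-XHR GET requests that accept text/html (and are not for robots.txt) are candidates for blocking, and blocked requests are not logged as page views.

    SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

    status, _headers, body = middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))
    # => 403 with body ['Crawler is not allowed']

    status, = middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
                                  'HTTP_ACCEPT' => 'application/json'))
    # => 200, JSON/API requests pass through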
@@ -28,6 +28,12 @@ describe CrawlerDetection do
       expect(described_class.crawler?("Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)")).to eq(true)
       expect(described_class.crawler?("Baiduspider+(+http://www.baidu.com/search/spider.htm)")).to eq(true)
       expect(described_class.crawler?("Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")).to eq(true)
+
+      expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
+      expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
+      expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
+      expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
+      expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
     end
 
     it "returns false for non-crawler user agents" do
@@ -37,13 +43,106 @@ describe CrawlerDetection do
       expect(described_class.crawler?("Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25")).to eq(false)
       expect(described_class.crawler?("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")).to eq(false)
       expect(described_class.crawler?("Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30")).to eq(false)
-
-      expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
-      expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
-      expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
-      expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
-      expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
     end
   end
 
+  describe 'allow_crawler?' do
+    it 'returns true if whitelist and blacklist are blank' do
+      expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+    end
+
+    context 'whitelist is set' do
+      before do
+        SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot'
+      end
+
+      it 'returns true for matching user agents' do
+        expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true)
+      end
+
+      it 'returns false for user agents that do not match' do
+        expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('')).to eq(false)
+      end
+
+      context 'and blacklist is set' do
+        before do
+          SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image'
+        end
+
+        it 'ignores the blacklist' do
+          expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+        end
+      end
+    end
+
+    context 'blacklist is set' do
+      before do
+        SiteSetting.blacklisted_crawler_user_agents = 'Googlebot|Twitterbot'
+      end
+
+      it 'returns true for crawlers that do not match' do
+        expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('')).to eq(true)
+      end
+
+      it 'returns false for user agents that match' do
+        expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false)
+      end
+    end
+  end
+
+  describe 'is_blocked_crawler?' do
+    it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)
+    end
+
+    it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+    end
+
+    it 'is true if user agent is a crawler and is not whitelisted' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+    end
+
+    it 'is false if user agent is not a crawler and there is a whitelist' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+    end
+
+    it 'is true if user agent is a crawler and is blacklisted' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+    end
+
+    it 'is true if user agent is a crawler and is not blacklisted' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+      expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)
+    end
+
+    it 'is false if user agent is not a crawler and blacklist is defined' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Mozilla'
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
    end
+
+    it 'is true if user agent is missing and whitelist is defined' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)
+      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
+    end
+
+    it 'is false if user agent is missing and blacklist is defined' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)
+      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
+    end
+  end
 end
@@ -9,6 +9,7 @@ describe Middleware::RequestTracker do
       "HTTP_USER_AGENT" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
       "REQUEST_URI" => "/path?bla=1",
       "REQUEST_METHOD" => "GET",
+      "HTTP_ACCEPT" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
       "rack.input" => ""
     }.merge(opts)
   end
@@ -277,4 +278,58 @@ describe Middleware::RequestTracker do
       expect(timing[:redis][:calls]).to eq 2
     end
   end
 
+  context "crawler blocking" do
+    let :middleware do
+      app = lambda do |env|
+        [200, {}, ['OK']]
+      end
+
+      Middleware::RequestTracker.new(app)
+    end
+
+    def expect_success_response(status, _, response)
+      expect(status).to eq(200)
+      expect(response).to eq(['OK'])
+    end
+
+    def expect_blocked_response(status, _, response)
+      expect(status).to eq(403)
+      expect(response).to eq(['Crawler is not allowed'])
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      ApplicationRequest.clear_cache!
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect {
+        middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))
+        ApplicationRequest.write_cache!
+      }.to_not change { ApplicationRequest.count }
+    end
+
+    it "allows json requests" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env(
+        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+        'HTTP_ACCEPT' => 'application/json'
+      )))
+    end
+  end
 
 end
@@ -0,0 +1,5 @@
+Fabricator(:web_crawler_request) do
+  user_agent { sequence(:ua) { |i| "Googlebot #{i}.0" } }
+  date Time.zone.now.to_date
+  count 0
+end
@@ -0,0 +1,31 @@
+require 'rails_helper'
+
+describe Jobs::CleanUpCrawlerStats do
+  subject { Jobs::CleanUpCrawlerStats.new.execute({}) }
+
+  it "deletes records older than 30 days old" do
+    freeze_time
+
+    today = Fabricate(:web_crawler_request, date: Time.zone.now.to_date)
+    yesterday = Fabricate(:web_crawler_request, date: 1.day.ago.to_date)
+    too_old = Fabricate(:web_crawler_request, date: 31.days.ago.to_date)
+
+    expect { subject }.to change { WebCrawlerRequest.count }.by(-1)
+    expect(WebCrawlerRequest.where(id: too_old.id)).to_not exist
+  end
+
+  it "keeps only the top records from the previous day" do
+    freeze_time
+
+    WebCrawlerRequest.stubs(:max_records_per_day).returns(3)
+
+    req1 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 100)
+    req4 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 30)
+    req3 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 40)
+    req2 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 50)
+    req5 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 1)
+
+    expect { subject }.to change { WebCrawlerRequest.count }.by(-2)
+    expect(WebCrawlerRequest.all).to contain_exactly(req1, req2, req3)
+  end
+end
@@ -0,0 +1,114 @@
+require 'rails_helper'
+
+describe WebCrawlerRequest do
+  before do
+    WebCrawlerRequest.last_flush = Time.now.utc
+    WebCrawlerRequest.clear_cache!
+  end
+
+  after do
+    WebCrawlerRequest.clear_cache!
+  end
+
+  def inc(user_agent, opts = nil)
+    WebCrawlerRequest.increment!(user_agent, opts)
+  end
+
+  def disable_date_flush!
+    freeze_time(Time.now)
+    WebCrawlerRequest.last_flush = Time.now.utc
+  end
+
+  def web_crawler_request(user_agent)
+    WebCrawlerRequest.where(user_agent: user_agent).first
+  end
+
+  it 'works even if redis is in readonly' do
+    disable_date_flush!
+
+    inc('Googlebot')
+    inc('Googlebot')
+
+    $redis.without_namespace.stubs(:incr).raises(Redis::CommandError.new("READONLY"))
+    $redis.without_namespace.stubs(:eval).raises(Redis::CommandError.new("READONLY"))
+
+    inc('Googlebot', autoflush: 3)
+    WebCrawlerRequest.write_cache!
+
+    $redis.without_namespace.unstub(:incr)
+    $redis.without_namespace.unstub(:eval)
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+  end
+
+  it 'logs nothing for an unflushed increment' do
+    WebCrawlerRequest.increment!('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'can automatically flush' do
+    disable_date_flush!
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    expect(WebCrawlerRequest.count).to eq(0)
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(WebCrawlerRequest.count).to eq(1)
+
+    3.times { inc('Googlebot', autoflush: 3) }
+    expect(web_crawler_request('Googlebot').count).to eq(6)
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'can flush based on time' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    WebCrawlerRequest.write_cache!
+    inc('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+
+    freeze_time(t1 + WebCrawlerRequest.autoflush_seconds + 1)
+    inc('Googlebot')
+
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'flushes yesterdays results' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    inc('Googlebot')
+    freeze_time(t1.tomorrow)
+    inc('Googlebot')
+
+    WebCrawlerRequest.write_cache!
+    expect(WebCrawlerRequest.count).to eq(2)
+  end
+
+  it 'clears cache correctly' do
+    inc('Googlebot')
+    inc('Twitterbot')
+    WebCrawlerRequest.clear_cache!
+    WebCrawlerRequest.write_cache!
+
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'logs a few counts once flushed' do
+    time = Time.now.at_midnight
+    freeze_time(time)
+
+    3.times { inc('Googlebot') }
+    2.times { inc('Twitterbot') }
+    4.times { inc('Bingbot') }
+
+    WebCrawlerRequest.write_cache!
+
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(web_crawler_request('Twitterbot').count).to eq(2)
+    expect(web_crawler_request('Bingbot').count).to eq(4)
+  end
+end
@@ -2,11 +2,64 @@ require 'rails_helper'
 
 RSpec.describe RobotsTxtController do
   describe '#index' do
-    it "returns index when indexing is allowed" do
-      SiteSetting.allow_index_in_robots_txt = true
-      get '/robots.txt'
-
-      expect(response.body).to include("Disallow: /u/")
+    context 'allow_index_in_robots_txt is true' do
+
+      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+        expect(allow_index).to be_present
+        expect(disallow_index).to be_present
+
+        allow_section = allow_index < disallow_index ?
+          response.body[allow_index...disallow_index] : response.body[allow_index..-1]
+
+        expect(allow_section).to include('Disallow: /u/')
+        expect(allow_section).to_not include("Disallow: /\n")
+
+        disallowed_section = allow_index < disallow_index ?
+          response.body[disallow_index..-1] : response.body[disallow_index...allow_index]
+        expect(disallowed_section).to include("Disallow: /\n")
+      end
+
+      it "returns index when indexing is allowed" do
+        SiteSetting.allow_index_in_robots_txt = true
+        get '/robots.txt'
+
+        i = response.body.index('User-agent: *')
+        expect(i).to be_present
+        expect(response.body[i..-1]).to include("Disallow: /u/")
+      end
+
+      it "can whitelist user agents" do
+        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+        get '/robots.txt'
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+
+        allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+        disallow_all_index = response.body.index('User-agent: *')
+
+        expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)
+      end
+
+      it "can blacklist user agents" do
+        SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot"
+        get '/robots.txt'
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+
+        disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+        allow_index = response.body.index('User-agent: *')
+
+        expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+      end
+
+      it "ignores blacklist if whitelist is set" do
+        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+        SiteSetting.blacklisted_crawler_user_agents = "Bananabot"
+        get '/robots.txt'
+        expect(response.body).to_not include('Bananabot')
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+      end
     end
 
     it "returns noindex when indexing is disallowed" do