diff --git a/app/assets/javascripts/admin/models/report.js.es6 b/app/assets/javascripts/admin/models/report.js.es6
index af078528b75..86281929ae0 100644
--- a/app/assets/javascripts/admin/models/report.js.es6
+++ b/app/assets/javascripts/admin/models/report.js.es6
@@ -2,6 +2,7 @@ import { ajax } from 'discourse/lib/ajax';
import round from "discourse/lib/round";
import { fmt } from 'discourse/lib/computed';
import { fillMissingDates } from 'discourse/lib/utilities';
+import computed from 'ember-addons/ember-computed-decorators';
const Report = Discourse.Model.extend({
reportUrl: fmt("type", "/admin/reports/%@"),
@@ -42,7 +43,8 @@ const Report = Discourse.Model.extend({
lastSevenDaysCount: function() { return this.valueFor(1, 7); }.property("data"),
lastThirtyDaysCount: function() { return this.valueFor(1, 30); }.property("data"),
- yesterdayTrend: function() {
+ @computed('data')
+ yesterdayTrend() {
const yesterdayVal = this.valueAt(1);
const twoDaysAgoVal = this.valueAt(2);
if (yesterdayVal > twoDaysAgoVal) {
@@ -52,9 +54,10 @@ const Report = Discourse.Model.extend({
} else {
return "no-change";
}
- }.property("data"),
+ },
- sevenDayTrend: function() {
+ @computed('data')
+ sevenDayTrend() {
const currentPeriod = this.valueFor(1, 7);
const prevPeriod = this.valueFor(8, 14);
if (currentPeriod > prevPeriod) {
@@ -64,36 +67,39 @@ const Report = Discourse.Model.extend({
} else {
return "no-change";
}
- }.property("data"),
+ },
- thirtyDayTrend: function() {
- if (this.get("prev30Days")) {
+ @computed('prev30Days', 'data')
+ thirtyDayTrend(prev30Days) {
+ if (prev30Days) {
const currentPeriod = this.valueFor(1, 30);
if (currentPeriod > this.get("prev30Days")) {
return "trending-up";
- } else if (currentPeriod < this.get("prev30Days")) {
+ } else if (currentPeriod < prev30Days) {
return "trending-down";
}
}
return "no-change";
- }.property("data", "prev30Days"),
+ },
- icon: function() {
- switch (this.get("type")) {
+ @computed('type')
+ icon(type) {
+ switch (type) {
case "flags": return "flag";
case "likes": return "heart";
case "bookmarks": return "bookmark";
default: return null;
}
- }.property("type"),
+ },
- method: function() {
- if (this.get("type") === "time_to_first_response") {
+ @computed('type')
+ method(type) {
+ if (type === "time_to_first_response") {
return "average";
} else {
return "sum";
}
- }.property("type"),
+ },
percentChangeString(val1, val2) {
const val = ((val1 - val2) / val2) * 100;
@@ -114,21 +120,31 @@ const Report = Discourse.Model.extend({
return title;
},
- yesterdayCountTitle: function() {
+ @computed('data')
+ yesterdayCountTitle() {
return this.changeTitle(this.valueAt(1), this.valueAt(2), "two days ago");
- }.property("data"),
+ },
- sevenDayCountTitle: function() {
+ @computed('data')
+ sevenDayCountTitle() {
return this.changeTitle(this.valueFor(1, 7), this.valueFor(8, 14), "two weeks ago");
- }.property("data"),
+ },
- thirtyDayCountTitle: function() {
- return this.changeTitle(this.valueFor(1, 30), this.get("prev30Days"), "in the previous 30 day period");
- }.property("data"),
+ @computed('prev30Days', 'data')
+ thirtyDayCountTitle(prev30Days) {
+ return this.changeTitle(this.valueFor(1, 30), prev30Days, "in the previous 30 day period");
+ },
- dataReversed: function() {
- return this.get("data").toArray().reverse();
- }.property("data")
+ @computed('data')
+ sortedData(data) {
+ return this.get('xAxisIsDate') ? data.toArray().reverse() : data.toArray();
+ },
+
+ @computed('data')
+ xAxisIsDate() {
+ if (!this.data[0]) return false;
+ return this.data && this.data[0].x.match(/\d{4}-\d{1,2}-\d{1,2}/);
+ }
});
@@ -152,6 +168,14 @@ Report.reopenClass({
const model = Report.create({ type: type });
model.setProperties(json.report);
+
+ if (json.report.related_report) {
+ // TODO: fillMissingDates if xaxis is date
+ const related = Report.create({ type: json.report.related_report.type });
+ related.setProperties(json.report.related_report);
+ model.set('relatedReport', related);
+ }
+
return model;
});
}
diff --git a/app/assets/javascripts/admin/templates/components/admin-table-report.hbs b/app/assets/javascripts/admin/templates/components/admin-table-report.hbs
new file mode 100644
index 00000000000..53ea2e51c6e
--- /dev/null
+++ b/app/assets/javascripts/admin/templates/components/admin-table-report.hbs
@@ -0,0 +1,17 @@
+{{#if model.sortedData}}
+
+
+ {{model.xaxis}} |
+ {{model.yaxis}} |
+
+
+ {{#each model.sortedData as |row|}}
+
+ {{row.x}} |
+
+ {{row.y}}
+ |
+
+ {{/each}}
+
+{{/if}}
diff --git a/app/assets/javascripts/admin/templates/reports.hbs b/app/assets/javascripts/admin/templates/reports.hbs
index 3927d1e384f..1a6b4bf0477 100644
--- a/app/assets/javascripts/admin/templates/reports.hbs
+++ b/app/assets/javascripts/admin/templates/reports.hbs
@@ -31,20 +31,10 @@
{{#if viewingGraph}}
{{admin-graph model=model}}
{{else}}
-
-
- {{model.xaxis}} |
- {{model.yaxis}} |
-
+ {{admin-table-report model=model}}
+ {{/if}}
- {{#each model.dataReversed as |row|}}
-
- {{row.x}} |
-
- {{row.y}}
- |
-
- {{/each}}
-
+ {{#if model.relatedReport}}
+ {{admin-table-report model=model.relatedReport}}
{{/if}}
{{/conditional-loading-spinner}}
diff --git a/app/assets/stylesheets/common/admin/admin_base.scss b/app/assets/stylesheets/common/admin/admin_base.scss
index 122a647925b..04708b6e646 100644
--- a/app/assets/stylesheets/common/admin/admin_base.scss
+++ b/app/assets/stylesheets/common/admin/admin_base.scss
@@ -167,6 +167,20 @@ $mobile-breakpoint: 700px;
}
}
+ &.web_crawlers {
+ tr {
+ th:nth-of-type(1) {
+ width: 60%;
+ }
+ }
+ td.x-value {
+ max-width: 0;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+ }
+ }
+
.bar-container {
float: left;
width: 300px;
diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index 4a8600774cc..2315120935f 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -3,7 +3,21 @@ class RobotsTxtController < ApplicationController
skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
def index
- path = SiteSetting.allow_index_in_robots_txt ? :index : :no_index
+ if SiteSetting.allow_index_in_robots_txt
+ path = :index
+ if SiteSetting.whitelisted_crawler_user_agents.present?
+ @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
+ @disallowed_user_agents = ['*']
+ elsif SiteSetting.blacklisted_crawler_user_agents.present?
+ @allowed_user_agents = ['*']
+ @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
+ else
+ @allowed_user_agents = ['*']
+ end
+ else
+ path = :no_index
+ end
+
render path, content_type: 'text/plain'
end
end
diff --git a/app/jobs/scheduled/clean_up_crawler_stats.rb b/app/jobs/scheduled/clean_up_crawler_stats.rb
new file mode 100644
index 00000000000..816e91f0fd2
--- /dev/null
+++ b/app/jobs/scheduled/clean_up_crawler_stats.rb
@@ -0,0 +1,26 @@
+module Jobs
+
+ class CleanUpCrawlerStats < Jobs::Scheduled
+ every 1.day
+
+ def execute(args)
+ WebCrawlerRequest.where('date < ?', WebCrawlerRequest.max_record_age.ago).delete_all
+
+ # keep count of only the top user agents
+ WebCrawlerRequest.exec_sql <<~SQL
+ WITH ranked_requests AS (
+ SELECT row_number() OVER (ORDER BY count DESC) as row_number, id
+ FROM web_crawler_requests
+ WHERE date = '#{1.day.ago.strftime("%Y-%m-%d")}'
+ )
+ DELETE FROM web_crawler_requests
+ WHERE id IN (
+ SELECT ranked_requests.id
+ FROM ranked_requests
+ WHERE row_number > #{WebCrawlerRequest.max_records_per_day}
+ )
+ SQL
+ end
+ end
+
+end
diff --git a/app/models/application_request.rb b/app/models/application_request.rb
index f13e7275a7b..1e4459f681d 100644
--- a/app/models/application_request.rb
+++ b/app/models/application_request.rb
@@ -1,5 +1,6 @@
# frozen_string_literal: true
class ApplicationRequest < ActiveRecord::Base
+
enum req_type: %i(http_total
http_2xx
http_background
@@ -12,41 +13,12 @@ class ApplicationRequest < ActiveRecord::Base
page_view_logged_in_mobile
page_view_anon_mobile)
- cattr_accessor :autoflush, :autoflush_seconds, :last_flush
- # auto flush if backlog is larger than this
- self.autoflush = 2000
-
- # auto flush if older than this
- self.autoflush_seconds = 5.minutes
- self.last_flush = Time.now.utc
+ include CachedCounting
def self.increment!(type, opts = nil)
- key = redis_key(type)
- val = $redis.incr(key).to_i
-
- # readonly mode it is going to be 0, skip
- return if val == 0
-
- # 3.days, see: https://github.com/rails/rails/issues/21296
- $redis.expire(key, 259200)
-
- autoflush = (opts && opts[:autoflush]) || self.autoflush
- if autoflush > 0 && val >= autoflush
- write_cache!
- return
- end
-
- if (Time.now.utc - last_flush).to_i > autoflush_seconds
- write_cache!
- end
+ perform_increment!(redis_key(type), opts)
end
- GET_AND_RESET = <<~LUA
- local val = redis.call('get', KEYS[1])
- redis.call('set', KEYS[1], '0')
- return val
- LUA
-
def self.write_cache!(date = nil)
if date.nil?
write_cache!(Time.now.utc)
@@ -58,13 +30,9 @@ class ApplicationRequest < ActiveRecord::Base
date = date.to_date
- # this may seem a bit fancy but in so it allows
- # for concurrent calls without double counting
req_types.each do |req_type, _|
- key = redis_key(req_type, date)
+ val = get_and_reset(redis_key(req_type, date))
- namespaced_key = $redis.namespace_key(key)
- val = $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
next if val == 0
id = req_id(date, req_type)
diff --git a/app/models/concerns/cached_counting.rb b/app/models/concerns/cached_counting.rb
new file mode 100644
index 00000000000..8f629df70e8
--- /dev/null
+++ b/app/models/concerns/cached_counting.rb
@@ -0,0 +1,67 @@
+module CachedCounting
+ extend ActiveSupport::Concern
+
+ included do
+ class << self
+ attr_accessor :autoflush, :autoflush_seconds, :last_flush
+ end
+
+ # auto flush if backlog is larger than this
+ self.autoflush = 2000
+
+ # auto flush if older than this
+ self.autoflush_seconds = 5.minutes
+
+ self.last_flush = Time.now.utc
+ end
+
+ class_methods do
+ def perform_increment!(key, opts = nil)
+ val = $redis.incr(key).to_i
+
+ # readonly mode it is going to be 0, skip
+ return if val == 0
+
+ # 3.days, see: https://github.com/rails/rails/issues/21296
+ $redis.expire(key, 259200)
+
+ autoflush = (opts && opts[:autoflush]) || self.autoflush
+ if autoflush > 0 && val >= autoflush
+ write_cache!
+ return
+ end
+
+ if (Time.now.utc - last_flush).to_i > autoflush_seconds
+ write_cache!
+ end
+ end
+
+ def write_cache!(date = nil)
+ raise NotImplementedError
+ end
+
+ GET_AND_RESET = <<~LUA
+ local val = redis.call('get', KEYS[1])
+ redis.call('set', KEYS[1], '0')
+ return val
+ LUA
+
+ # this may seem a bit fancy but in so it allows
+ # for concurrent calls without double counting
+ def get_and_reset(key)
+ namespaced_key = $redis.namespace_key(key)
+ $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
+ end
+
+ def request_id(query_params, retries = 0)
+ id = where(query_params).pluck(:id).first
+ id ||= create!(query_params.merge(count: 0)).id
+ rescue # primary key violation
+ if retries == 0
+ request_id(query_params, 1)
+ else
+ raise
+ end
+ end
+ end
+end
diff --git a/app/models/report.rb b/app/models/report.rb
index 0a2871febff..2d6c8a3624c 100644
--- a/app/models/report.rb
+++ b/app/models/report.rb
@@ -27,7 +27,11 @@ class Report
category_id: category_id,
group_id: group_id,
prev30Days: self.prev30Days
- }
+ }.tap do |json|
+ if type == 'page_view_crawler_reqs'
+ json[:related_report] = Report.find('web_crawlers', start_date: start_date, end_date: end_date)&.as_json
+ end
+ end
end
def Report.add_report(name, &block)
@@ -231,4 +235,12 @@ class Report
def self.report_notify_user_private_messages(report)
private_messages_report report, TopicSubtype.notify_user
end
+
+ def self.report_web_crawlers(report)
+ report.data = WebCrawlerRequest.where('date >= ? and date <= ?', report.start_date, report.end_date)
+ .limit(200)
+ .order('sum_count DESC')
+ .group(:user_agent).sum(:count)
+ .map { |ua, count| { x: ua, y: count } }
+ end
end
diff --git a/app/models/web_crawler_request.rb b/app/models/web_crawler_request.rb
new file mode 100644
index 00000000000..5e418d1b1c1
--- /dev/null
+++ b/app/models/web_crawler_request.rb
@@ -0,0 +1,76 @@
+class WebCrawlerRequest < ActiveRecord::Base
+ include CachedCounting
+
+ # auto flush if older than this
+ self.autoflush_seconds = 1.hour
+
+ cattr_accessor :max_record_age, :max_records_per_day
+
+ # only keep the top records based on request count
+ self.max_records_per_day = 200
+
+ # delete records older than this
+ self.max_record_age = 30.days
+
+ def self.increment!(user_agent, opts = nil)
+ ua_list_key = user_agent_list_key
+ $redis.sadd(ua_list_key, user_agent)
+ $redis.expire(ua_list_key, 259200) # 3.days
+
+ perform_increment!(redis_key(user_agent), opts)
+ end
+
+ def self.write_cache!(date = nil)
+ if date.nil?
+ write_cache!(Time.now.utc)
+ write_cache!(Time.now.utc.yesterday)
+ return
+ end
+
+ self.last_flush = Time.now.utc
+
+ date = date.to_date
+ ua_list_key = user_agent_list_key(date)
+
+ while user_agent = $redis.spop(ua_list_key)
+ val = get_and_reset(redis_key(user_agent, date))
+
+ next if val == 0
+
+ self.where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
+ end
+ rescue Redis::CommandError => e
+ raise unless e.message =~ /READONLY/
+ nil
+ end
+
+ def self.clear_cache!(date = nil)
+ if date.nil?
+ clear_cache!(Time.now.utc)
+ clear_cache!(Time.now.utc.yesterday)
+ return
+ end
+
+ list_key = user_agent_list_key(date)
+
+ $redis.smembers(list_key).each do |user_agent, _|
+ $redis.del redis_key(user_agent, date)
+ end
+
+ $redis.del(list_key)
+ end
+
+ protected
+
+ def self.user_agent_list_key(time = Time.now.utc)
+ "crawl_ua_list:#{time.strftime('%Y%m%d')}"
+ end
+
+ def self.redis_key(user_agent, time = Time.now.utc)
+ "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
+ end
+
+ def self.req_id(date, user_agent)
+ request_id(date: date, user_agent: user_agent)
+ end
+end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index abb3cf51fee..4f9bc5ac468 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -1,6 +1,8 @@
# See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
#
-User-agent: *
+<% @allowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+<% end %>
Disallow: /auth/cas
Disallow: /auth/facebook/callback
Disallow: /auth/twitter/callback
@@ -29,4 +31,12 @@ Disallow: /groups
Disallow: /groups/
Disallow: /uploads/
+<% if @disallowed_user_agents %>
+ <% @disallowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+Disallow: /
+
+ <% end %>
+<% end %>
+
<%= server_plugin_outlet "robots_txt_index" %>
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index f00e044a338..bae3db4905c 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -968,6 +968,10 @@ en:
title: "User Visits"
xaxis: "Day"
yaxis: "Number of visits"
+ web_crawlers:
+ title: "Web Crawler Requests"
+ xaxis: "User Agent"
+ yaxis: "Pageviews"
dashboard:
rails_env_warning: "Your server is running in %{env} mode."
@@ -1119,6 +1123,8 @@ en:
blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse"
whitelist_internal_hosts: "A list of internal hosts that discourse can safely crawl for oneboxing and other purposes"
allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
+ whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
+ blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."
diff --git a/config/site_settings.yml b/config/site_settings.yml
index dc00b023fe5..3b1b7f00c27 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -1018,6 +1018,12 @@ security:
default: 'https://www.google.com/maps/embed?|https://www.openstreetmap.org/export/embed.html?|https://calendar.google.com/calendar/embed?'
type: list
client: true
+ whitelisted_crawler_user_agents:
+ type: list
+ default: ''
+ blacklisted_crawler_user_agents:
+ type: list
+ default: ''
onebox:
enable_flash_video_onebox: false
diff --git a/db/migrate/20180320190339_create_web_crawler_requests.rb b/db/migrate/20180320190339_create_web_crawler_requests.rb
new file mode 100644
index 00000000000..f02fd776344
--- /dev/null
+++ b/db/migrate/20180320190339_create_web_crawler_requests.rb
@@ -0,0 +1,11 @@
+class CreateWebCrawlerRequests < ActiveRecord::Migration[5.1]
+ def change
+ create_table :web_crawler_requests do |t|
+ t.date :date, null: false
+ t.string :user_agent, null: false
+ t.integer :count, null: false, default: 0
+ end
+
+ add_index :web_crawler_requests, [:date, :user_agent], unique: true
+ end
+end
diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb
index 2958087a3c2..bf28c6c1aa8 100644
--- a/lib/crawler_detection.rb
+++ b/lib/crawler_detection.rb
@@ -28,4 +28,25 @@ module CrawlerDetection
end
end
+
+ # Given a user_agent that returns true from crawler?, should its request be allowed?
+ def self.allow_crawler?(user_agent)
+ return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
+ SiteSetting.blacklisted_crawler_user_agents.blank?
+
+ @whitelisted_matchers ||= {}
+ @blacklisted_matchers ||= {}
+
+ if SiteSetting.whitelisted_crawler_user_agents.present?
+ whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
+ !user_agent.nil? && user_agent.match?(whitelisted)
+ else
+ blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
+ user_agent.nil? || !user_agent.match?(blacklisted)
+ end
+ end
+
+ def self.is_blocked_crawler?(user_agent)
+ crawler?(user_agent) && !allow_crawler?(user_agent)
+ end
end
diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb
index 2ad01b05215..e7ffb8869a4 100644
--- a/lib/middleware/request_tracker.rb
+++ b/lib/middleware/request_tracker.rb
@@ -83,6 +83,7 @@ class Middleware::RequestTracker
if track_view
if data[:is_crawler]
ApplicationRequest.increment!(:page_view_crawler)
+ WebCrawlerRequest.increment!(data[:user_agent])
elsif data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in)
ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
@@ -129,8 +130,9 @@ class Middleware::RequestTracker
is_mobile: helper.is_mobile?,
track_view: track_view,
timing: timing
- }
-
+ }.tap do |h|
+ h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler]
+ end
end
def log_request_info(env, result, info)
@@ -155,12 +157,20 @@ class Middleware::RequestTracker
def call(env)
result = nil
+ log_request = true
+ request = Rack::Request.new(env)
- if rate_limit(env)
+ if rate_limit(request)
result = [429, {}, ["Slow down, too Many Requests from this IP Address"]]
return result
end
+ if block_crawler(request)
+ log_request = false
+ result = [403, { 'Content-Type' => 'text/plain' }, ['Crawler is not allowed']]
+ return result
+ end
+
env["discourse.request_tracker"] = self
MethodProfiler.start
result = @app.call(env)
@@ -186,7 +196,7 @@ class Middleware::RequestTracker
end
end
end
- log_request_info(env, result, info) unless env["discourse.request_tracker.skip"]
+ log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"]
end
PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/
@@ -196,7 +206,7 @@ class Middleware::RequestTracker
!!(ip && ip.to_s.match?(PRIVATE_IP))
end
- def rate_limit(env)
+ def rate_limit(request)
if (
GlobalSetting.max_reqs_per_ip_mode == "block" ||
@@ -204,7 +214,7 @@ class Middleware::RequestTracker
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
)
- ip = Rack::Request.new(env).ip
+ ip = request.ip
if !GlobalSetting.max_reqs_rate_limit_on_private
return false if is_private_ip?(ip)
@@ -236,15 +246,15 @@ class Middleware::RequestTracker
global: true
)
- env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
- env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
+ request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
+ request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
warn = GlobalSetting.max_reqs_per_ip_mode == "warn" ||
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
if !limiter_assets10.can_perform?
if warn
- Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}")
+ Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}")
end
return !(GlobalSetting.max_reqs_per_ip_mode == "warn")
@@ -257,7 +267,7 @@ class Middleware::RequestTracker
limiter60.performed!
rescue RateLimiter::LimitExceeded
if warn
- Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}")
+ Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}")
!(GlobalSetting.max_reqs_per_ip_mode == "warn")
else
true
@@ -266,6 +276,14 @@ class Middleware::RequestTracker
end
end
+ def block_crawler(request)
+ request.get? &&
+ !request.xhr? &&
+ request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
+ !request.path.ends_with?('robots.txt') &&
+ CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
+ end
+
def log_later(data, host)
Scheduler::Defer.later("Track view", _db = nil) do
self.class.log_request_on_site(data, host)
diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb
index 86b53450427..65f811ef518 100644
--- a/spec/components/crawler_detection_spec.rb
+++ b/spec/components/crawler_detection_spec.rb
@@ -28,6 +28,12 @@ describe CrawlerDetection do
expect(described_class.crawler?("Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)")).to eq(true)
expect(described_class.crawler?("Baiduspider+(+http://www.baidu.com/search/spider.htm)")).to eq(true)
expect(described_class.crawler?("Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")).to eq(true)
+
+ expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
+ expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
+ expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
+ expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
+ expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
end
it "returns false for non-crawler user agents" do
@@ -37,13 +43,106 @@ describe CrawlerDetection do
expect(described_class.crawler?("Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25")).to eq(false)
expect(described_class.crawler?("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")).to eq(false)
expect(described_class.crawler?("Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30")).to eq(false)
-
- expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
- expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
- expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
- expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
- expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
end
end
+
+ describe 'allow_crawler?' do
+ it 'returns true if whitelist and blacklist are blank' do
+ expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+ end
+
+ context 'whitelist is set' do
+ before do
+ SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot'
+ end
+
+ it 'returns true for matching user agents' do
+ expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+ expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+ expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true)
+ end
+
+ it 'returns false for user agents that do not match' do
+ expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false)
+ expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false)
+ expect(CrawlerDetection.allow_crawler?('')).to eq(false)
+ end
+
+ context 'and blacklist is set' do
+ before do
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image'
+ end
+
+ it 'ignores the blacklist' do
+ expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+ end
+ end
+ end
+
+ context 'blacklist is set' do
+ before do
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot|Twitterbot'
+ end
+
+ it 'returns true for crawlers that do not match' do
+ expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true)
+ expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true)
+ expect(CrawlerDetection.allow_crawler?('')).to eq(true)
+ end
+
+ it 'returns false for user agents that match' do
+ expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false)
+ expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false)
+ expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false)
+ end
+ end
+ end
+
+ describe 'is_blocked_crawler?' do
+ it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do
+ expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)
+ end
+
+ it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do
+ expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+ end
+
+ it 'is true if user agent is a crawler and is not whitelisted' do
+ SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+ expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+ end
+
+ it 'is false if user agent is not a crawler and there is a whitelist' do
+ SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+ expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+ end
+
+ it 'is true if user agent is a crawler and is blacklisted' do
+ SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+ expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+ end
+
+ it 'is true if user agent is a crawler and is not blacklisted' do
+ SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+ expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)
+ end
+
+ it 'is false if user agent is not a crawler and blacklist is defined' do
+ SiteSetting.blacklisted_crawler_user_agents = 'Mozilla'
+ expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+ end
+
+ it 'is true if user agent is missing and whitelist is defined' do
+ SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+ expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)
+ expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
+ end
+
+ it 'is false if user agent is missing and blacklist is defined' do
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+ expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)
+ expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
+ end
+ end
end
diff --git a/spec/components/middleware/request_tracker_spec.rb b/spec/components/middleware/request_tracker_spec.rb
index bd47b965ef4..96262e78ed4 100644
--- a/spec/components/middleware/request_tracker_spec.rb
+++ b/spec/components/middleware/request_tracker_spec.rb
@@ -9,6 +9,7 @@ describe Middleware::RequestTracker do
"HTTP_USER_AGENT" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
"REQUEST_URI" => "/path?bla=1",
"REQUEST_METHOD" => "GET",
+ "HTTP_ACCEPT" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"rack.input" => ""
}.merge(opts)
end
@@ -277,4 +278,58 @@ describe Middleware::RequestTracker do
expect(timing[:redis][:calls]).to eq 2
end
end
+
+ context "crawler blocking" do
+ let :middleware do
+ app = lambda do |env|
+ [200, {}, ['OK']]
+ end
+
+ Middleware::RequestTracker.new(app)
+ end
+
+ def expect_success_response(status, _, response)
+ expect(status).to eq(200)
+ expect(response).to eq(['OK'])
+ end
+
+ def expect_blocked_response(status, _, response)
+ expect(status).to eq(403)
+ expect(response).to eq(['Crawler is not allowed'])
+ end
+
+ it "applies whitelisted_crawler_user_agents correctly" do
+ SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+ expect_success_response(*middleware.call(env))
+ expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+ expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+ expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+ end
+
+ it "applies blacklisted_crawler_user_agents correctly" do
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+ expect_success_response(*middleware.call(env))
+ expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+ expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+ expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+ end
+
+ it "blocked crawlers shouldn't log page views" do
+ ApplicationRequest.clear_cache!
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+ expect {
+ middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))
+ ApplicationRequest.write_cache!
+ }.to_not change { ApplicationRequest.count }
+ end
+
+ it "allows json requests" do
+ SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+ expect_success_response(*middleware.call(env(
+ 'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+ 'HTTP_ACCEPT' => 'application/json'
+ )))
+ end
+ end
+
end
diff --git a/spec/fabricators/web_crawler_request_fabricator.rb b/spec/fabricators/web_crawler_request_fabricator.rb
new file mode 100644
index 00000000000..ed678cc8876
--- /dev/null
+++ b/spec/fabricators/web_crawler_request_fabricator.rb
@@ -0,0 +1,5 @@
+Fabricator(:web_crawler_request) do
+ user_agent { sequence(:ua) { |i| "Googlebot #{i}.0" } }
+ date Time.zone.now.to_date
+ count 0
+end
diff --git a/spec/jobs/clean_up_crawler_stats_spec.rb b/spec/jobs/clean_up_crawler_stats_spec.rb
new file mode 100644
index 00000000000..46c069e9e6f
--- /dev/null
+++ b/spec/jobs/clean_up_crawler_stats_spec.rb
@@ -0,0 +1,31 @@
+require 'rails_helper'
+
+describe Jobs::CleanUpCrawlerStats do
+ subject { Jobs::CleanUpCrawlerStats.new.execute({}) }
+
+ it "deletes records older than 30 days old" do
+ freeze_time
+
+ today = Fabricate(:web_crawler_request, date: Time.zone.now.to_date)
+ yesterday = Fabricate(:web_crawler_request, date: 1.day.ago.to_date)
+ too_old = Fabricate(:web_crawler_request, date: 31.days.ago.to_date)
+
+ expect { subject }.to change { WebCrawlerRequest.count }.by(-1)
+ expect(WebCrawlerRequest.where(id: too_old.id)).to_not exist
+ end
+
+ it "keeps only the top records from the previous day" do
+ freeze_time
+
+ WebCrawlerRequest.stubs(:max_records_per_day).returns(3)
+
+ req1 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 100)
+ req4 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 30)
+ req3 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 40)
+ req2 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 50)
+ req5 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 1)
+
+ expect { subject }.to change { WebCrawlerRequest.count }.by(-2)
+ expect(WebCrawlerRequest.all).to contain_exactly(req1, req2, req3)
+ end
+end
diff --git a/spec/models/web_crawler_request_spec.rb b/spec/models/web_crawler_request_spec.rb
new file mode 100644
index 00000000000..03f8da5a866
--- /dev/null
+++ b/spec/models/web_crawler_request_spec.rb
@@ -0,0 +1,114 @@
+require 'rails_helper'
+
+describe WebCrawlerRequest do
+ before do
+ WebCrawlerRequest.last_flush = Time.now.utc
+ WebCrawlerRequest.clear_cache!
+ end
+
+ after do
+ WebCrawlerRequest.clear_cache!
+ end
+
+ def inc(user_agent, opts = nil)
+ WebCrawlerRequest.increment!(user_agent, opts)
+ end
+
+ def disable_date_flush!
+ freeze_time(Time.now)
+ WebCrawlerRequest.last_flush = Time.now.utc
+ end
+
+ def web_crawler_request(user_agent)
+ WebCrawlerRequest.where(user_agent: user_agent).first
+ end
+
+ it 'works even if redis is in readonly' do
+ disable_date_flush!
+
+ inc('Googlebot')
+ inc('Googlebot')
+
+ $redis.without_namespace.stubs(:incr).raises(Redis::CommandError.new("READONLY"))
+ $redis.without_namespace.stubs(:eval).raises(Redis::CommandError.new("READONLY"))
+
+ inc('Googlebot', autoflush: 3)
+ WebCrawlerRequest.write_cache!
+
+ $redis.without_namespace.unstub(:incr)
+ $redis.without_namespace.unstub(:eval)
+
+ inc('Googlebot', autoflush: 3)
+ expect(web_crawler_request('Googlebot').count).to eq(3)
+ end
+
+ it 'logs nothing for an unflushed increment' do
+ WebCrawlerRequest.increment!('Googlebot')
+ expect(WebCrawlerRequest.count).to eq(0)
+ end
+
+ it 'can automatically flush' do
+ disable_date_flush!
+
+ inc('Googlebot', autoflush: 3)
+ expect(web_crawler_request('Googlebot')).to_not be_present
+ expect(WebCrawlerRequest.count).to eq(0)
+ inc('Googlebot', autoflush: 3)
+ expect(web_crawler_request('Googlebot')).to_not be_present
+ inc('Googlebot', autoflush: 3)
+ expect(web_crawler_request('Googlebot').count).to eq(3)
+ expect(WebCrawlerRequest.count).to eq(1)
+
+ 3.times { inc('Googlebot', autoflush: 3) }
+ expect(web_crawler_request('Googlebot').count).to eq(6)
+ expect(WebCrawlerRequest.count).to eq(1)
+ end
+
+ it 'can flush based on time' do
+ t1 = Time.now.utc.at_midnight
+ freeze_time(t1)
+ WebCrawlerRequest.write_cache!
+ inc('Googlebot')
+ expect(WebCrawlerRequest.count).to eq(0)
+
+ freeze_time(t1 + WebCrawlerRequest.autoflush_seconds + 1)
+ inc('Googlebot')
+
+ expect(WebCrawlerRequest.count).to eq(1)
+ end
+
+ it 'flushes yesterdays results' do
+ t1 = Time.now.utc.at_midnight
+ freeze_time(t1)
+ inc('Googlebot')
+ freeze_time(t1.tomorrow)
+ inc('Googlebot')
+
+ WebCrawlerRequest.write_cache!
+ expect(WebCrawlerRequest.count).to eq(2)
+ end
+
+ it 'clears cache correctly' do
+ inc('Googlebot')
+ inc('Twitterbot')
+ WebCrawlerRequest.clear_cache!
+ WebCrawlerRequest.write_cache!
+
+ expect(WebCrawlerRequest.count).to eq(0)
+ end
+
+ it 'logs a few counts once flushed' do
+ time = Time.now.at_midnight
+ freeze_time(time)
+
+ 3.times { inc('Googlebot') }
+ 2.times { inc('Twitterbot') }
+ 4.times { inc('Bingbot') }
+
+ WebCrawlerRequest.write_cache!
+
+ expect(web_crawler_request('Googlebot').count).to eq(3)
+ expect(web_crawler_request('Twitterbot').count).to eq(2)
+ expect(web_crawler_request('Bingbot').count).to eq(4)
+ end
+end
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index 68d5912be41..be3590e8d8f 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -2,11 +2,64 @@ require 'rails_helper'
RSpec.describe RobotsTxtController do
describe '#index' do
- it "returns index when indexing is allowed" do
- SiteSetting.allow_index_in_robots_txt = true
- get '/robots.txt'
+ context 'allow_index_in_robots_txt is true' do
- expect(response.body).to include("Disallow: /u/")
+ def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+ expect(allow_index).to be_present
+ expect(disallow_index).to be_present
+
+ allow_section = allow_index < disallow_index ?
+ response.body[allow_index...disallow_index] : response.body[allow_index..-1]
+
+ expect(allow_section).to include('Disallow: /u/')
+ expect(allow_section).to_not include("Disallow: /\n")
+
+ disallowed_section = allow_index < disallow_index ?
+ response.body[disallow_index..-1] : response.body[disallow_index...allow_index]
+ expect(disallowed_section).to include("Disallow: /\n")
+ end
+
+ it "returns index when indexing is allowed" do
+ SiteSetting.allow_index_in_robots_txt = true
+ get '/robots.txt'
+
+ i = response.body.index('User-agent: *')
+ expect(i).to be_present
+ expect(response.body[i..-1]).to include("Disallow: /u/")
+ end
+
+ it "can whitelist user agents" do
+ SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+ get '/robots.txt'
+ expect(response.body).to include('User-agent: Googlebot')
+ expect(response.body).to include('User-agent: Twitterbot')
+
+ allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+ disallow_all_index = response.body.index('User-agent: *')
+
+ expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)
+ end
+
+ it "can blacklist user agents" do
+ SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot"
+ get '/robots.txt'
+ expect(response.body).to include('User-agent: Googlebot')
+ expect(response.body).to include('User-agent: Twitterbot')
+
+ disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+ allow_index = response.body.index('User-agent: *')
+
+ expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+ end
+
+ it "ignores blacklist if whitelist is set" do
+ SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+ SiteSetting.blacklisted_crawler_user_agents = "Bananabot"
+ get '/robots.txt'
+ expect(response.body).to_not include('Bananabot')
+ expect(response.body).to include('User-agent: Googlebot')
+ expect(response.body).to include('User-agent: Twitterbot')
+ end
end
it "returns noindex when indexing is disallowed" do