diff --git a/app/assets/javascripts/admin/models/report.js.es6 b/app/assets/javascripts/admin/models/report.js.es6 index af078528b75..86281929ae0 100644 --- a/app/assets/javascripts/admin/models/report.js.es6 +++ b/app/assets/javascripts/admin/models/report.js.es6 @@ -2,6 +2,7 @@ import { ajax } from 'discourse/lib/ajax'; import round from "discourse/lib/round"; import { fmt } from 'discourse/lib/computed'; import { fillMissingDates } from 'discourse/lib/utilities'; +import computed from 'ember-addons/ember-computed-decorators'; const Report = Discourse.Model.extend({ reportUrl: fmt("type", "/admin/reports/%@"), @@ -42,7 +43,8 @@ const Report = Discourse.Model.extend({ lastSevenDaysCount: function() { return this.valueFor(1, 7); }.property("data"), lastThirtyDaysCount: function() { return this.valueFor(1, 30); }.property("data"), - yesterdayTrend: function() { + @computed('data') + yesterdayTrend() { const yesterdayVal = this.valueAt(1); const twoDaysAgoVal = this.valueAt(2); if (yesterdayVal > twoDaysAgoVal) { @@ -52,9 +54,10 @@ const Report = Discourse.Model.extend({ } else { return "no-change"; } - }.property("data"), + }, - sevenDayTrend: function() { + @computed('data') + sevenDayTrend() { const currentPeriod = this.valueFor(1, 7); const prevPeriod = this.valueFor(8, 14); if (currentPeriod > prevPeriod) { @@ -64,36 +67,39 @@ const Report = Discourse.Model.extend({ } else { return "no-change"; } - }.property("data"), + }, - thirtyDayTrend: function() { - if (this.get("prev30Days")) { + @computed('prev30Days', 'data') + thirtyDayTrend(prev30Days) { + if (prev30Days) { const currentPeriod = this.valueFor(1, 30); if (currentPeriod > this.get("prev30Days")) { return "trending-up"; - } else if (currentPeriod < this.get("prev30Days")) { + } else if (currentPeriod < prev30Days) { return "trending-down"; } } return "no-change"; - }.property("data", "prev30Days"), + }, - icon: function() { - switch (this.get("type")) { + @computed('type') + icon(type) { + switch (type) { case "flags": return "flag"; case "likes": return "heart"; case "bookmarks": return "bookmark"; default: return null; } - }.property("type"), + }, - method: function() { - if (this.get("type") === "time_to_first_response") { + @computed('type') + method(type) { + if (type === "time_to_first_response") { return "average"; } else { return "sum"; } - }.property("type"), + }, percentChangeString(val1, val2) { const val = ((val1 - val2) / val2) * 100; @@ -114,21 +120,31 @@ const Report = Discourse.Model.extend({ return title; }, - yesterdayCountTitle: function() { + @computed('data') + yesterdayCountTitle() { return this.changeTitle(this.valueAt(1), this.valueAt(2), "two days ago"); - }.property("data"), + }, - sevenDayCountTitle: function() { + @computed('data') + sevenDayCountTitle() { return this.changeTitle(this.valueFor(1, 7), this.valueFor(8, 14), "two weeks ago"); - }.property("data"), + }, - thirtyDayCountTitle: function() { - return this.changeTitle(this.valueFor(1, 30), this.get("prev30Days"), "in the previous 30 day period"); - }.property("data"), + @computed('prev30Days', 'data') + thirtyDayCountTitle(prev30Days) { + return this.changeTitle(this.valueFor(1, 30), prev30Days, "in the previous 30 day period"); + }, - dataReversed: function() { - return this.get("data").toArray().reverse(); - }.property("data") + @computed('data') + sortedData(data) { + return this.get('xAxisIsDate') ? data.toArray().reverse() : data.toArray(); + }, + + @computed('data') + xAxisIsDate() { + if (!this.data[0]) return false; + return this.data && this.data[0].x.match(/\d{4}-\d{1,2}-\d{1,2}/); + } }); @@ -152,6 +168,14 @@ Report.reopenClass({ const model = Report.create({ type: type }); model.setProperties(json.report); + + if (json.report.related_report) { + // TODO: fillMissingDates if xaxis is date + const related = Report.create({ type: json.report.related_report.type }); + related.setProperties(json.report.related_report); + model.set('relatedReport', related); + } + return model; }); } diff --git a/app/assets/javascripts/admin/templates/components/admin-table-report.hbs b/app/assets/javascripts/admin/templates/components/admin-table-report.hbs new file mode 100644 index 00000000000..53ea2e51c6e --- /dev/null +++ b/app/assets/javascripts/admin/templates/components/admin-table-report.hbs @@ -0,0 +1,17 @@ +{{#if model.sortedData}} + + + + + + + {{#each model.sortedData as |row|}} + + + + + {{/each}} +
{{model.xaxis}}{{model.yaxis}}
{{row.x}} + {{row.y}} +
+{{/if}} diff --git a/app/assets/javascripts/admin/templates/reports.hbs b/app/assets/javascripts/admin/templates/reports.hbs index 3927d1e384f..1a6b4bf0477 100644 --- a/app/assets/javascripts/admin/templates/reports.hbs +++ b/app/assets/javascripts/admin/templates/reports.hbs @@ -31,20 +31,10 @@ {{#if viewingGraph}} {{admin-graph model=model}} {{else}} - - - - - + {{admin-table-report model=model}} + {{/if}} - {{#each model.dataReversed as |row|}} - - - - - {{/each}} -
{{model.xaxis}}{{model.yaxis}}
{{row.x}} - {{row.y}} -
+ {{#if model.relatedReport}} + {{admin-table-report model=model.relatedReport}} {{/if}} {{/conditional-loading-spinner}} diff --git a/app/assets/stylesheets/common/admin/admin_base.scss b/app/assets/stylesheets/common/admin/admin_base.scss index 122a647925b..04708b6e646 100644 --- a/app/assets/stylesheets/common/admin/admin_base.scss +++ b/app/assets/stylesheets/common/admin/admin_base.scss @@ -167,6 +167,20 @@ $mobile-breakpoint: 700px; } } + &.web_crawlers { + tr { + th:nth-of-type(1) { + width: 60%; + } + } + td.x-value { + max-width: 0; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + } + } + .bar-container { float: left; width: 300px; diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb index 4a8600774cc..2315120935f 100644 --- a/app/controllers/robots_txt_controller.rb +++ b/app/controllers/robots_txt_controller.rb @@ -3,7 +3,21 @@ class RobotsTxtController < ApplicationController skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required def index - path = SiteSetting.allow_index_in_robots_txt ? :index : :no_index + if SiteSetting.allow_index_in_robots_txt + path = :index + if SiteSetting.whitelisted_crawler_user_agents.present? + @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|') + @disallowed_user_agents = ['*'] + elsif SiteSetting.blacklisted_crawler_user_agents.present? + @allowed_user_agents = ['*'] + @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|') + else + @allowed_user_agents = ['*'] + end + else + path = :no_index + end + render path, content_type: 'text/plain' end end diff --git a/app/jobs/scheduled/clean_up_crawler_stats.rb b/app/jobs/scheduled/clean_up_crawler_stats.rb new file mode 100644 index 00000000000..816e91f0fd2 --- /dev/null +++ b/app/jobs/scheduled/clean_up_crawler_stats.rb @@ -0,0 +1,26 @@ +module Jobs + + class CleanUpCrawlerStats < Jobs::Scheduled + every 1.day + + def execute(args) + WebCrawlerRequest.where('date < ?', WebCrawlerRequest.max_record_age.ago).delete_all + + # keep count of only the top user agents + WebCrawlerRequest.exec_sql <<~SQL + WITH ranked_requests AS ( + SELECT row_number() OVER (ORDER BY count DESC) as row_number, id + FROM web_crawler_requests + WHERE date = '#{1.day.ago.strftime("%Y-%m-%d")}' + ) + DELETE FROM web_crawler_requests + WHERE id IN ( + SELECT ranked_requests.id + FROM ranked_requests + WHERE row_number > #{WebCrawlerRequest.max_records_per_day} + ) + SQL + end + end + +end diff --git a/app/models/application_request.rb b/app/models/application_request.rb index f13e7275a7b..1e4459f681d 100644 --- a/app/models/application_request.rb +++ b/app/models/application_request.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true class ApplicationRequest < ActiveRecord::Base + enum req_type: %i(http_total http_2xx http_background @@ -12,41 +13,12 @@ class ApplicationRequest < ActiveRecord::Base page_view_logged_in_mobile page_view_anon_mobile) - cattr_accessor :autoflush, :autoflush_seconds, :last_flush - # auto flush if backlog is larger than this - self.autoflush = 2000 - - # auto flush if older than this - self.autoflush_seconds = 5.minutes - self.last_flush = Time.now.utc + include CachedCounting def self.increment!(type, opts = nil) - key = redis_key(type) - val = $redis.incr(key).to_i - - # readonly mode it is going to be 0, skip - return if val == 0 - - # 3.days, see: https://github.com/rails/rails/issues/21296 - $redis.expire(key, 259200) - - autoflush = (opts && opts[:autoflush]) || self.autoflush - if autoflush > 0 && val >= autoflush - write_cache! - return - end - - if (Time.now.utc - last_flush).to_i > autoflush_seconds - write_cache! - end + perform_increment!(redis_key(type), opts) end - GET_AND_RESET = <<~LUA - local val = redis.call('get', KEYS[1]) - redis.call('set', KEYS[1], '0') - return val - LUA - def self.write_cache!(date = nil) if date.nil? write_cache!(Time.now.utc) @@ -58,13 +30,9 @@ class ApplicationRequest < ActiveRecord::Base date = date.to_date - # this may seem a bit fancy but in so it allows - # for concurrent calls without double counting req_types.each do |req_type, _| - key = redis_key(req_type, date) + val = get_and_reset(redis_key(req_type, date)) - namespaced_key = $redis.namespace_key(key) - val = $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i next if val == 0 id = req_id(date, req_type) diff --git a/app/models/concerns/cached_counting.rb b/app/models/concerns/cached_counting.rb new file mode 100644 index 00000000000..8f629df70e8 --- /dev/null +++ b/app/models/concerns/cached_counting.rb @@ -0,0 +1,67 @@ +module CachedCounting + extend ActiveSupport::Concern + + included do + class << self + attr_accessor :autoflush, :autoflush_seconds, :last_flush + end + + # auto flush if backlog is larger than this + self.autoflush = 2000 + + # auto flush if older than this + self.autoflush_seconds = 5.minutes + + self.last_flush = Time.now.utc + end + + class_methods do + def perform_increment!(key, opts = nil) + val = $redis.incr(key).to_i + + # readonly mode it is going to be 0, skip + return if val == 0 + + # 3.days, see: https://github.com/rails/rails/issues/21296 + $redis.expire(key, 259200) + + autoflush = (opts && opts[:autoflush]) || self.autoflush + if autoflush > 0 && val >= autoflush + write_cache! + return + end + + if (Time.now.utc - last_flush).to_i > autoflush_seconds + write_cache! + end + end + + def write_cache!(date = nil) + raise NotImplementedError + end + + GET_AND_RESET = <<~LUA + local val = redis.call('get', KEYS[1]) + redis.call('set', KEYS[1], '0') + return val + LUA + + # this may seem a bit fancy but in so it allows + # for concurrent calls without double counting + def get_and_reset(key) + namespaced_key = $redis.namespace_key(key) + $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i + end + + def request_id(query_params, retries = 0) + id = where(query_params).pluck(:id).first + id ||= create!(query_params.merge(count: 0)).id + rescue # primary key violation + if retries == 0 + request_id(query_params, 1) + else + raise + end + end + end +end diff --git a/app/models/report.rb b/app/models/report.rb index 0a2871febff..2d6c8a3624c 100644 --- a/app/models/report.rb +++ b/app/models/report.rb @@ -27,7 +27,11 @@ class Report category_id: category_id, group_id: group_id, prev30Days: self.prev30Days - } + }.tap do |json| + if type == 'page_view_crawler_reqs' + json[:related_report] = Report.find('web_crawlers', start_date: start_date, end_date: end_date)&.as_json + end + end end def Report.add_report(name, &block) @@ -231,4 +235,12 @@ class Report def self.report_notify_user_private_messages(report) private_messages_report report, TopicSubtype.notify_user end + + def self.report_web_crawlers(report) + report.data = WebCrawlerRequest.where('date >= ? and date <= ?', report.start_date, report.end_date) + .limit(200) + .order('sum_count DESC') + .group(:user_agent).sum(:count) + .map { |ua, count| { x: ua, y: count } } + end end diff --git a/app/models/web_crawler_request.rb b/app/models/web_crawler_request.rb new file mode 100644 index 00000000000..5e418d1b1c1 --- /dev/null +++ b/app/models/web_crawler_request.rb @@ -0,0 +1,76 @@ +class WebCrawlerRequest < ActiveRecord::Base + include CachedCounting + + # auto flush if older than this + self.autoflush_seconds = 1.hour + + cattr_accessor :max_record_age, :max_records_per_day + + # only keep the top records based on request count + self.max_records_per_day = 200 + + # delete records older than this + self.max_record_age = 30.days + + def self.increment!(user_agent, opts = nil) + ua_list_key = user_agent_list_key + $redis.sadd(ua_list_key, user_agent) + $redis.expire(ua_list_key, 259200) # 3.days + + perform_increment!(redis_key(user_agent), opts) + end + + def self.write_cache!(date = nil) + if date.nil? + write_cache!(Time.now.utc) + write_cache!(Time.now.utc.yesterday) + return + end + + self.last_flush = Time.now.utc + + date = date.to_date + ua_list_key = user_agent_list_key(date) + + while user_agent = $redis.spop(ua_list_key) + val = get_and_reset(redis_key(user_agent, date)) + + next if val == 0 + + self.where(id: req_id(date, user_agent)).update_all(["count = count + ?", val]) + end + rescue Redis::CommandError => e + raise unless e.message =~ /READONLY/ + nil + end + + def self.clear_cache!(date = nil) + if date.nil? + clear_cache!(Time.now.utc) + clear_cache!(Time.now.utc.yesterday) + return + end + + list_key = user_agent_list_key(date) + + $redis.smembers(list_key).each do |user_agent, _| + $redis.del redis_key(user_agent, date) + end + + $redis.del(list_key) + end + + protected + + def self.user_agent_list_key(time = Time.now.utc) + "crawl_ua_list:#{time.strftime('%Y%m%d')}" + end + + def self.redis_key(user_agent, time = Time.now.utc) + "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}" + end + + def self.req_id(date, user_agent) + request_id(date: date, user_agent: user_agent) + end +end diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb index abb3cf51fee..4f9bc5ac468 100644 --- a/app/views/robots_txt/index.erb +++ b/app/views/robots_txt/index.erb @@ -1,6 +1,8 @@ # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file # -User-agent: * +<% @allowed_user_agents.each do |user_agent| %> +User-agent: <%= user_agent %> +<% end %> Disallow: /auth/cas Disallow: /auth/facebook/callback Disallow: /auth/twitter/callback @@ -29,4 +31,12 @@ Disallow: /groups Disallow: /groups/ Disallow: /uploads/ +<% if @disallowed_user_agents %> + <% @disallowed_user_agents.each do |user_agent| %> +User-agent: <%= user_agent %> +Disallow: / + + <% end %> +<% end %> + <%= server_plugin_outlet "robots_txt_index" %> diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index f00e044a338..bae3db4905c 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -968,6 +968,10 @@ en: title: "User Visits" xaxis: "Day" yaxis: "Number of visits" + web_crawlers: + title: "Web Crawler Requests" + xaxis: "User Agent" + yaxis: "Pageviews" dashboard: rails_env_warning: "Your server is running in %{env} mode." @@ -1119,6 +1123,8 @@ en: blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse" whitelist_internal_hosts: "A list of internal hosts that discourse can safely crawl for oneboxing and other purposes" allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts" + whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.' + blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.' top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks" post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply" post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on." diff --git a/config/site_settings.yml b/config/site_settings.yml index dc00b023fe5..3b1b7f00c27 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1018,6 +1018,12 @@ security: default: 'https://www.google.com/maps/embed?|https://www.openstreetmap.org/export/embed.html?|https://calendar.google.com/calendar/embed?' type: list client: true + whitelisted_crawler_user_agents: + type: list + default: '' + blacklisted_crawler_user_agents: + type: list + default: '' onebox: enable_flash_video_onebox: false diff --git a/db/migrate/20180320190339_create_web_crawler_requests.rb b/db/migrate/20180320190339_create_web_crawler_requests.rb new file mode 100644 index 00000000000..f02fd776344 --- /dev/null +++ b/db/migrate/20180320190339_create_web_crawler_requests.rb @@ -0,0 +1,11 @@ +class CreateWebCrawlerRequests < ActiveRecord::Migration[5.1] + def change + create_table :web_crawler_requests do |t| + t.date :date, null: false + t.string :user_agent, null: false + t.integer :count, null: false, default: 0 + end + + add_index :web_crawler_requests, [:date, :user_agent], unique: true + end +end diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb index 2958087a3c2..bf28c6c1aa8 100644 --- a/lib/crawler_detection.rb +++ b/lib/crawler_detection.rb @@ -28,4 +28,25 @@ module CrawlerDetection end end + + # Given a user_agent that returns true from crawler?, should its request be allowed? + def self.allow_crawler?(user_agent) + return true if SiteSetting.whitelisted_crawler_user_agents.blank? && + SiteSetting.blacklisted_crawler_user_agents.blank? + + @whitelisted_matchers ||= {} + @blacklisted_matchers ||= {} + + if SiteSetting.whitelisted_crawler_user_agents.present? + whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents) + !user_agent.nil? && user_agent.match?(whitelisted) + else + blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents) + user_agent.nil? || !user_agent.match?(blacklisted) + end + end + + def self.is_blocked_crawler?(user_agent) + crawler?(user_agent) && !allow_crawler?(user_agent) + end end diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb index 2ad01b05215..e7ffb8869a4 100644 --- a/lib/middleware/request_tracker.rb +++ b/lib/middleware/request_tracker.rb @@ -83,6 +83,7 @@ class Middleware::RequestTracker if track_view if data[:is_crawler] ApplicationRequest.increment!(:page_view_crawler) + WebCrawlerRequest.increment!(data[:user_agent]) elsif data[:has_auth_cookie] ApplicationRequest.increment!(:page_view_logged_in) ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile] @@ -129,8 +130,9 @@ class Middleware::RequestTracker is_mobile: helper.is_mobile?, track_view: track_view, timing: timing - } - + }.tap do |h| + h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler] + end end def log_request_info(env, result, info) @@ -155,12 +157,20 @@ class Middleware::RequestTracker def call(env) result = nil + log_request = true + request = Rack::Request.new(env) - if rate_limit(env) + if rate_limit(request) result = [429, {}, ["Slow down, too Many Requests from this IP Address"]] return result end + if block_crawler(request) + log_request = false + result = [403, { 'Content-Type' => 'text/plain' }, ['Crawler is not allowed']] + return result + end + env["discourse.request_tracker"] = self MethodProfiler.start result = @app.call(env) @@ -186,7 +196,7 @@ class Middleware::RequestTracker end end end - log_request_info(env, result, info) unless env["discourse.request_tracker.skip"] + log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"] end PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/ @@ -196,7 +206,7 @@ class Middleware::RequestTracker !!(ip && ip.to_s.match?(PRIVATE_IP)) end - def rate_limit(env) + def rate_limit(request) if ( GlobalSetting.max_reqs_per_ip_mode == "block" || @@ -204,7 +214,7 @@ class Middleware::RequestTracker GlobalSetting.max_reqs_per_ip_mode == "warn+block" ) - ip = Rack::Request.new(env).ip + ip = request.ip if !GlobalSetting.max_reqs_rate_limit_on_private return false if is_private_ip?(ip) @@ -236,15 +246,15 @@ class Middleware::RequestTracker global: true ) - env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60] - env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10] + request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60] + request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10] warn = GlobalSetting.max_reqs_per_ip_mode == "warn" || GlobalSetting.max_reqs_per_ip_mode == "warn+block" if !limiter_assets10.can_perform? if warn - Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}") + Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}") end return !(GlobalSetting.max_reqs_per_ip_mode == "warn") @@ -257,7 +267,7 @@ class Middleware::RequestTracker limiter60.performed! rescue RateLimiter::LimitExceeded if warn - Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}") + Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}") !(GlobalSetting.max_reqs_per_ip_mode == "warn") else true @@ -266,6 +276,14 @@ class Middleware::RequestTracker end end + def block_crawler(request) + request.get? && + !request.xhr? && + request.env['HTTP_ACCEPT'] =~ /text\/html/ && + !request.path.ends_with?('robots.txt') && + CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT']) + end + def log_later(data, host) Scheduler::Defer.later("Track view", _db = nil) do self.class.log_request_on_site(data, host) diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb index 86b53450427..65f811ef518 100644 --- a/spec/components/crawler_detection_spec.rb +++ b/spec/components/crawler_detection_spec.rb @@ -28,6 +28,12 @@ describe CrawlerDetection do expect(described_class.crawler?("Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)")).to eq(true) expect(described_class.crawler?("Baiduspider+(+http://www.baidu.com/search/spider.htm)")).to eq(true) expect(described_class.crawler?("Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")).to eq(true) + + expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true) + expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true) + expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true) + expect(described_class.crawler?("Java/1.8.0_151")).to eq(true) + expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true) end it "returns false for non-crawler user agents" do @@ -37,13 +43,106 @@ describe CrawlerDetection do expect(described_class.crawler?("Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25")).to eq(false) expect(described_class.crawler?("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")).to eq(false) expect(described_class.crawler?("Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30")).to eq(false) - - expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true) - expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true) - expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true) - expect(described_class.crawler?("Java/1.8.0_151")).to eq(true) - expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true) end end + + describe 'allow_crawler?' do + it 'returns true if whitelist and blacklist are blank' do + expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true) + end + + context 'whitelist is set' do + before do + SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot' + end + + it 'returns true for matching user agents' do + expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true) + expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true) + expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true) + end + + it 'returns false for user agents that do not match' do + expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false) + expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false) + expect(CrawlerDetection.allow_crawler?('')).to eq(false) + end + + context 'and blacklist is set' do + before do + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image' + end + + it 'ignores the blacklist' do + expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true) + end + end + end + + context 'blacklist is set' do + before do + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot|Twitterbot' + end + + it 'returns true for crawlers that do not match' do + expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true) + expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true) + expect(CrawlerDetection.allow_crawler?('')).to eq(true) + end + + it 'returns false for user agents that match' do + expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false) + expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false) + expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false) + end + end + end + + describe 'is_blocked_crawler?' do + it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do + expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false) + end + + it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do + expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false) + end + + it 'is true if user agent is a crawler and is not whitelisted' do + SiteSetting.whitelisted_crawler_user_agents = 'Googlebot' + expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true) + end + + it 'is false if user agent is not a crawler and there is a whitelist' do + SiteSetting.whitelisted_crawler_user_agents = 'Googlebot' + expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false) + end + + it 'is true if user agent is a crawler and is blacklisted' do + SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot' + expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true) + end + + it 'is true if user agent is a crawler and is not blacklisted' do + SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot' + expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false) + end + + it 'is false if user agent is not a crawler and blacklist is defined' do + SiteSetting.blacklisted_crawler_user_agents = 'Mozilla' + expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false) + end + + it 'is true if user agent is missing and whitelist is defined' do + SiteSetting.whitelisted_crawler_user_agents = 'Googlebot' + expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true) + expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true) + end + + it 'is false if user agent is missing and blacklist is defined' do + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot' + expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false) + expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false) + end + end end diff --git a/spec/components/middleware/request_tracker_spec.rb b/spec/components/middleware/request_tracker_spec.rb index bd47b965ef4..96262e78ed4 100644 --- a/spec/components/middleware/request_tracker_spec.rb +++ b/spec/components/middleware/request_tracker_spec.rb @@ -9,6 +9,7 @@ describe Middleware::RequestTracker do "HTTP_USER_AGENT" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", "REQUEST_URI" => "/path?bla=1", "REQUEST_METHOD" => "GET", + "HTTP_ACCEPT" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "rack.input" => "" }.merge(opts) end @@ -277,4 +278,58 @@ describe Middleware::RequestTracker do expect(timing[:redis][:calls]).to eq 2 end end + + context "crawler blocking" do + let :middleware do + app = lambda do |env| + [200, {}, ['OK']] + end + + Middleware::RequestTracker.new(app) + end + + def expect_success_response(status, _, response) + expect(status).to eq(200) + expect(response).to eq(['OK']) + end + + def expect_blocked_response(status, _, response) + expect(status).to eq(403) + expect(response).to eq(['Crawler is not allowed']) + end + + it "applies whitelisted_crawler_user_agents correctly" do + SiteSetting.whitelisted_crawler_user_agents = 'Googlebot' + expect_success_response(*middleware.call(env)) + expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot'))) + expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))) + expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0'))) + end + + it "applies blacklisted_crawler_user_agents correctly" do + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot' + expect_success_response(*middleware.call(env)) + expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))) + expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot'))) + expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0'))) + end + + it "blocked crawlers shouldn't log page views" do + ApplicationRequest.clear_cache! + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot' + expect { + middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')) + ApplicationRequest.write_cache! + }.to_not change { ApplicationRequest.count } + end + + it "allows json requests" do + SiteSetting.blacklisted_crawler_user_agents = 'Googlebot' + expect_success_response(*middleware.call(env( + 'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)', + 'HTTP_ACCEPT' => 'application/json' + ))) + end + end + end diff --git a/spec/fabricators/web_crawler_request_fabricator.rb b/spec/fabricators/web_crawler_request_fabricator.rb new file mode 100644 index 00000000000..ed678cc8876 --- /dev/null +++ b/spec/fabricators/web_crawler_request_fabricator.rb @@ -0,0 +1,5 @@ +Fabricator(:web_crawler_request) do + user_agent { sequence(:ua) { |i| "Googlebot #{i}.0" } } + date Time.zone.now.to_date + count 0 +end diff --git a/spec/jobs/clean_up_crawler_stats_spec.rb b/spec/jobs/clean_up_crawler_stats_spec.rb new file mode 100644 index 00000000000..46c069e9e6f --- /dev/null +++ b/spec/jobs/clean_up_crawler_stats_spec.rb @@ -0,0 +1,31 @@ +require 'rails_helper' + +describe Jobs::CleanUpCrawlerStats do + subject { Jobs::CleanUpCrawlerStats.new.execute({}) } + + it "deletes records older than 30 days old" do + freeze_time + + today = Fabricate(:web_crawler_request, date: Time.zone.now.to_date) + yesterday = Fabricate(:web_crawler_request, date: 1.day.ago.to_date) + too_old = Fabricate(:web_crawler_request, date: 31.days.ago.to_date) + + expect { subject }.to change { WebCrawlerRequest.count }.by(-1) + expect(WebCrawlerRequest.where(id: too_old.id)).to_not exist + end + + it "keeps only the top records from the previous day" do + freeze_time + + WebCrawlerRequest.stubs(:max_records_per_day).returns(3) + + req1 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 100) + req4 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 30) + req3 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 40) + req2 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 50) + req5 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 1) + + expect { subject }.to change { WebCrawlerRequest.count }.by(-2) + expect(WebCrawlerRequest.all).to contain_exactly(req1, req2, req3) + end +end diff --git a/spec/models/web_crawler_request_spec.rb b/spec/models/web_crawler_request_spec.rb new file mode 100644 index 00000000000..03f8da5a866 --- /dev/null +++ b/spec/models/web_crawler_request_spec.rb @@ -0,0 +1,114 @@ +require 'rails_helper' + +describe WebCrawlerRequest do + before do + WebCrawlerRequest.last_flush = Time.now.utc + WebCrawlerRequest.clear_cache! + end + + after do + WebCrawlerRequest.clear_cache! + end + + def inc(user_agent, opts = nil) + WebCrawlerRequest.increment!(user_agent, opts) + end + + def disable_date_flush! + freeze_time(Time.now) + WebCrawlerRequest.last_flush = Time.now.utc + end + + def web_crawler_request(user_agent) + WebCrawlerRequest.where(user_agent: user_agent).first + end + + it 'works even if redis is in readonly' do + disable_date_flush! + + inc('Googlebot') + inc('Googlebot') + + $redis.without_namespace.stubs(:incr).raises(Redis::CommandError.new("READONLY")) + $redis.without_namespace.stubs(:eval).raises(Redis::CommandError.new("READONLY")) + + inc('Googlebot', autoflush: 3) + WebCrawlerRequest.write_cache! + + $redis.without_namespace.unstub(:incr) + $redis.without_namespace.unstub(:eval) + + inc('Googlebot', autoflush: 3) + expect(web_crawler_request('Googlebot').count).to eq(3) + end + + it 'logs nothing for an unflushed increment' do + WebCrawlerRequest.increment!('Googlebot') + expect(WebCrawlerRequest.count).to eq(0) + end + + it 'can automatically flush' do + disable_date_flush! + + inc('Googlebot', autoflush: 3) + expect(web_crawler_request('Googlebot')).to_not be_present + expect(WebCrawlerRequest.count).to eq(0) + inc('Googlebot', autoflush: 3) + expect(web_crawler_request('Googlebot')).to_not be_present + inc('Googlebot', autoflush: 3) + expect(web_crawler_request('Googlebot').count).to eq(3) + expect(WebCrawlerRequest.count).to eq(1) + + 3.times { inc('Googlebot', autoflush: 3) } + expect(web_crawler_request('Googlebot').count).to eq(6) + expect(WebCrawlerRequest.count).to eq(1) + end + + it 'can flush based on time' do + t1 = Time.now.utc.at_midnight + freeze_time(t1) + WebCrawlerRequest.write_cache! + inc('Googlebot') + expect(WebCrawlerRequest.count).to eq(0) + + freeze_time(t1 + WebCrawlerRequest.autoflush_seconds + 1) + inc('Googlebot') + + expect(WebCrawlerRequest.count).to eq(1) + end + + it 'flushes yesterdays results' do + t1 = Time.now.utc.at_midnight + freeze_time(t1) + inc('Googlebot') + freeze_time(t1.tomorrow) + inc('Googlebot') + + WebCrawlerRequest.write_cache! + expect(WebCrawlerRequest.count).to eq(2) + end + + it 'clears cache correctly' do + inc('Googlebot') + inc('Twitterbot') + WebCrawlerRequest.clear_cache! + WebCrawlerRequest.write_cache! + + expect(WebCrawlerRequest.count).to eq(0) + end + + it 'logs a few counts once flushed' do + time = Time.now.at_midnight + freeze_time(time) + + 3.times { inc('Googlebot') } + 2.times { inc('Twitterbot') } + 4.times { inc('Bingbot') } + + WebCrawlerRequest.write_cache! + + expect(web_crawler_request('Googlebot').count).to eq(3) + expect(web_crawler_request('Twitterbot').count).to eq(2) + expect(web_crawler_request('Bingbot').count).to eq(4) + end +end diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index 68d5912be41..be3590e8d8f 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -2,11 +2,64 @@ require 'rails_helper' RSpec.describe RobotsTxtController do describe '#index' do - it "returns index when indexing is allowed" do - SiteSetting.allow_index_in_robots_txt = true - get '/robots.txt' + context 'allow_index_in_robots_txt is true' do - expect(response.body).to include("Disallow: /u/") + def expect_allowed_and_disallowed_sections(allow_index, disallow_index) + expect(allow_index).to be_present + expect(disallow_index).to be_present + + allow_section = allow_index < disallow_index ? + response.body[allow_index...disallow_index] : response.body[allow_index..-1] + + expect(allow_section).to include('Disallow: /u/') + expect(allow_section).to_not include("Disallow: /\n") + + disallowed_section = allow_index < disallow_index ? + response.body[disallow_index..-1] : response.body[disallow_index...allow_index] + expect(disallowed_section).to include("Disallow: /\n") + end + + it "returns index when indexing is allowed" do + SiteSetting.allow_index_in_robots_txt = true + get '/robots.txt' + + i = response.body.index('User-agent: *') + expect(i).to be_present + expect(response.body[i..-1]).to include("Disallow: /u/") + end + + it "can whitelist user agents" do + SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot" + get '/robots.txt' + expect(response.body).to include('User-agent: Googlebot') + expect(response.body).to include('User-agent: Twitterbot') + + allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min + disallow_all_index = response.body.index('User-agent: *') + + expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index) + end + + it "can blacklist user agents" do + SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot" + get '/robots.txt' + expect(response.body).to include('User-agent: Googlebot') + expect(response.body).to include('User-agent: Twitterbot') + + disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min + allow_index = response.body.index('User-agent: *') + + expect_allowed_and_disallowed_sections(allow_index, disallow_index) + end + + it "ignores blacklist if whitelist is set" do + SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot" + SiteSetting.blacklisted_crawler_user_agents = "Bananabot" + get '/robots.txt' + expect(response.body).to_not include('Bananabot') + expect(response.body).to include('User-agent: Googlebot') + expect(response.body).to include('User-agent: Twitterbot') + end end it "returns noindex when indexing is disallowed" do