From f6fdc1ebe81652be07e8c2c12b59812305de1ba5 Mon Sep 17 00:00:00 2001
From: Sam
Date: Fri, 29 Sep 2017 12:31:50 +1000
Subject: [PATCH] FEATURE: flexible crawler detection

You can use the crawler user agents site setting to amend what user
agents are considered crawlers based on a string match in the user agent

Also improves performance of crawler detection slightly
---
 config/locales/server.en.yml              |  1 +
 config/site_settings.yml                  |  3 +++
 lib/crawler_detection.rb                  | 10 +++++++++-
 lib/freedom_patches/regexp.rb             |  9 +++++++++
 spec/components/crawler_detection_spec.rb |  8 ++++++++
 5 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 lib/freedom_patches/regexp.rb

diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index e289dd0208d..dbcb7f8ba15 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -1063,6 +1063,7 @@ en:
     gtm_container_id: "Google Tag Manager container id. eg: GTM-ABCDEF"
     enable_escaped_fragments: "Fall back to Google's Ajax-Crawling API if no webcrawler is detected. See https://developers.google.com/webmasters/ajax-crawling/docs/learn-more"
     allow_moderators_to_create_categories: "Allow moderators to create new categories"
+    crawler_user_agents: "List of user agents that are considered crawlers and served static HTML instead of JavaScript payload"
     cors_origins: "Allowed origins for cross-origin requests (CORS). Each origin must include http:// or https://. The DISCOURSE_ENABLE_CORS env variable must be set to true to enable CORS."
     use_admin_ip_whitelist: "Admins can only log in if they are at an IP address defined in the Screened IPs list (Admin > Logs > Screened Ips)."
     blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse"
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 982d5cef5cc..a145999d671 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -926,6 +926,9 @@ security:
   enable_escaped_fragments: true
   allow_index_in_robots_txt: true
   allow_moderators_to_create_categories: false
+  crawler_user_agents:
+    default: 'Googlebot|Mediapartners|AdsBot|curl|HTTrack|Twitterbot|facebookexternalhit|bingbot|Baiduspider|ia_archiver|Wayback Save Page|360Spider|Swiftbot|YandexBot'
+    type: list
   cors_origins:
     default: ''
     type: list
diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb
index a8892fdc769..5d222ecf7bb 100644
--- a/lib/crawler_detection.rb
+++ b/lib/crawler_detection.rb
@@ -1,9 +1,17 @@
 module CrawlerDetection
+
   # added 'ia_archiver' based on https://meta.discourse.org/t/unable-to-archive-discourse-pages-with-the-internet-archive/21232
   # added 'Wayback Save Page' based on https://meta.discourse.org/t/unable-to-archive-discourse-with-the-internet-archive-save-page-now-button/22875
   # added 'Swiftbot' based on https://meta.discourse.org/t/how-to-add-html-markup-or-meta-tags-for-external-search-engine/28220
+  def self.to_matcher(string)
+    escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
+    Regexp.new(escaped)
+  end
   def self.crawler?(user_agent)
-    !/Googlebot|Mediapartners|AdsBot|curl|HTTrack|Twitterbot|facebookexternalhit|bingbot|Baiduspider|ia_archiver|Wayback Save Page|360Spider|Swiftbot|YandexBot/.match(user_agent).nil?
+    # this is done to avoid regenerating regexes
+    @matchers ||= {}
+    matcher = (@matchers[SiteSetting.crawler_user_agents] ||= to_matcher(SiteSetting.crawler_user_agents))
+    matcher.match?(user_agent)
   end

 end
diff --git a/lib/freedom_patches/regexp.rb b/lib/freedom_patches/regexp.rb
new file mode 100644
index 00000000000..5ff804c4900
--- /dev/null
+++ b/lib/freedom_patches/regexp.rb
@@ -0,0 +1,9 @@
+unless ::Regexp.instance_methods.include?(:match?)
+  class ::Regexp
+    # this is the fast way of checking a regex (zero string allocs) added in Ruby 2.4
+    # backfill it for now
+    def match?(string)
+      !!(string =~ self)
+    end
+  end
+end
diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb
index e3956b1070c..6443d84a529 100644
--- a/spec/components/crawler_detection_spec.rb
+++ b/spec/components/crawler_detection_spec.rb
@@ -3,6 +3,14 @@ require_dependency 'crawler_detection'

 describe CrawlerDetection do
   describe "crawler?" do
+
+    it "can be amended via site settings" do
+      SiteSetting.crawler_user_agents = 'Mooble|Kaboodle+*'
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html)")).to eq(true)
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Mooble+*/2.1; +http://www.google.com/bot.html)")).to eq(true)
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Gooble+*/2.1; +http://www.google.com/bot.html)")).to eq(false)
+    end
+
     it "returns true for crawler user agents" do
       # https://support.google.com/webmasters/answer/1061943?hl=en
       expect(described_class.crawler?("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")).to eq(true)
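
Not part of the patch above: a minimal standalone sketch of how the setting-driven matcher behaves, assuming only a plain Ruby string in place of SiteSetting.crawler_user_agents (the `crawler_user_agents` variable below is a hypothetical stand-in, not Discourse's API).

```ruby
# Illustration only: mirrors the to_matcher construction from the patch,
# but against a local string instead of SiteSetting.crawler_user_agents.
crawler_user_agents = 'Googlebot|bingbot|Wayback Save Page'

# Escape each pipe-separated agent so values containing spaces or regex
# metacharacters (e.g. 'Wayback Save Page', 'Kaboodle+*') match literally,
# then recombine them into one alternation.
matcher = Regexp.new(
  crawler_user_agents.split('|').map { |agent| Regexp.escape(agent) }.join('|')
)

puts matcher.match?("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)") # => true
puts matcher.match?("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Firefox/55.0")                   # => false
```

The "improves performance slightly" note in the commit message comes from two things visible in the diff: the compiled Regexp is memoized per setting value in the `@matchers` hash, so it is only rebuilt when `crawler_user_agents` changes, and matching uses `Regexp#match?` (backfilled for pre-2.4 Rubies by the freedom patch), which returns a boolean without allocating a MatchData or string.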