From f6fdc1ebe81652be07e8c2c12b59812305de1ba5 Mon Sep 17 00:00:00 2001
From: Sam
Date: Fri, 29 Sep 2017 12:31:50 +1000
Subject: [PATCH] FEATURE: flexible crawler detection

You can use the crawler user agents site setting to amend what user
agents are considered crawlers based on a string match in the user agent

Also improves performance of crawler detection slightly
---
 config/locales/server.en.yml              |  1 +
 config/site_settings.yml                  |  3 +++
 lib/crawler_detection.rb                  | 10 +++++++++-
 lib/freedom_patches/regexp.rb             |  9 +++++++++
 spec/components/crawler_detection_spec.rb |  8 ++++++++
 5 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 lib/freedom_patches/regexp.rb

diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index e289dd0208d..dbcb7f8ba15 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -1063,6 +1063,7 @@ en:
     gtm_container_id: "Google Tag Manager container id. eg: GTM-ABCDEF"
     enable_escaped_fragments: "Fall back to Google's Ajax-Crawling API if no webcrawler is detected. See https://developers.google.com/webmasters/ajax-crawling/docs/learn-more"
     allow_moderators_to_create_categories: "Allow moderators to create new categories"
+    crawler_user_agents: "List of user agents that are considered crawlers and served static HTML instead of JavaScript payload"
     cors_origins: "Allowed origins for cross-origin requests (CORS). Each origin must include http:// or https://. The DISCOURSE_ENABLE_CORS env variable must be set to true to enable CORS."
     use_admin_ip_whitelist: "Admins can only log in if they are at an IP address defined in the Screened IPs list (Admin > Logs > Screened Ips)."
     blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse"
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 982d5cef5cc..a145999d671 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -926,6 +926,9 @@ security:
   enable_escaped_fragments: true
   allow_index_in_robots_txt: true
   allow_moderators_to_create_categories: false
+  crawler_user_agents:
+    default: 'Googlebot|Mediapartners|AdsBot|curl|HTTrack|Twitterbot|facebookexternalhit|bingbot|Baiduspider|ia_archiver|Wayback Save Page|360Spider|Swiftbot|YandexBot'
+    type: list
   cors_origins:
     default: ''
     type: list
diff --git a/lib/crawler_detection.rb b/lib/crawler_detection.rb
index a8892fdc769..5d222ecf7bb 100644
--- a/lib/crawler_detection.rb
+++ b/lib/crawler_detection.rb
@@ -1,9 +1,17 @@
 module CrawlerDetection
+
   # added 'ia_archiver' based on https://meta.discourse.org/t/unable-to-archive-discourse-pages-with-the-internet-archive/21232
   # added 'Wayback Save Page' based on https://meta.discourse.org/t/unable-to-archive-discourse-with-the-internet-archive-save-page-now-button/22875
   # added 'Swiftbot' based on https://meta.discourse.org/t/how-to-add-html-markup-or-meta-tags-for-external-search-engine/28220
+  def self.to_matcher(string)
+    escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
+    Regexp.new(escaped)
+  end
   def self.crawler?(user_agent)
-    !/Googlebot|Mediapartners|AdsBot|curl|HTTrack|Twitterbot|facebookexternalhit|bingbot|Baiduspider|ia_archiver|Wayback Save Page|360Spider|Swiftbot|YandexBot/.match(user_agent).nil?
+    # this is done to avoid regenerating regexes
+    @matchers ||= {}
+    matcher = (@matchers[SiteSetting.crawler_user_agents] ||= to_matcher(SiteSetting.crawler_user_agents))
+    matcher.match?(user_agent)
   end

 end
diff --git a/lib/freedom_patches/regexp.rb b/lib/freedom_patches/regexp.rb
new file mode 100644
index 00000000000..5ff804c4900
--- /dev/null
+++ b/lib/freedom_patches/regexp.rb
@@ -0,0 +1,9 @@
+unless ::Regexp.instance_methods.include?(:match?)
+  class ::Regexp
+    # this is the fast way of checking a regex (zero string allocs) added in Ruby 2.4
+    # backfill it for now
+    def match?(string)
+      !!(string =~ self)
+    end
+  end
+end
diff --git a/spec/components/crawler_detection_spec.rb b/spec/components/crawler_detection_spec.rb
index e3956b1070c..6443d84a529 100644
--- a/spec/components/crawler_detection_spec.rb
+++ b/spec/components/crawler_detection_spec.rb
@@ -3,6 +3,14 @@ require_dependency 'crawler_detection'

 describe CrawlerDetection do
   describe "crawler?" do
+
+    it "can be amended via site settings" do
+      SiteSetting.crawler_user_agents = 'Mooble|Kaboodle+*'
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html)")).to eq(true)
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Mooble+*/2.1; +http://www.google.com/bot.html)")).to eq(true)
+      expect(CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Gooble+*/2.1; +http://www.google.com/bot.html)")).to eq(false)
+    end
+
     it "returns true for crawler user agents" do
       # https://support.google.com/webmasters/answer/1061943?hl=en
       expect(described_class.crawler?("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")).to eq(true)
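
Not part of the patch above: a minimal standalone sketch of how the setting-driven matcher behaves, assuming only a plain Ruby string in place of SiteSetting.crawler_user_agents (the `crawler_user_agents` variable below is a hypothetical stand-in, not Discourse's API).

```ruby
# Illustration only: mirrors the to_matcher construction from the patch,
# but against a local string instead of SiteSetting.crawler_user_agents.
crawler_user_agents = 'Googlebot|bingbot|Wayback Save Page'

# Escape each pipe-separated agent so values containing spaces or regex
# metacharacters (e.g. 'Wayback Save Page', 'Kaboodle+*') match literally,
# then recombine them into one alternation.
matcher = Regexp.new(
  crawler_user_agents.split('|').map { |agent| Regexp.escape(agent) }.join('|')
)

puts matcher.match?("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)") # => true
puts matcher.match?("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Firefox/55.0")                   # => false
```

The "improves performance slightly" note in the commit message comes from two things visible in the diff: the compiled Regexp is memoized per setting value in the `@matchers` hash, so it is only rebuilt when `crawler_user_agents` changes, and matching uses `Regexp#match?` (backfilled for pre-2.4 Rubies by the freedom patch), which returns a boolean without allocating a MatchData or string.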