2019-04-29 20:27:42 -04:00
# frozen_string_literal: true
2015-10-11 05:41:23 -04:00
require 'rails_helper'
2014-02-20 16:07:02 -05:00
describe CrawlerDetection do
2018-06-20 20:56:46 -04:00
2019-06-02 22:13:32 -04:00
# Spec helper: asserts that CrawlerDetection treats the request as a crawler.
#
# user_agent - User-Agent string passed to CrawlerDetection.crawler?.
# via        - optional second argument forwarded unchanged to
#              CrawlerDetection.crawler? (nil when omitted).
#
# Returns nil when the request is classified as a crawler.
# Raises RuntimeError (message names the user agent) when it is not.
def crawler!(user_agent, via = nil)
  # Guard clause: nothing to report when detection agrees with the expectation.
  return if CrawlerDetection.crawler?(user_agent, via)

  raise " #{user_agent} should be a crawler! "
end
# Spec helper: asserts that CrawlerDetection does NOT treat the given
# user agent as a crawler.
#
# s - User-Agent string passed to CrawlerDetection.crawler?.
#
# Returns nil when the agent is not classified as a crawler.
# Raises RuntimeError (message names the user agent) when it is.
def not_crawler!(s)
  flagged = CrawlerDetection.crawler?(s)
  raise " #{s} should not be a crawler! " if flagged
end
2014-02-20 16:07:02 -05:00
# Exercises CrawlerDetection.crawler? through the crawler! / not_crawler!
# helpers, which raise when the classification differs from the expectation.
describe " crawler? " do
  # Agents listed in the crawler_user_agents site setting must be detected;
  # an agent that is not listed (Gooble) must not be.
  it " can be amended via site settings " do
    SiteSetting.crawler_user_agents = 'Mooble|Kaboodle+*'

    crawler! " Mozilla/5.0 Safari (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html) "
    crawler! " Mozilla/5.0 Safari (compatible; Mooble+*/2.1; +http://www.google.com/bot.html) "
    not_crawler! " Mozilla/5.0 Safari (compatible; Gooble+*/2.1; +http://www.google.com/bot.html) "
  end

  # User agents that are expected to be classified as crawlers.
  it " returns true for crawler user agents " do
    # https://support.google.com/webmasters/answer/1061943?hl=en
    crawler! " Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) "
    crawler! " Googlebot/2.1 (+http://www.google.com/bot.html) "
    crawler! " Googlebot-News "
    crawler! " Googlebot-Image/1.0 "
    crawler! " Googlebot-Video/1.0 "
    crawler! " (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html) "
    crawler! " Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) "
    crawler! " (compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html) "
    crawler! " Mediapartners-Google "
    crawler! " AdsBot-Google (+http://www.google.com/adsbot.html) "
    crawler! " Twitterbot "
    crawler! " facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php) "
    crawler! " Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) "
    crawler! " Baiduspider+(+http://www.baidu.com/search/spider.htm) "
    crawler! " Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) "
    crawler! " Pingdom.com_bot_version_1.4_(http://www.pingdom.com/) "
    crawler! " LogicMonitor SiteMonitor/1.0 "
    crawler! " Java/1.8.0_151 "
    crawler! " Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp) "
    crawler! " Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3694.0 Mobile Safari/537.36 Chrome-Lighthouse "
  end

  # The first call passes only a user agent; the other two pair a regular
  # browser user agent with a second (via) argument that identifies the archiver.
  it " returns true when VIA header contains 'web.archive.org' " do
    crawler! " Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) "
    crawler! " Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36 ", " HTTP/1.0 web.archive.org (Wayback Save Page) "
    crawler! " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36 ", " Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; http://archive.org/details/archive.org_bot), 1.1 warcprox "
  end

  # Browser and API-client user agents that must not be classified as crawlers.
  it " returns false for non-crawler user agents " do
    not_crawler! " Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36 "
    not_crawler! " Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko "
    not_crawler! " Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0) "
    not_crawler! " Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25 "
    not_crawler! " Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0 "
    not_crawler! " Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 "
    not_crawler! " Mozilla/5.0 (Linux; Android 6.0; CUBOT DINOSAUR Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36+ "
    not_crawler! " DiscourseAPI Ruby Gem 0.19.0 "
  end
end
# Specs for CrawlerDetection.allow_crawler? under the
# allowed_crawler_user_agents / blocked_crawler_user_agents site settings.
describe 'allow_crawler?' do
  # Neither setting is assigned in this example.
  it 'returns true if allowlist and blocklist are blank' do
    expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
  end

  context 'allowlist is set' do
    before do
      SiteSetting.allowed_crawler_user_agents = 'Googlebot|Twitterbot'
    end

    it 'returns true for matching user agents' do
      expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
      expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
      expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true)
    end

    # With an allowlist in place, an empty user agent is expected to be rejected too.
    it 'returns false for user agents that do not match' do
      expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false)
      expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false)
      expect(CrawlerDetection.allow_crawler?('')).to eq(false)
    end

    # Both settings are assigned here: the agent matches the allowlist and the
    # blocklist, and the expectation is that it is still allowed.
    context 'and blocklist is set' do
      before do
        SiteSetting.blocked_crawler_user_agents = 'Googlebot-Image'
      end

      it 'ignores the blocklist' do
        expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
      end
    end
  end

  context 'blocklist is set' do
    before do
      SiteSetting.blocked_crawler_user_agents = 'Googlebot|Twitterbot'
    end

    # With only a blocklist, an empty user agent is expected to be allowed.
    it 'returns true for crawlers that do not match' do
      expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true)
      expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true)
      expect(CrawlerDetection.allow_crawler?('')).to eq(true)
    end

    it 'returns false for user agents that match' do
      expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false)
      expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false)
      expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false)
    end
  end
end
# Specs for CrawlerDetection.is_blocked_crawler? across combinations of
# crawler / non-crawler / missing user agents and the
# allowed_crawler_user_agents / blocked_crawler_user_agents site settings.
describe 'is_blocked_crawler?' do
  it 'is false if user agent is a crawler and no allowlist or blocklist is defined' do
    expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)
  end

  it 'is false if user agent is not a crawler and no allowlist or blocklist is defined' do
    expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
  end

  it 'is true if user agent is a crawler and is not allowlisted' do
    SiteSetting.allowed_crawler_user_agents = 'Googlebot'

    expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
  end

  it 'is false if user agent is not a crawler and there is a allowlist' do
    SiteSetting.allowed_crawler_user_agents = 'Googlebot'

    expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
  end

  it 'is true if user agent is a crawler and is blocklisted' do
    SiteSetting.blocked_crawler_user_agents = 'Twitterbot'

    expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
  end

  # Description corrected to say "is false": the expectation below is
  # eq(false) for a crawler that is absent from the blocklist.
  it 'is false if user agent is a crawler and is not blocklisted' do
    SiteSetting.blocked_crawler_user_agents = 'Twitterbot'

    expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)
  end

  # The blocklist pattern matches the browser's user agent string, yet the
  # result is expected to be false because the agent is not a crawler.
  it 'is false if user agent is not a crawler and blocklist is defined' do
    SiteSetting.blocked_crawler_user_agents = 'Mozilla'

    expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
  end

  # Both the empty string and nil count as a "missing" user agent here.
  it 'is true if user agent is missing and allowlist is defined' do
    SiteSetting.allowed_crawler_user_agents = 'Googlebot'

    expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)
    expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
  end

  it 'is false if user agent is missing and blocklist is defined' do
    SiteSetting.blocked_crawler_user_agents = 'Googlebot'

    expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)
    expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
  end
end
end