# frozen_string_literal: true
RSpec.describe CrawlerDetection do
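  # Assertion helpers: raise (and so fail the surrounding example) whenever
  # CrawlerDetection.crawler? disagrees with the expected classification, which
  # lets a single example cover a long list of user agent strings.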
  def crawler!(user_agent, via = nil)
    raise "#{user_agent} should be a crawler!" if !CrawlerDetection.crawler?(user_agent, via)
  end

  def not_crawler!(s)
    raise "#{s} should not be a crawler!" if CrawlerDetection.crawler?(s)
  end

  describe ".crawler?" do
    it "can be amended via site settings" do
      SiteSetting.crawler_user_agents = "Mooble|Kaboodle+*"

      crawler! "Mozilla/5.0 Safari (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html)"
      crawler! "Mozilla/5.0 Safari (compatible; Mooble+*/2.1; +http://www.google.com/bot.html)"
      not_crawler! "Mozilla/5.0 Safari (compatible; Gooble+*/2.1; +http://www.google.com/bot.html)"
    end
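
    # NOTE (assumption): the "+*" in the setting above appears to be matched
    # literally against the user agent, i.e. each pipe-delimited entry is
    # escaped rather than interpreted as regex metacharacters.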

    it "returns true for crawler user agents" do
      # https://support.google.com/webmasters/answer/1061943?hl=en
      crawler! "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      crawler! "Googlebot/2.1 (+http://www.google.com/bot.html)"
      crawler! "Googlebot-News"
      crawler! "Googlebot-Image/1.0"
      crawler! "Googlebot-Video/1.0"
      crawler! "(compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"
      crawler! "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      crawler! "(compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html)"
      crawler! "Mediapartners-Google"
      crawler! "AdsBot-Google (+http://www.google.com/adsbot.html)"
      crawler! "Twitterbot"
      crawler! "facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)"
      crawler! "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
      crawler! "Baiduspider+(+http://www.baidu.com/search/spider.htm)"
      crawler! "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
      crawler! "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)"
      crawler! "LogicMonitor SiteMonitor/1.0"
      crawler! "Java/1.8.0_151"
      crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
      crawler! "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3694.0 Mobile Safari/537.36 Chrome-Lighthouse"
    end

    it "returns true when VIA header contains 'web.archive.org'" do
      crawler! "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot)"
      crawler! "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "HTTP/1.0 web.archive.org (Wayback Save Page)"
      crawler! "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; http://archive.org/details/archive.org_bot), 1.1 warcprox"
    end

    it "returns false for non-crawler user agents" do
      not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
      not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
      not_crawler! "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)"
      not_crawler! "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25"
      not_crawler! "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0"
      not_crawler! "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"
      not_crawler! "Mozilla/5.0 (Linux; Android 6.0; CUBOT DINOSAUR Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36+"
      not_crawler! "DiscourseAPI Ruby Gem 0.19.0"
    end
  end

  describe ".show_browser_update?" do
    it "always returns false if setting is empty" do
      SiteSetting.browser_update_user_agents = ""
      expect(
        CrawlerDetection.show_browser_update?(
          "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        ),
      ).to eq(false)
      expect(
        CrawlerDetection.show_browser_update?(
          "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)",
        ),
      ).to eq(false)
    end

    it "returns true if setting matches user agent" do
      SiteSetting.browser_update_user_agents = "MSIE 6|MSIE 7|MSIE 8|MSIE 9"
      expect(
        CrawlerDetection.show_browser_update?(
          "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        ),
      ).to eq(false)
      expect(
        CrawlerDetection.show_browser_update?(
          "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)",
        ),
      ).to eq(true)
    end
  end
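
  # The examples below document the allowlist/blocklist behavior: when
  # allowed_crawler_user_agents is set it decides on its own (the nested
  # "ignores the blocklist" example shows it taking precedence); otherwise
  # blocked_crawler_user_agents is consulted.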

  describe ".allow_crawler?" do
    it "returns true if allowlist and blocklist are blank" do
      expect(
        CrawlerDetection.allow_crawler?("Googlebot/2.1 (+http://www.google.com/bot.html)"),
      ).to eq(true)
    end

    context "when allowlist is set" do
      before { SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot" }

      it "returns true for matching user agents" do
        expect(
          CrawlerDetection.allow_crawler?("Googlebot/2.1 (+http://www.google.com/bot.html)"),
        ).to eq(true)
        expect(CrawlerDetection.allow_crawler?("Googlebot-Image/1.0")).to eq(true)
        expect(CrawlerDetection.allow_crawler?("Twitterbot")).to eq(true)
      end

      it "returns false for user agents that do not match" do
        expect(
          CrawlerDetection.allow_crawler?(
            "facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)",
          ),
        ).to eq(false)
        expect(
          CrawlerDetection.allow_crawler?(
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
          ),
        ).to eq(false)
        expect(CrawlerDetection.allow_crawler?("")).to eq(false)
      end

      context "when blocklist is set" do
        before { SiteSetting.blocked_crawler_user_agents = "Googlebot-Image" }

        it "ignores the blocklist" do
          expect(CrawlerDetection.allow_crawler?("Googlebot-Image/1.0")).to eq(true)
        end
      end
    end

    context "when blocklist is set" do
      before { SiteSetting.blocked_crawler_user_agents = "Googlebot|Twitterbot" }

      it "returns true for crawlers that do not match" do
        expect(CrawlerDetection.allow_crawler?("Mediapartners-Google")).to eq(true)
        expect(
          CrawlerDetection.allow_crawler?(
            "facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)",
          ),
        ).to eq(true)
        expect(CrawlerDetection.allow_crawler?("")).to eq(true)
      end

      it "returns false for user agents that match" do
        expect(
          CrawlerDetection.allow_crawler?("Googlebot/2.1 (+http://www.google.com/bot.html)"),
        ).to eq(false)
        expect(CrawlerDetection.allow_crawler?("Googlebot-Image/1.0")).to eq(false)
        expect(CrawlerDetection.allow_crawler?("Twitterbot")).to eq(false)
      end
    end
  end
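
  # Taken together, the examples below show a request being treated as a blocked
  # crawler when an allowlist exists and the (crawler or missing) user agent is
  # not on it, or when a crawler user agent is on the blocklist; ordinary
  # browser user agents are never blocked.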

  describe ".is_blocked_crawler?" do
    it "is false if user agent is a crawler and no allowlist or blocklist is defined" do
      expect(CrawlerDetection.is_blocked_crawler?("Twitterbot")).to eq(false)
    end

    it "is false if user agent is not a crawler and no allowlist or blocklist is defined" do
      expect(
        CrawlerDetection.is_blocked_crawler?(
          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        ),
      ).to eq(false)
    end

    it "is true if user agent is a crawler and is not allowlisted" do
      SiteSetting.allowed_crawler_user_agents = "Googlebot"
      expect(CrawlerDetection.is_blocked_crawler?("Twitterbot")).to eq(true)
    end

    it "is false if user agent is not a crawler and there is an allowlist" do
      SiteSetting.allowed_crawler_user_agents = "Googlebot"
      expect(
        CrawlerDetection.is_blocked_crawler?(
          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        ),
      ).to eq(false)
    end

    it "is true if user agent is a crawler and is blocklisted" do
      SiteSetting.blocked_crawler_user_agents = "Twitterbot"
      expect(CrawlerDetection.is_blocked_crawler?("Twitterbot")).to eq(true)
    end

    it "is false if user agent is a crawler and is not blocklisted" do
      SiteSetting.blocked_crawler_user_agents = "Twitterbot"
      expect(CrawlerDetection.is_blocked_crawler?("Googlebot")).to eq(false)
    end

    it "is false if user agent is not a crawler and blocklist is defined" do
      SiteSetting.blocked_crawler_user_agents = "Mozilla"
      expect(
        CrawlerDetection.is_blocked_crawler?(
          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        ),
      ).to eq(false)
    end

    it "is true if user agent is missing and allowlist is defined" do
      SiteSetting.allowed_crawler_user_agents = "Googlebot"
      expect(CrawlerDetection.is_blocked_crawler?("")).to eq(true)
      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
    end

    it "is false if user agent is missing and blocklist is defined" do
      SiteSetting.blocked_crawler_user_agents = "Googlebot"
      expect(CrawlerDetection.is_blocked_crawler?("")).to eq(false)
      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
    end
  end
end