# frozen_string_literal: true

# Decides whether an HTTP request comes from a web crawler, based on the
# User-Agent (and Via) headers plus the site's configurable agent lists,
# and whether a detected crawler should be allowed to proceed.
module CrawlerDetection
  WAYBACK_MACHINE_URL = "archive.org"

  # Builds a case-insensitive Regexp matching any of the `|`-separated agent
  # fragments in +string+. Each fragment is escaped, so setting values are
  # treated as literal substrings rather than patterns.
  #
  # @param string [String] pipe-separated list of user-agent fragments
  # @param type [Symbol, nil] pass :real for the "known real browsers" list;
  #   in the test environment this also matches "Rails Testing" so
  #   integration tests render full views instead of the crawler layout
  # @return [Regexp]
  def self.to_matcher(string, type: nil)
    escaped = string.split("|").map { |agent| Regexp.escape(agent) }.join("|")

    if type == :real && Rails.env.test?
      # we need this bypass so we properly render views
      escaped << "|Rails Testing"
    end

    Regexp.new(escaped, Regexp::IGNORECASE)
  end

  # Returns true when the request should be treated as coming from a crawler.
  #
  # A missing user agent, or any mention of the Wayback Machine in the
  # User-Agent or Via header, always counts as a crawler. Otherwise an agent
  # matching the "non crawler" (real-browser) list is only a crawler when it
  # also matches the crawler list and is not on the bypass list; anything
  # that does not look like a real browser is treated as a crawler.
  #
  # @param user_agent [String, nil]
  # @param via_header [String, nil]
  # @return [Boolean]
  def self.crawler?(user_agent, via_header = nil)
    return true if user_agent.nil?
    # user_agent is known non-nil past this point, so no safe navigation needed
    return true if user_agent.include?(WAYBACK_MACHINE_URL) || via_header&.include?(WAYBACK_MACHINE_URL)

    # Cache compiled regexes keyed by the current setting value so we only
    # recompile when a site setting changes.
    @non_crawler_matchers ||= {}
    @matchers ||= {}

    possibly_real =
      (@non_crawler_matchers[SiteSetting.non_crawler_user_agents] ||=
        to_matcher(SiteSetting.non_crawler_user_agents, type: :real))

    if user_agent.match?(possibly_real)
      known_bots =
        (@matchers[SiteSetting.crawler_user_agents] ||=
          to_matcher(SiteSetting.crawler_user_agents))
      if user_agent.match?(known_bots)
        bypass =
          (@matchers[SiteSetting.crawler_check_bypass_agents] ||=
            to_matcher(SiteSetting.crawler_check_bypass_agents))
        !user_agent.match?(bypass)
      else
        false
      end
    else
      true
    end
  end

  # Should the "please update your browser" notice be shown to this agent?
  #
  # @param user_agent [String, nil]
  # @return [Boolean]
  def self.show_browser_update?(user_agent)
    return false if SiteSetting.browser_update_user_agents.blank?
    # Robustness: a nil agent previously raised NoMethodError here.
    return false if user_agent.nil?

    @browser_update_matchers ||= {}
    matcher =
      @browser_update_matchers[SiteSetting.browser_update_user_agents] ||=
        to_matcher(SiteSetting.browser_update_user_agents)
    user_agent.match?(matcher)
  end

  # Given a user_agent that returns true from crawler?, should its request be allowed?
  #
  # With an allowlist configured, only explicitly allowed agents may crawl;
  # otherwise everything is allowed except agents on the blocklist.
  #
  # @param user_agent [String, nil]
  # @return [Boolean]
  def self.allow_crawler?(user_agent)
    if SiteSetting.allowed_crawler_user_agents.blank? &&
         SiteSetting.blocked_crawler_user_agents.blank?
      return true
    end

    @allowlisted_matchers ||= {}
    @blocklisted_matchers ||= {}

    if SiteSetting.allowed_crawler_user_agents.present?
      # Allowlist mode: a nil agent is never allowed.
      allowlisted =
        @allowlisted_matchers[SiteSetting.allowed_crawler_user_agents] ||=
          to_matcher(SiteSetting.allowed_crawler_user_agents)
      !user_agent.nil? && user_agent.match?(allowlisted)
    else
      # Blocklist mode: a nil agent is allowed (it matches no blocklist entry).
      blocklisted =
        @blocklisted_matchers[SiteSetting.blocked_crawler_user_agents] ||=
          to_matcher(SiteSetting.blocked_crawler_user_agents)
      user_agent.nil? || !user_agent.match?(blocklisted)
    end
  end

  # True when the agent is a crawler that is not permitted to crawl this site.
  def self.is_blocked_crawler?(user_agent)
    crawler?(user_agent) && !allow_crawler?(user_agent)
  end
end