# frozen_string_literal: true
require 'rails_helper'
RSpec.describe RobotsTxtController do
  describe '#index' do
    context 'crawl delay' do
      it 'allows you to set crawl delay on particular bots' do
        SiteSetting.allow_index_in_robots_txt = true
        SiteSetting.slow_down_crawler_rate = 17
        SiteSetting.slow_down_crawler_user_agents = 'bingbot|googlebot'

        get '/robots.txt'

        expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")
        expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")
      end
    end

    context 'allow_index_in_robots_txt is true' do
      # Splits the robots.txt body at the two given section offsets and
      # checks that the "allowed" section only blocks /u/ while the
      # "disallowed" section blocks everything ("Disallow: /").
      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
        expect(allow_index).to be_present
        expect(disallow_index).to be_present

        if allow_index < disallow_index
          allowed_part = response.body[allow_index...disallow_index]
          disallowed_part = response.body[disallow_index..-1]
        else
          allowed_part = response.body[allow_index..-1]
          disallowed_part = response.body[disallow_index...allow_index]
        end

        expect(allowed_part).to include('Disallow: /u/')
        expect(allowed_part).to_not include("Disallow: /\n")
        expect(disallowed_part).to include("Disallow: /\n")
      end

      it "returns index when indexing is allowed" do
        SiteSetting.allow_index_in_robots_txt = true

        get '/robots.txt'

        star_section = response.body.index('User-agent: *')
        expect(star_section).to be_present
        expect(response.body[star_section..-1]).to include("Disallow: /u/")
      end

      it "can whitelist user agents" do
        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"

        get '/robots.txt'

        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')

        googlebot_index = response.body.index('User-agent: Googlebot')
        twitterbot_index = response.body.index('User-agent: Twitterbot')
        catch_all_index = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections([googlebot_index, twitterbot_index].min, catch_all_index)
      end

      it "can blacklist user agents" do
        SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot"

        get '/robots.txt'

        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')

        googlebot_index = response.body.index('User-agent: Googlebot')
        twitterbot_index = response.body.index('User-agent: Twitterbot')
        catch_all_index = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(catch_all_index, [googlebot_index, twitterbot_index].min)
      end

      it "ignores blacklist if whitelist is set" do
        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
        SiteSetting.blacklisted_crawler_user_agents = "Bananabot"

        get '/robots.txt'

        expect(response.body).to_not include('Bananabot')
        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')
      end
    end

    it "returns noindex when indexing is disallowed" do
      SiteSetting.allow_index_in_robots_txt = false

      get '/robots.txt'

      expect(response.body).to_not include("Disallow: /u/")
    end
  end
end