discourse/spec/requests/robots_txt_controller_spec.rb

# frozen_string_literal: true

RSpec.describe RobotsTxtController do
  # Request specs for GET /robots-builder.json, the JSON view of robots.txt.
  describe '#builder' do
    it "returns json information for building a robots.txt" do
      get "/robots-builder.json"
      # Check the status first so a failed request is reported as such rather
      # than as a missing JSON key (mirrors the overridden example below).
      expect(response.status).to eq(200)
      json = response.parsed_body
      expect(json).to be_present
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
    end

    # With the overridden_robots_txt site setting populated, the payload is
    # expected to keep header/agents and also echo the override verbatim.
    it "includes overridden content if robots.txt is overridden" do
      SiteSetting.overridden_robots_txt = "something"

      get "/robots-builder.json"
      expect(response.status).to eq(200)
      json = response.parsed_body
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
      expect(json['overridden']).to eq("something")
    end
  end

  describe '#index' do

    # Covers when RobotsTxtController::OVERRIDDEN_HEADER is expected at the
    # very start of the /robots.txt response body.
    context "header for when the content is overridden" do
      let(:overridden_header) { RobotsTxtController::OVERRIDDEN_HEADER }

      it "is not prepended if there are no overrides" do
        admin = Fabricate(:admin)
        sign_in(admin)

        get '/robots.txt'

        body = response.body
        expect(body).not_to start_with(overridden_header)
      end

      it "is prepended if there are overrides and the user is admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"
        admin = Fabricate(:admin)
        sign_in(admin)

        get '/robots.txt'

        body = response.body
        expect(body).to start_with(overridden_header)
      end

      # No sign_in here: the request is made anonymously.
      it "is not prepended if the user is not admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"

        get '/robots.txt'

        body = response.body
        expect(body).not_to start_with(overridden_header)
      end
    end

    # With the site mounted under "/forum", rule paths are expected to carry
    # that prefix.
    context 'subfolder' do
      it 'prefixes the rules with the directory' do
        set_subfolder "/forum"
        get '/robots.txt'

        prefixed_rule = "\nDisallow: /forum/email/"
        expect(response.body).to include(prefixed_rule)
      end
    end

    context 'allow_index_in_robots_txt is true' do

      # Slices the response body into an "allowed" and a "disallowed" section
      # using the two offsets, whichever comes first, and checks each one.
      #
      # allow_index    - offset in response.body where the allowed section starts
      # disallow_index - offset in response.body where the disallowed section starts
      #
      # The earlier section runs up to the start of the later one; the later
      # section runs to the end of the body.
      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
        expect(allow_index).to be_present
        expect(disallow_index).to be_present

        body = response.body

        if allow_index < disallow_index
          allow_section = body[allow_index...disallow_index]
          disallowed_section = body[disallow_index..-1]
        else
          allow_section = body[allow_index..-1]
          disallowed_section = body[disallow_index...allow_index]
        end

        # The allowed section has specific path rules but no blanket "Disallow: /".
        expect(allow_section).to include('Disallow: /auth/')
        expect(allow_section).to_not include("Disallow: /\n")

        # The disallowed section must block everything.
        expect(disallowed_section).to include("Disallow: /\n")
      end

      # Only the part of the body from the wildcard agent onwards is inspected.
      it "returns index when indexing is allowed" do
        SiteSetting.allow_index_in_robots_txt = true
        get '/robots.txt'

        body = response.body
        wildcard_at = body.index('User-agent: *')
        expect(wildcard_at).to be_present

        after_wildcard = body[wildcard_at..-1]
        expect(after_wildcard).to include("Disallow: /auth/")
        # we have to insert Googlebot for special handling
        expect(after_wildcard).to include("User-agent: Googlebot")
      end

      # Allowlisted agents form the allowed section; "User-agent: *" starts the
      # section that must disallow everything.
      it "can allowlist user agents" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"
        get '/robots.txt'

        body = response.body
        agent_lines = ['User-agent: Googlebot', 'User-agent: Twitterbot']
        agent_lines.each { |agent_line| expect(body).to include(agent_line) }

        # The allowed section starts at whichever listed agent appears first.
        allowed_index = agent_lines.map { |agent_line| body.index(agent_line) }.min
        disallow_all_index = body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)
      end

      # Blocklisted agents form the disallowed section; "User-agent: *" starts
      # the allowed section.
      it "can blocklist user agents" do
        SiteSetting.blocked_crawler_user_agents = "Googlebot|Twitterbot"
        get '/robots.txt'

        body = response.body
        agent_lines = ['User-agent: Googlebot', 'User-agent: Twitterbot']
        agent_lines.each { |agent_line| expect(body).to include(agent_line) }

        # The disallowed section starts at whichever listed agent appears first.
        disallow_index = agent_lines.map { |agent_line| body.index(agent_line) }.min
        allow_index = body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(allow_index, disallow_index)
      end

      # With both settings populated, the blocklisted agent must not show up
      # anywhere in the output while the allowlisted agents still do.
      it "ignores blocklist if allowlist is set" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"
        SiteSetting.blocked_crawler_user_agents = "Bananabot"
        get '/robots.txt'

        body = response.body
        expect(body).to_not include('Bananabot')
        ['User-agent: Googlebot', 'User-agent: Twitterbot'].each do |agent_line|
          expect(body).to include(agent_line)
        end
      end
    end

    # With indexing turned off, the per-path rules are absent and a lowercase
    # "googlebot" group followed by an Allow rule is expected.
    it "returns noindex when indexing is disallowed" do
      SiteSetting.allow_index_in_robots_txt = false
      get '/robots.txt'

      robots = response.body
      expect(robots).to_not include("Disallow: /auth/")
      expect(robots).to include("User-agent: googlebot\nAllow")
    end

    # An anonymous request gets exactly the stored override back.
    it "returns overridden robots.txt if the file is overridden" do
      SiteSetting.overridden_robots_txt = "blah whatever"
      get '/robots.txt'

      served = response
      expect(served.status).to eq(200)
      expect(served.body).to eq(SiteSetting.overridden_robots_txt)
    end
  end
end
DEV: use #frozen_string_literal: true on all specs. This change both speeds up specs (fewer strings to allocate) and helps catch cases where methods in Discourse are mutating inputs. Overall we will be migrating everything to use #frozen_string_literal: true; it will take a while, but this is the first and safest move in this direction. 2019-04-29 20:27:42 -04:00			`# frozen_string_literal: true`

Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 00:06:56 -04:00			`RSpec.describe RobotsTxtController do`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites, which need to expose their robots.txt contents to a parent site. 2018-04-16 15:43:20 -04:00			`describe '#builder' do`
			`it "returns json information for building a robots.txt" do`
			`get "/robots-builder.json"`
DEV: Use `response.parsed_body` in specs (#9615) Most of it was autofixed with rubocop-discourse 2.1.1. 2020-05-07 11:04:12 -04:00			`json = response.parsed_body`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites, which need to expose their robots.txt contents to a parent site. 2018-04-16 15:43:20 -04:00			`expect(json).to be_present`
			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-15 13:47:44 -04:00
			`it "includes overridden content if robots.txt is is overridden" do`
			`SiteSetting.overridden_robots_txt = "something"`

			`get "/robots-builder.json"`
			`expect(response.status).to eq(200)`
DEV: Use `response.parsed_body` in specs (#9615) Most of it was autofixed with rubocop-discourse 2.1.1. 2020-05-07 11:04:12 -04:00			`json = response.parsed_body`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-15 13:47:44 -04:00			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`expect(json['overridden']).to eq("something")`
			`end`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites, which need to expose their robots.txt contents to a parent site. 2018-04-16 15:43:20 -04:00			`end`

Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 00:06:56 -04:00			`describe '#index' do`
FEATURE: allow for setting crawl delay per user agent. Also moved Bing to a default crawl delay, so no more than one request every 5 seconds is allowed. New site settings: "slow_down_crawler_user_agents" - list of crawlers that will be slowed down; "slow_down_crawler_rate" - how many seconds to wait between requests. Not enforced server side yet. 2018-04-05 20:15:23 -04:00
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-15 13:47:44 -04:00			`context "header for when the content is overridden" do`
			`it "is not prepended if there are no overrides" do`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is prepended if there are overrides and the user is admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is not prepended if the user is not admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`
			`end`

prefix the robots.txt rules with the directory when using subfolder 2018-04-11 16:05:02 -04:00			`context 'subfolder' do`
			`it 'prefixes the rules with the directory' do`
DEV: improve usability of subfolder specs Previously people were not consistent about mocking which left internals in a fragile state when running subfolder specs. This introduces a simple helper `set_subfolder` which you can use to set the subfolder for the spec. It takes care of proper configuration of subfolder and teardown. ``` # usage set_subfolder "/my_amazing_subfolder" ``` You should no longer stub base_uri or global_settings 2019-11-15 00:48:24 -05:00			`set_subfolder "/forum"`

prefix the robots.txt rules with the directory when using subfolder 2018-04-11 16:05:02 -04:00			`get '/robots.txt'`
Fix spec (#10539) 2020-08-26 17:31:02 -04:00			`expect(response.body).to include("\nDisallow: /forum/email/")`
prefix the robots.txt rules with the directory when using subfolder 2018-04-11 16:05:02 -04:00			`end`
			`end`

FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`context 'allow_index_in_robots_txt is true' do`

			`def expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`expect(allow_index).to be_present`
			`expect(disallow_index).to be_present`

			`allow_section = allow_index < disallow_index ?`
			`response.body[allow_index...disallow_index] : response.body[allow_index..-1]`

FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-25 14:30:57 -04:00			`expect(allow_section).to include('Disallow: /auth/')`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`expect(allow_section).to_not include("Disallow: /\n")`

			`disallowed_section = allow_index < disallow_index ?`
			`response.body[disallow_index..-1] : response.body[disallow_index...allow_index]`
			`expect(disallowed_section).to include("Disallow: /\n")`
			`end`

			`it "returns index when indexing is allowed" do`
			`SiteSetting.allow_index_in_robots_txt = true`
			`get '/robots.txt'`

			`i = response.body.index('User-agent: *')`
			`expect(i).to be_present`
FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-25 14:30:57 -04:00			`expect(response.body[i..-1]).to include("Disallow: /auth/")`
FEATURE: explicitly ban outlier traffic sources in robots.txt (#11553) Googlebot handles no-index headers very elegantly. It advises to leave as many routes as possible open and uses headers for high fidelity rules regarding indexes. Discourse adds special `x-robot-tags` noindex headers to users, badges, groups, search and tag routes. Following up on b52143feff8c32f2 we now have it so Googlebot gets special handling. Rest of the crawlers get a far more aggressive disallow list to protect against excessive crawling. 2020-12-22 16:51:14 -05:00			`# we have to insert Googlebot for special handling`
			`expect(response.body[i..-1]).to include("User-agent: Googlebot")`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`end`

FIX: use allowlist and blocklist terminology (#10209). This PR renames whitelist to allowlist and blacklist to blocklist. 2020-07-26 20:23:54 -04:00			`it "can allowlist user agents" do`
			`SiteSetting.allowed_crawler_user_agents = "Googlebot\|Twitterbot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`disallow_all_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)`
			`end`

FIX: use allowlist and blocklist terminology (#10209). This PR renames whitelist to allowlist and blacklist to blocklist. 2020-07-26 20:23:54 -04:00			`it "can blocklist user agents" do`
			`SiteSetting.blocked_crawler_user_agents = "Googlebot\|Twitterbot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`allow_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`end`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 00:06:56 -04:00
FIX: use allowlist and blocklist terminology (#10209). This PR renames whitelist to allowlist and blacklist to blocklist. 2020-07-26 20:23:54 -04:00			`it "ignores blocklist if allowlist is set" do`
			`SiteSetting.allowed_crawler_user_agents = "Googlebot\|Twitterbot"`
			`SiteSetting.blocked_crawler_user_agents = "Bananabot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-15 17:10:45 -04:00			`get '/robots.txt'`
			`expect(response.body).to_not include('Bananabot')`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-10 19:02:57 -05:00			`end`

Merge branch 'whitespace-cleanese' of git://github.com/goshakkk/discourse Conflicts: lib/oneboxer.rb lib/oneboxer/whitelist.rb spec/controllers/robots_txt_controller_spec.rb 2013-02-26 10:42:49 -05:00			`it "returns noindex when indexing is disallowed" do`
Remove site setting stubbing (Round 1) 2015-06-03 06:14:00 -04:00			`SiteSetting.allow_index_in_robots_txt = false`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 00:06:56 -04:00			`get '/robots.txt'`
remove trailing whitespaces :heart: 2013-02-25 11:42:20 -05:00
FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-25 14:30:57 -04:00			`expect(response.body).to_not include("Disallow: /auth/")`
FEATURE: let Google index pages so it can remove them. Google insists on indexing pages so it can figure out if they can be removed from the index. See: https://support.google.com/webmasters/answer/6332384?hl=en This change ensures that we have special behavior for Googlebot, where we allow indexing but block the actual indexing via X-Robots-Tag. 2020-05-10 22:14:21 -04:00			`expect(response.body).to include("User-agent: googlebot\nAllow")`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 00:06:56 -04:00			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-15 13:47:44 -04:00
			`it "returns overridden robots.txt if the file is overridden" do`
			`SiteSetting.overridden_robots_txt = "blah whatever"`
			`get '/robots.txt'`
			`expect(response.status).to eq(200)`
			`expect(response.body).to eq(SiteSetting.overridden_robots_txt)`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-10 19:02:57 -05:00			`end`
			`end`