2019-04-29 20:27:42 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2018-12-16 18:09:13 -05:00
|
|
|
require "site_settings/validations"
|
|
|
|
|
2022-07-27 22:27:38 -04:00
|
|
|
RSpec.describe SiteSettings::Validations do
|
2023-06-21 10:00:19 -04:00
|
|
|
# Subject under test: an instance of a throwaway anonymous class that mixes in
# the described module, so each validate_* method can be called directly.
subject(:validations) { Class.new.include(described_class).new }
|
2018-12-16 18:09:13 -05:00
|
|
|
|
2022-07-27 12:14:14 -04:00
|
|
|
# Validation of the pipe-delimited default_categories_* settings.
describe "default_categories" do
  fab!(:category)

  it "supports valid categories" do
    expect do
      validations.validate_default_categories_watching(category.id.to_s)
    end.not_to raise_error
  end

  it "won't allow you to input junk categories" do
    # A value that is not an id at all.
    expect do
      validations.validate_default_categories_watching("junk")
    end.to raise_error(Discourse::InvalidParameters)

    # A real id followed by a second id (presumably one that matches no category).
    expect do
      validations.validate_default_categories_watching("#{category.id}|12312323")
    end.to raise_error(Discourse::InvalidParameters)
  end

  it "prevents using the same category in more than one default group" do
    SiteSetting.default_categories_watching = category.id.to_s

    # With the category already in the watching list, assigning it to either
    # of the other lists must be rejected.
    expect do
      SiteSetting.default_categories_tracking = category.id.to_s
    end.to raise_error(Discourse::InvalidParameters)

    expect do
      SiteSetting.default_categories_normal = category.id.to_s
    end.to raise_error(Discourse::InvalidParameters)
  end
end
|
|
|
|
|
2022-07-27 12:14:14 -04:00
|
|
|
describe "s3 buckets reusage" do
|
2018-12-16 18:09:13 -05:00
|
|
|
# Translated message that the bucket-reuse examples in this group expect.
let(:error_message) { I18n.t("errors.site_settings.s3_bucket_reused") }
|
|
|
|
|
|
|
|
# Shared behaviour for the S3 bucket validators. An including group must supply
# `other_setting_name` (the sibling bucket setting that gets pre-populated) and
# a `validate(new_value)` helper that invokes the validator under test.
shared_examples "s3 bucket validation" do
  # Stores +bucket+ in the sibling setting before the validator is exercised.
  def change_bucket_value(bucket)
    SiteSetting.set(other_setting_name, bucket)
  end

  it "shouldn't raise an error when both buckets are blank" do
    change_bucket_value("")

    validate("")
  end

  it "shouldn't raise an error when only one bucket is set" do
    change_bucket_value("")

    validate("my-awesome-bucket")
  end

  it "shouldn't raise an error when both buckets are equal, but use a different path" do
    change_bucket_value("my-awesome-bucket/foo")

    validate("my-awesome-bucket/bar")
  end

  it "should raise an error when both buckets are equal" do
    change_bucket_value("my-awesome-bucket")

    expect do
      validate("my-awesome-bucket")
    end.to raise_error(Discourse::InvalidParameters, error_message)
  end

  it "should raise an error when both buckets are equal except for a trailing slash" do
    # The trailing slash is tried on the stored value first, then on the new value.
    [
      ["my-awesome-bucket/", "my-awesome-bucket"],
      ["my-awesome-bucket", "my-awesome-bucket/"],
    ].each do |stored_bucket, new_bucket|
      change_bucket_value(stored_bucket)

      expect do
        validate(new_bucket)
      end.to raise_error(Discourse::InvalidParameters, error_message)
    end
  end
end
|
|
|
|
|
|
|
|
# Backup-bucket validator; the sibling setting is the upload bucket.
describe "#validate_s3_backup_bucket" do
  let(:other_setting_name) do
    "s3_upload_bucket"
  end

  # Entry point used by the shared examples and by the example below.
  def validate(new_value)
    validations.validate_s3_backup_bucket(new_value)
  end

  it_behaves_like "s3 bucket validation"

  it "shouldn't raise an error when the 's3_backup_bucket' is a subdirectory of 's3_upload_bucket'" do
    # Each pair is [upload bucket, backup bucket nested beneath it].
    [
      ["my-awesome-bucket", "my-awesome-bucket/backups"],
      ["my-awesome-bucket/foo", "my-awesome-bucket/foo/backups"],
    ].each do |upload_bucket, backup_bucket|
      SiteSetting.s3_upload_bucket = upload_bucket

      validate(backup_bucket)
    end
  end
end
|
|
|
|
|
|
|
|
# Upload-bucket validator; the sibling setting is the backup bucket.
describe "#validate_s3_upload_bucket" do
  let(:other_setting_name) do
    "s3_backup_bucket"
  end

  # Entry point used by the shared examples and by the examples below.
  def validate(new_value)
    validations.validate_s3_upload_bucket(new_value)
  end

  it_behaves_like "s3 bucket validation"

  it "should raise an error when the 's3_upload_bucket' is a subdirectory of 's3_backup_bucket'" do
    # Each pair is [backup bucket, upload bucket nested beneath it].
    [
      ["my-awesome-bucket", "my-awesome-bucket/uploads"],
      ["my-awesome-bucket/foo", "my-awesome-bucket/foo/uploads"],
    ].each do |backup_bucket, upload_bucket|
      SiteSetting.s3_backup_bucket = backup_bucket

      expect do
        validate(upload_bucket)
      end.to raise_error(Discourse::InvalidParameters, error_message)
    end
  end

  it "cannot be made blank unless the setting is false" do
    SiteSetting.s3_backup_bucket = "really-real-cool-bucket"
    SiteSetting.enable_s3_uploads = true

    # Blanking the bucket is rejected while S3 uploads are switched on ...
    expect do
      validate("")
    end.to raise_error(Discourse::InvalidParameters)

    # ... and goes through once they are switched off.
    SiteSetting.enable_s3_uploads = false
    validate("")
  end
end
|
|
|
|
end
|
2019-11-15 01:05:10 -05:00
|
|
|
|
2020-08-19 14:16:31 -04:00
|
|
|
describe "enforce second factor & local/auth provider login interplay" do
|
2019-11-15 01:05:10 -05:00
|
|
|
# Exercises validate_enforce_second_factor against the login-related settings
# toggled in each context: local logins, social auth providers and
# DiscourseConnect (SSO).
describe "#validate_enforce_second_factor" do
  context "when local logins are disabled" do
    let(:error_message) do
      I18n.t("errors.site_settings.second_factor_cannot_be_enforced_with_disabled_local_login")
    end

    before { SiteSetting.enable_local_logins = false }

    it "should raise an error" do
      expect { validations.validate_enforce_second_factor("t") }.to raise_error(
        Discourse::InvalidParameters,
        error_message,
      )
    end
  end

  context "when local logins are enabled" do
    before { SiteSetting.enable_local_logins = true }

    it "should be ok" do
      expect { validations.validate_enforce_second_factor("t") }.not_to raise_error
    end
  end

  context "when social logins are enabled" do
    # The expected message lists the two providers enabled in the `before` hook.
    let(:error_message) do
      I18n.t(
        "errors.site_settings.second_factor_cannot_enforce_with_socials",
        auth_provider_names: "facebook, github",
      )
    end

    before do
      SiteSetting.enable_facebook_logins = true
      SiteSetting.enable_github_logins = true
    end

    it "raises an error, and specifies the auth providers" do
      expect { validations.validate_enforce_second_factor("all") }.to raise_error(
        Discourse::InvalidParameters,
        error_message,
      )
    end
  end

  context "when SSO is enabled" do
    let(:error_message) do
      I18n.t(
        "errors.site_settings.second_factor_cannot_be_enforced_with_discourse_connect_enabled",
      )
    end

    before do
      # The URL is assigned before the feature is switched on.
      # NOTE(review): presumably enable_discourse_connect requires a URL - confirm.
      SiteSetting.discourse_connect_url = "https://www.example.com/sso"
      SiteSetting.enable_discourse_connect = true
    end

    it "should raise an error" do
      expect { validations.validate_enforce_second_factor("t") }.to raise_error(
        Discourse::InvalidParameters,
        error_message,
      )
    end
  end
end
|
|
|
|
|
|
|
|
# Local logins may only be switched off while second factor is not enforced.
describe "#validate_enable_local_logins" do
  let(:error_message) do
    I18n.t("errors.site_settings.local_login_cannot_be_disabled_if_second_factor_enforced")
  end

  context "when the new value is false" do
    context "when enforce second factor is enabled" do
      before do
        SiteSetting.enforce_second_factor = "all"
      end

      it "should raise an error" do
        expect do
          validations.validate_enable_local_logins("f")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end

    context "when enforce second factor is disabled" do
      before do
        SiteSetting.enforce_second_factor = "no"
      end

      it "should be ok" do
        expect do
          validations.validate_enable_local_logins("f")
        end.not_to raise_error
      end
    end
  end

  context "when the new value is true" do
    it "should be ok" do
      expect do
        validations.validate_enable_local_logins("t")
      end.not_to raise_error
    end
  end
end
|
|
|
|
|
2020-10-28 22:01:06 -04:00
|
|
|
# A CORS origin ending in "/" is rejected with a dedicated message.
describe "#validate_cors_origins" do
  let(:error_message) do
    I18n.t("errors.site_settings.cors_origins_should_not_have_trailing_slash")
  end

  context "when the new value has trailing slash" do
    it "should raise an error" do
      expect do
        validations.validate_cors_origins("https://www.rainbows.com/")
      end.to raise_error(Discourse::InvalidParameters, error_message)
    end
  end
end
|
|
|
|
|
2020-11-05 19:33:19 -05:00
|
|
|
# Page publishing can be enabled unless secure uploads is turned on.
describe "#validate_enable_page_publishing" do
  context "when the new value is true" do
    it "is ok" do
      expect do
        validations.validate_enable_page_publishing("t")
      end.not_to raise_error
    end

    context "if secure uploads is enabled" do
      let(:error_message) do
        I18n.t("errors.site_settings.page_publishing_requirements")
      end

      # enable_secure_uploads is a helper defined outside this file.
      before do
        enable_secure_uploads
      end

      it "is not ok" do
        expect do
          validations.validate_enable_page_publishing("t")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end
  end
end
|
|
|
|
|
2023-06-06 01:47:40 -04:00
|
|
|
# ACLs may always be turned on; turning them off fails while secure uploads is enabled.
describe "#validate_s3_use_acls" do
  context "when the new value is true" do
    it "is ok" do
      expect do
        validations.validate_s3_use_acls("t")
      end.not_to raise_error
    end
  end

  context "when the new value is false" do
    it "is ok" do
      expect do
        validations.validate_s3_use_acls("f")
      end.not_to raise_error
    end

    context "if secure uploads is enabled" do
      let(:error_message) do
        I18n.t("errors.site_settings.s3_use_acls_requirements")
      end

      # enable_secure_uploads is a helper defined outside this file.
      before do
        enable_secure_uploads
      end

      it "is not ok" do
        expect do
          validations.validate_s3_use_acls("f")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end
  end
end
|
|
|
|
|
2022-09-28 19:24:33 -04:00
|
|
|
# Enabling secure uploads is checked against the S3 upload settings
# (site-level or global) and the s3_use_acls setting.
describe "#validate_secure_uploads" do
  let(:error_message) do
    I18n.t("errors.site_settings.secure_uploads_requirements")
  end

  context "when the new secure uploads value is true" do
    context "if site setting for enable_s3_uploads is enabled" do
      before do
        SiteSetting.enable_s3_uploads = true
      end

      it "should be ok" do
        expect do
          validations.validate_secure_uploads("t")
        end.not_to raise_error
      end
    end

    context "if site setting for enable_s3_uploads is not enabled" do
      before do
        SiteSetting.enable_s3_uploads = false
      end

      it "is not ok" do
        expect do
          validations.validate_secure_uploads("t")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end

      # A globally configured S3 makes the disabled site setting acceptable.
      context "if global s3 setting is enabled" do
        before do
          GlobalSetting.stubs(:use_s3?).returns(true)
        end

        it "should be ok" do
          expect do
            validations.validate_secure_uploads("t")
          end.not_to raise_error
        end
      end
    end

    context "if site setting for s3_use_acls is not enabled" do
      # NOTE(review): enable_s3_uploads is never set in this context. If it
      # defaults to off, the error asserted below may come from that
      # requirement rather than from s3_use_acls - confirm against the validator.
      before do
        SiteSetting.s3_use_acls = false
      end

      it "is not ok" do
        expect do
          validations.validate_secure_uploads("t")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end
  end
end
|
|
|
|
|
|
|
|
# Turning on S3 uploads is checked against the global S3 flag and the upload bucket.
describe "#validate_enable_s3_uploads" do
  let(:error_message) do
    I18n.t("errors.site_settings.cannot_enable_s3_uploads_when_s3_enabled_globally")
  end

  context "when the new value is true" do
    context "when s3 uploads are already globally enabled" do
      before do
        GlobalSetting.stubs(:use_s3?).returns(true)
      end

      it "is not ok" do
        expect do
          validations.validate_enable_s3_uploads("t")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end

    context "when s3 uploads are not already globally enabled" do
      before do
        GlobalSetting.stubs(:use_s3?).returns(false)
      end

      it "should be ok" do
        expect do
          validations.validate_enable_s3_uploads("t")
        end.not_to raise_error
      end
    end

    context "when the s3_upload_bucket is blank" do
      # Overrides the group-level message: a missing bucket has its own error text.
      let(:error_message) do
        I18n.t("errors.site_settings.s3_upload_bucket_is_required")
      end

      before do
        SiteSetting.s3_upload_bucket = nil
      end

      it "is not ok" do
        expect do
          validations.validate_enable_s3_uploads("t")
        end.to raise_error(Discourse::InvalidParameters, error_message)
      end
    end

    context "when the s3_upload_bucket is not blank" do
      before do
        SiteSetting.s3_upload_bucket = "some-bucket"
      end

      it "should be ok" do
        expect do
          validations.validate_enable_s3_uploads("t")
        end.not_to raise_error
      end
    end
  end
end
|
|
|
|
end
|
FEATURE: Replace `Crawl-delay` directive with proper rate limiting (#15131)
We have a couple of site settings, `slow_down_crawler_user_agents` and `slow_down_crawler_rate`, that are meant to allow site owners to signal to specific crawlers that they're crawling the site too aggressively and that they should slow down.
When a crawler is added to the `slow_down_crawler_user_agents` setting, Discourse currently adds a `Crawl-delay` directive for that crawler in `/robots.txt`. Unfortunately, many crawlers don't support the `Crawl-delay` directive in `/robots.txt` which leaves the site owners no options if a crawler is crawling the site too aggressively.
This PR replaces the `Crawl-delay` directive with proper rate limiting for crawlers added to the `slow_down_crawler_user_agents` list. On every request made by a non-logged in user, Discourse will check the User Agent string and if it contains one of the values of the `slow_down_crawler_user_agents` list, Discourse will only allow 1 request every N seconds for that User Agent (N is the value of the `slow_down_crawler_rate` setting) and the rest of requests made within the same interval will get a 429 response.
The `slow_down_crawler_user_agents` setting becomes quite dangerous with this PR since it could rate limit lots if not all of anonymous traffic if the setting is not used appropriately. So to protect against this scenario, we've added a couple of new validations to the setting when it's changed:
1) each value added to the setting must be 3 characters or longer
2) each value cannot be a substring of tokens found in popular browser User Agent. The current list of prohibited values is: apple, windows, linux, ubuntu, gecko, firefox, chrome, safari, applewebkit, webkit, mozilla, macintosh, khtml, intel, osx, os x, iphone, ipad and mac.
2021-11-30 04:55:25 -05:00
|
|
|
|
2022-07-27 12:14:14 -04:00
|
|
|
describe "slow_down_crawler_user_agents" do
|
FEATURE: Replace `Crawl-delay` directive with proper rate limiting (#15131)
We have a couple of site settings, `slow_down_crawler_user_agents` and `slow_down_crawler_rate`, that are meant to allow site owners to signal to specific crawlers that they're crawling the site too aggressively and that they should slow down.
When a crawler is added to the `slow_down_crawler_user_agents` setting, Discourse currently adds a `Crawl-delay` directive for that crawler in `/robots.txt`. Unfortunately, many crawlers don't support the `Crawl-delay` directive in `/robots.txt` which leaves the site owners no options if a crawler is crawling the site too aggressively.
This PR replaces the `Crawl-delay` directive with proper rate limiting for crawlers added to the `slow_down_crawler_user_agents` list. On every request made by a non-logged in user, Discourse will check the User Agent string and if it contains one of the values of the `slow_down_crawler_user_agents` list, Discourse will only allow 1 request every N seconds for that User Agent (N is the value of the `slow_down_crawler_rate` setting) and the rest of requests made within the same interval will get a 429 response.
The `slow_down_crawler_user_agents` setting becomes quite dangerous with this PR since it could rate limit lots if not all of anonymous traffic if the setting is not used appropriately. So to protect against this scenario, we've added a couple of new validations to the setting when it's changed:
1) each value added to the setting must be 3 characters or longer
2) each value cannot be a substring of tokens found in popular browser User Agent. The current list of prohibited values is: apple, windows, linux, ubuntu, gecko, firefox, chrome, safari, applewebkit, webkit, mozilla, macintosh, khtml, intel, osx, os x, iphone, ipad and mac.
2021-11-30 04:55:25 -05:00
|
|
|
# Translated error expected when an entry has fewer than 3 characters.
let(:too_short_message) { I18n.t("errors.site_settings.slow_down_crawler_user_agent_must_be_at_least_3_characters") }
|
|
|
|
# Translated error for entries matching a popular-browser token; the message
# interpolates the full prohibited list, joined with the locale's comma connector.
let(:popular_browser_message) do
  separator = I18n.t("word_connector.comma")
  prohibited_list = SiteSettings::Validations::PROHIBITED_USER_AGENT_STRINGS.join(separator)

  I18n.t(
    "errors.site_settings.slow_down_crawler_user_agent_cannot_be_popular_browsers",
    values: prohibited_list,
  )
end
|
|
|
|
|
|
|
|
it "cannot contain a user agent that's shorter than 3 characters" do
  # Every value below holds at least one pipe-separated entry shorter than
  # 3 characters (including the empty entry next to a leading/trailing pipe).
  ["ao|acsw", "up", "a|", "|a"].each do |setting_value|
    expect do
      validations.validate_slow_down_crawler_user_agents(setting_value)
    end.to raise_error(Discourse::InvalidParameters, too_short_message)
  end
end
|
|
|
|
|
|
|
|
it "allows user agents that are 3 characters or longer" do
  # Single entries and a pipe-separated pair, all at least 3 characters long.
  ["aoc", "anuq", "pupsc|kcx"].each do |setting_value|
    expect do
      validations.validate_slow_down_crawler_user_agents(setting_value)
    end.not_to raise_error
  end
end
|
|
|
|
|
|
|
|
# A blank value is accepted without raising.
it "allows the setting to be empty" do
  blank_value = ""

  expect do
    validations.validate_slow_down_crawler_user_agents(blank_value)
  end.not_to raise_error
end
|
|
|
|
|
|
|
|
# The setting rate limits anonymous traffic whose User Agent contains one of
# its values, so values matching tokens of popular browsers are rejected.
it "cannot contain a token of a popular browser user agent" do
  rejected_values = [
    "mOzilla", # mixed case spelling of a prohibited token
    "chRome|badcrawler", # a prohibited token alongside another entry
    "html|badcrawler", # "html" is a substring of the prohibited "khtml" token
  ]

  rejected_values.each do |agents|
    expect do
      validations.validate_slow_down_crawler_user_agents(agents)
    end.to raise_error(Discourse::InvalidParameters, popular_browser_message)
  end
end
|
|
|
|
end
|
2022-05-05 14:13:17 -04:00
|
|
|
|
|
|
|
# Covers how validate_strip_image_metadata reacts to the current value of
# SiteSetting.composer_media_optimization_image_enabled.
describe "strip image metadata and composer media optimization interplay" do
  describe "#validate_strip_image_metadata" do
    # Translated message expected when disabling the setting is refused.
    let(:refusal_message) do
      translation_key =
        "errors.site_settings.strip_image_metadata_cannot_be_disabled_if_composer_media_optimization_image_enabled"
      I18n.t(translation_key)
    end

    context "when the new value is false" do
      context "when composer_media_optimization_image_enabled is enabled" do
        before do
          SiteSetting.composer_media_optimization_image_enabled = true
        end

        it "should raise an error" do
          expect do
            validations.validate_strip_image_metadata("f")
          end.to raise_error(Discourse::InvalidParameters, refusal_message)
        end
      end

      context "when composer_media_optimization_image_enabled is disabled" do
        before do
          SiteSetting.composer_media_optimization_image_enabled = false
        end

        it "should be ok" do
          expect do
            validations.validate_strip_image_metadata("f")
          end.not_to raise_error
        end
      end
    end

    context "when the new value is true" do
      it "should be ok" do
        expect do
          validations.validate_strip_image_metadata("t")
        end.not_to raise_error
      end
    end
  end
end
|
2022-06-02 19:02:57 -04:00
|
|
|
|
|
|
|
# An upload with an SVG extension is rejected; the same upload switched to
# PNG, and a nil value, are both accepted.
describe "#twitter_summary_large_image" do
  it "does not allow SVG image files" do
    image = Fabricate(:upload, url: "/images/logo-dark.svg", extension: "svg")
    svg_rejected_message = I18n.t("errors.site_settings.twitter_summary_large_image_no_svg")

    expect do
      validations.validate_twitter_summary_large_image(image.id)
    end.to raise_error(Discourse::InvalidParameters, svg_rejected_message)

    # Re-point the same record at a PNG and validate it again.
    image.update!(url: "/images/logo-dark.png", extension: "png")

    expect do
      validations.validate_twitter_summary_large_image(image.id)
    end.not_to raise_error

    expect do
      validations.validate_twitter_summary_large_image(nil)
    end.not_to raise_error
  end
end
|
2018-12-16 18:09:13 -05:00
|
|
|
end
|