FIX: broken onebox images due to url normalization bugs

normalized_encode in addressable has a number of issues, including https://github.com/sporkmonger/addressable/issues/472

To temporaily work around those issues for the majority of cases, we try parsing with `::URI`. If that fails (e.g. due to non-ascii characters) then we will fall back to addressable.

Hopefully we can simplify this back to `Addressable::URI.normalized_encode` in the future.

This commit also adds support for unicode domain names and emoji domain names with escape_uri.

This removes an unneeded hack checking for pre-signed urls, which are now handled by the general case due to starting off valid and only being minimally normalized. Previous test case continues to pass.

UrlHelper.s3_presigned_url? which was somewhat wide was removed.
This commit is contained in:
Sam Saffron 2022-08-09 14:42:23 +10:00 committed by David Taylor
parent 3755bad03c
commit f0a0252526
4 changed files with 96 additions and 30 deletions

View File

@ -60,9 +60,35 @@ class UrlHelper
self.absolute(Upload.secure_media_url_from_upload_url(url), nil)
end
# This is a poorly named method. In reality, it **normalizes** the given URL,
# and does not escape it. Therefore it's idempotent, and can be used on user
# input which includes a mix of escaped and unescaped characters
def self.escape_uri(uri)
return uri if s3_presigned_url?(uri)
Addressable::URI.normalized_encode(uri)
validated = nil
url = uri.to_s
# Ideally we will jump straight to `Addressable::URI.normalized_encode`. However,
# that implementation has some edge-case issues like https://github.com/sporkmonger/addressable/issues/472.
# To temporaily work around those issues for the majority of cases, we try parsing with `::URI`.
# If that fails (e.g. due to non-ascii characters) then we will fall back to addressable.
# Hopefully we can simplify this back to `Addressable::URI.normalized_encode` in the future.
# edge case where we expect mailto:test%40test.com to normalize to mailto:test@test.com
if url.match(/\Amailto:/)
return normalize_with_addressable(url)
end
# If it doesn't pass the regexp, it's definitely not gonna parse with URI.parse. Skip
# to addressable
if !url.match?(/\A#{URI::regexp}\z/)
return normalize_with_addressable(url)
end
begin
normalize_with_ruby_uri(url)
rescue URI::Error
normalize_with_addressable(url)
end
end
def self.rails_route_from_url(url)
@ -72,10 +98,6 @@ class UrlHelper
nil
end
def self.s3_presigned_url?(url)
url[/x-amz-(algorithm|credential)/i].present?
end
def self.cook_url(url, secure: false, local: nil)
is_secure = SiteSetting.secure_media && secure
local = is_local(url) if local.nil?
@ -120,4 +142,30 @@ class UrlHelper
end
end
private
def self.normalize_with_addressable(url)
u = Addressable::URI.normalized_encode(url, Addressable::URI)
if u.host && !u.host.ascii_only?
u.host = ::Addressable::IDNA.to_ascii(u.host)
end
u.to_s
end
def self.normalize_with_ruby_uri(url)
u = URI.parse(url)
if u.scheme && u.scheme != u.scheme.downcase
u.scheme = u.scheme.downcase
end
if u.host && u.host != u.host.downcase
u.host = u.host.downcase
end
u.to_s
end
end

View File

@ -26,7 +26,7 @@ RSpec.describe FinalDestination do
when 'any-subdomain.ihaveawildcard.com' then '104.25.152.11'
when 'wikipedia.com' then '1.2.3.4'
else
as_ip = IPAddr.new(host)
_as_ip = IPAddr.new(host)
host
end
end

View File

@ -86,6 +86,16 @@ RSpec.describe UrlHelper do
end
describe "#escape_uri" do
it "does not double escape %3A (:)" do
url = "http://discourse.org/%3A/test"
expect(UrlHelper.escape_uri(url)).to eq(url)
end
it "does not double escape %2F (/)" do
url = "http://discourse.org/%2F/test"
expect(UrlHelper.escape_uri(url)).to eq(url)
end
it "doesn't escape simple URL" do
url = UrlHelper.escape_uri('http://example.com/foo/bar')
expect(url).to eq('http://example.com/foo/bar')
@ -107,8 +117,37 @@ RSpec.describe UrlHelper do
end
it "doesn't escape already escaped chars (hash)" do
url = UrlHelper.escape_uri('https://calendar.google.com/calendar/embed?src=en.uk%23holiday%40group.v.calendar.google.com&ctz=Europe%2FLondon')
expect(url).to eq('https://calendar.google.com/calendar/embed?src=en.uk%23holiday@group.v.calendar.google.com&ctz=Europe/London')
url = 'https://calendar.google.com/calendar/embed?src=en.uk%23holiday@group.v.calendar.google.com&ctz=Europe%2FLondon'
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(url)
end
it "leaves reserved chars alone in edge cases" do
skip "see: https://github.com/sporkmonger/addressable/issues/472"
url = "https://example.com/ article/id%3A1.2%2F1/bar"
expected = "https://example.com/%20article/id%3A1.2%2F1/bar"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end
it "handles emoji domain names" do
url = "https://💻.example/💻?computer=💻"
expected = "https://xn--3s8h.example/%F0%9F%92%BB?computer=%F0%9F%92%BB"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end
it "handles special-character domain names" do
url = "https://éxample.com/test"
expected = "https://xn--xample-9ua.com/test"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end
it "performs basic normalization" do
url = "http://EXAMPLE.com/a"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq("http://example.com/a")
end
it "doesn't escape S3 presigned URLs" do

View File

@ -568,27 +568,6 @@ RSpec.describe Upload do
end
end
describe ".signed_url_from_secure_media_url" do
before do
# must be done so signed_url_for_path exists
enable_secure_media
end
it "correctly gives back a signed url from a path only" do
secure_url = "/secure-media-uploads/original/1X/c5a2c4ba0fa390f5aac5c2c1a12416791ebdd9e9.png"
signed_url = Upload.signed_url_from_secure_media_url(secure_url)
expect(signed_url).not_to include("secure-media-uploads")
expect(UrlHelper.s3_presigned_url?(signed_url)).to eq(true)
end
it "correctly gives back a signed url from a full url" do
secure_url = "http://localhost:3000/secure-media-uploads/original/1X/c5a2c4ba0fa390f5aac5c2c1a12416791ebdd9e9.png"
signed_url = Upload.signed_url_from_secure_media_url(secure_url)
expect(signed_url).not_to include(Discourse.base_url)
expect(UrlHelper.s3_presigned_url?(signed_url)).to eq(true)
end
end
describe ".secure_media_url_from_upload_url" do
before do
# must be done so signed_url_for_path exists