2019-05-02 18:17:27 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2015-06-12 06:02:36 -04:00
|
|
|
# Class-level helpers for parsing, encoding, normalizing and rewriting URLs
# (local/CDN/secure-upload variants). All methods are defined on the class itself.
class UrlHelper
  # Longest URL (in characters) that `normalized_encode` will accept.
  MAX_URL_LENGTH = 100_000

  # At the moment this handles invalid URLs that the browser address bar accepts,
  # where a second # is not encoded.
  #
  # Longer term we can add support for simpleidn and encode unicode domains.
  #
  # @param url [String] the URL to parse
  # @return [URI::Generic, nil] the parsed URI, or nil when a URI::Error is raised
  def self.relaxed_parse(url)
    url, fragment = url.split("#", 2)

    # URI.parse either returns a URI object or raises, so no nil check is needed.
    uri = URI.parse(url)

    # Addressable::URI::CharacterClasses::UNRESERVED is used here because without it
    # the # in the fragment is not encoded
    if fragment&.include?("#")
      fragment =
        Addressable::URI.encode_component(
          fragment,
          Addressable::URI::CharacterClasses::UNRESERVED,
        )
    end

    uri.fragment = fragment
    uri
  rescue URI::Error
    nil
  end

  # Encodes `url` with Addressable and parses the result with Ruby's URI.
  def self.encode_and_parse(url)
    URI.parse(Addressable::URI.encode(url))
  end

  # Thin wrapper around Addressable::URI.encode.
  def self.encode(url)
    Addressable::URI.encode(url)
  end

  # Thin wrapper around Addressable::URI.unencode.
  def self.unencode(url)
    Addressable::URI.unencode(url)
  end

  # Thin wrapper around Addressable::URI.encode_component.
  def self.encode_component(url_component)
    Addressable::URI.encode_component(url_component)
  end

  # Whether `url` is considered local: it is non-blank and either the store reports
  # it as uploaded, it starts with one of the base-path asset directories, or it
  # starts with the asset host (falling back to the base URL).
  #
  # @param url [String, nil]
  # @return [Boolean] (truthy/falsy; the store's return value is passed through)
  def self.is_local(url)
    return false unless url.present?

    # \A (not ^) so that only the very start of the string can match; with ^ a URL
    # containing a newline followed by "/assets/" would be treated as local.
    # The base path is escaped so it is always matched literally.
    local_dir = %r{\A#{Regexp.escape(Discourse.base_path.to_s)}/(assets|plugins|images)/}

    Discourse.store.has_been_uploaded?(url) || url.match?(local_dir) ||
      url.start_with?(Discourse.asset_host || Discourse.base_url_no_prefix)
  end

  # Turns a root-relative path ("/foo", but not "//foo") into an absolute URL by
  # prefixing the CDN host, or the base URL when no CDN is given. Any other value
  # is returned untouched.
  #
  # @param url [String, nil]
  # @param cdn [String, nil] CDN host; a protocol-relative value gets "https:" prepended
  # @return [String, nil]
  def self.absolute(url, cdn = Discourse.asset_host)
    cdn = "https:#{cdn}" if cdn&.start_with?("//")

    # Regexp#match? is nil-safe, mirroring the original `url =~ ...` check.
    return url unless %r{\A/[^/]}.match?(url)

    (cdn || Discourse.base_url_no_prefix) + url
  end

  # Same as `absolute`, but always prefixes with the base URL (never the CDN).
  def self.absolute_without_cdn(url)
    absolute(url, nil)
  end

  # Strips a leading "http:" (case-insensitive) so the URL becomes protocol-relative.
  # Other schemes, including "https:", are left as they are.
  def self.schemaless(url)
    url.sub(/\Ahttp:/i, "")
  end

  # Maps an upload URL to its secure-uploads URL and makes it absolute without the CDN.
  def self.secure_proxy_without_cdn(url)
    absolute(Upload.secure_uploads_url_from_upload_url(url), nil)
  end

  # Returns a normalized, encoded string form of `uri`.
  #
  # @param uri [#to_s] the URL to normalize
  # @return [String]
  # @raise [ArgumentError] when the URL is longer than MAX_URL_LENGTH
  def self.normalized_encode(uri)
    url = uri.to_s

    # ArgumentError accepts a single message argument; passing two arguments to
    # `.new` would itself fail with a misleading "wrong number of arguments" error.
    raise ArgumentError, "URL is too long" if url.length > MAX_URL_LENGTH

    # Ideally we will jump straight to `Addressable::URI.normalized_encode`. However,
    # that implementation has some edge-case issues like https://github.com/sporkmonger/addressable/issues/472.
    # To temporarily work around those issues for the majority of cases, we try parsing with `::URI`.
    # If that fails (e.g. due to non-ascii characters) then we will fall back to addressable.
    # Hopefully we can simplify this back to `Addressable::URI.normalized_encode` in the future.

    # edge case where we expect mailto:test%40test.com to normalize to mailto:test@test.com
    return normalize_with_addressable(url) if url.start_with?("mailto:")

    # If it doesn't pass the regexp, it's definitely not gonna parse with URI.parse. Skip
    # to addressable
    return normalize_with_addressable(url) if !url.match?(/\A#{URI.regexp}\z/)

    begin
      normalize_with_ruby_uri(url)
    rescue URI::Error
      normalize_with_addressable(url)
    end
  end

  # Resolves the path of `url` against the Rails routes.
  #
  # Returns nil when encoding raises Addressable::URI::InvalidURIError or a
  # URI::InvalidComponentError is raised; any other error propagates to the caller.
  def self.rails_route_from_url(url)
    path = URI.parse(encode(url)).path
    Rails.application.routes.recognize_path(path)
  rescue Addressable::URI::InvalidURIError, URI::InvalidComponentError
    nil
  end

  # Rewrites a local URL for output: makes it absolute, optionally routes it through
  # secure uploads or the CDN, and strips a leading "http:" scheme. Non-local URLs
  # are returned unchanged.
  #
  # @param url [String]
  # @param secure [Boolean] only honoured when SiteSetting.secure_uploads is enabled
  # @param local [Boolean, nil] pre-computed locality; computed via `is_local` when nil
  # @return [String]
  def self.cook_url(url, secure: false, local: nil)
    is_secure = SiteSetting.secure_uploads && secure
    local = is_local(url) if local.nil?
    return url if !local

    url = is_secure ? secure_proxy_without_cdn(url) : absolute_without_cdn(url)

    # we always want secure uploads to come from
    # Discourse.base_url_no_prefix/secure-uploads
    # to avoid asset_host mixups
    return schemaless(url) if is_secure

    # PERF: avoid parsing url except for extreme conditions
    # this is a hot path used on home page
    filename = url
    filename = File.basename(URI.parse(url).path) if url.include?("?")

    # this technically requires a filename, but will work with a URL as long as it ends with the
    # extension and has no query params
    is_attachment = !FileHelper.is_supported_media?(filename)

    no_cdn = SiteSetting.login_required || SiteSetting.prevent_anons_from_downloading_files
    unless is_attachment && no_cdn
      url = Discourse.store.cdn_url(url)
      url = local_cdn_url(url) if Discourse.store.external?
    end

    schemaless(url)
  rescue URI::Error
    # Parsing failed: return the URL in whatever state it reached before the error.
    url
  end

  # Points `url` at the asset host: upload paths are prefixed with it, anything else
  # has the first occurrence of the base URL swapped for it. Returns `url` unchanged
  # when no asset host is configured.
  def self.local_cdn_url(url)
    return url if Discourse.asset_host.blank?

    if url.start_with?("/#{Discourse.store.upload_path}/")
      "#{Discourse.asset_host}#{url}"
    else
      url.sub(Discourse.base_url_no_prefix, Discourse.asset_host)
    end
  end

  # NOTE: a bare `private` does not affect methods defined with `def self.`, so the
  # helpers below remain publicly callable. Visibility is left as-is on purpose so
  # that existing callers keep working.
  private

  # Normalizes `url` with Addressable, converting a non-ASCII host to its ASCII (IDNA) form.
  def self.normalize_with_addressable(url)
    u = Addressable::URI.normalized_encode(url, Addressable::URI)

    u.host = ::Addressable::IDNA.to_ascii(u.host) if u.host && !u.host.ascii_only?

    u.to_s
  end

  # Normalizes `url` with Ruby's URI by lower-casing the scheme and host.
  # Raises URI::Error subclasses when the URL cannot be parsed.
  def self.normalize_with_ruby_uri(url)
    u = URI.parse(url)

    u.scheme = u.scheme.downcase if u.scheme && u.scheme != u.scheme.downcase

    u.host = u.host.downcase if u.host && u.host != u.host.downcase

    u.to_s
  end
end
|