FIX: Don't error out when trying to retrieve title and URL won't encode (#24660)

This commit is contained in:
Ted Johansson 2023-12-01 15:03:06 +08:00 committed by GitHub
parent aadc104817
commit 54e813e964
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 23 additions and 2 deletions

View File

@ -10,6 +10,8 @@ require "url_helper"
class FinalDestination
  class SSRFError < SocketError
  end

  class UrlEncodingError < ArgumentError
  end

  MAX_REQUEST_TIME_SECONDS = 10
  MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5
@ -457,6 +459,8 @@ class FinalDestination
  def normalized_url
    UrlHelper.normalized_encode(@url)
  rescue ArgumentError => e
    raise UrlEncodingError, e.message
  end

  def log(log_level, message)

View File

@ -2,6 +2,11 @@
module RetrieveTitle
  CRAWL_TIMEOUT = 1

  UNRECOVERABLE_ERRORS = [
    Net::ReadTimeout,
    FinalDestination::SSRFError,
    FinalDestination::UrlEncodingError,
  ]

  def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
    fetch_title(
      max_redirects: max_redirects,
      initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit,
    )
  rescue *UNRECOVERABLE_ERRORS
    # ¯\_(ツ)_/¯
  end

  def self.extract_title(html, encoding = nil)

View File

@ -60,6 +60,12 @@ RSpec.describe FinalDestination do
    expect(fd.ignored).to eq(%w[test.localhost google.com meta.discourse.org])
  end

  it "raises an error when URL is too long to encode" do
    expect {
      FinalDestination.new("https://meta.discourse.org/" + "x" * UrlHelper::MAX_URL_LENGTH)
    }.to raise_error(FinalDestination::UrlEncodingError)
  end

  describe ".resolve" do
    it "has a ready status code before anything happens" do
      expect(fd("https://eviltrout.com").status).to eq(:ready)

View File

@ -207,6 +207,12 @@ RSpec.describe RetrieveTitle do
      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end

    it "ignores URL encoding errors" do
      described_class.stubs(:fetch_title).raises(FinalDestination::UrlEncodingError)
      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end
  end

  describe ".fetch_title" do