From 54e813e96403cb33a6d4e82696dbf73ea703d033 Mon Sep 17 00:00:00 2001 From: Ted Johansson Date: Fri, 1 Dec 2023 15:03:06 +0800 Subject: [PATCH] FIX: Don't error out when trying to retrieve title and URL won't encode (#24660) --- lib/final_destination.rb | 4 ++++ lib/retrieve_title.rb | 9 +++++++-- spec/lib/final_destination_spec.rb | 6 ++++++ spec/lib/retrieve_title_spec.rb | 6 ++++++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 973c457e9f2..35aac08e3b0 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -10,6 +10,8 @@ require "url_helper" class FinalDestination class SSRFError < SocketError end + class UrlEncodingError < ArgumentError + end MAX_REQUEST_TIME_SECONDS = 10 MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5 @@ -457,6 +459,8 @@ class FinalDestination def normalized_url UrlHelper.normalized_encode(@url) + rescue ArgumentError => e + raise UrlEncodingError, e.message end def log(log_level, message) diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 8ea76ecec29..d75cae8f5ac 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -2,6 +2,11 @@ module RetrieveTitle CRAWL_TIMEOUT = 1 + UNRECOVERABLE_ERRORS = [ + Net::ReadTimeout, + FinalDestination::SSRFError, + FinalDestination::UrlEncodingError, + ] def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false) fetch_title( @@ -9,8 +14,8 @@ module RetrieveTitle max_redirects: max_redirects, initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit, ) - rescue Net::ReadTimeout, FinalDestination::SSRFError - # do nothing for Net::ReadTimeout errors + rescue *UNRECOVERABLE_ERRORS + # ¯\_(ツ)_/¯ end def self.extract_title(html, encoding = nil) diff --git a/spec/lib/final_destination_spec.rb b/spec/lib/final_destination_spec.rb index c99cac3df4b..aa316794ce5 100644 --- a/spec/lib/final_destination_spec.rb +++ b/spec/lib/final_destination_spec.rb @@ -60,6 +60,12 @@ RSpec.describe FinalDestination do expect(fd.ignored).to eq(%w[test.localhost google.com meta.discourse.org]) end + it "raises an error when URL is too long to encode" do + expect { + FinalDestination.new("https://meta.discourse.org/" + "x" * UrlHelper::MAX_URL_LENGTH) + }.to raise_error(FinalDestination::UrlEncodingError) + end + describe ".resolve" do it "has a ready status code before anything happens" do expect(fd("https://eviltrout.com").status).to eq(:ready) diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb index 685696efb50..1a3cc239774 100644 --- a/spec/lib/retrieve_title_spec.rb +++ b/spec/lib/retrieve_title_spec.rb @@ -207,6 +207,12 @@ RSpec.describe RetrieveTitle do expect(RetrieveTitle.crawl("https://example.com")).to eq(nil) end + + it "ignores URL encoding errors" do + described_class.stubs(:fetch_title).raises(FinalDestination::UrlEncodingError) + + expect(RetrieveTitle.crawl("https://example.com")).to eq(nil) + end end describe ".fetch_title" do