diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index abab83596a6..043fb6a68a4 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -11,6 +11,9 @@ module RetrieveTitle def self.extract_title(html, encoding = nil) title = nil + if html =~ /<title>/ && html !~ /<\/title>/ + return nil + end if doc = Nokogiri::HTML5(html, nil, encoding) title = doc.at('title')&.inner_text @@ -44,8 +47,8 @@ module RetrieveTitle return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/ return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/ - # default is 10k - 10 + # default is 20k + 20 end # Fetch the beginning of a HTML document at a url diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb index b7a5b3f8822..f760a6a1f0f 100644 --- a/spec/components/retrieve_title_spec.rb +++ b/spec/components/retrieve_title_spec.rb @@ -101,4 +101,24 @@ describe RetrieveTitle do expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing") end end + + context 'fetch_title' do + it "does not parse broken title tag" do + # webmock does not do chunks + stub_request(:get, "https://en.wikipedia.org/wiki/Internet"). + to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti" , headers: {}) + + title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet") + expect(title).to eq(nil) + end + + it "can parse correct title tag" do + # webmock does not do chunks + stub_request(:get, "https://en.wikipedia.org/wiki/Internet"). + to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title></head></html>" , headers: {}) + + title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet") + expect(title).to eq("Internet - Wikipedia") + end + end end