diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index abab83596a6..043fb6a68a4 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -11,6 +11,9 @@ module RetrieveTitle
def self.extract_title(html, encoding = nil)
title = nil
+ if html =~ /<title>/ && html !~ /<\/title>/
+ return nil
+ end
if doc = Nokogiri::HTML5(html, nil, encoding)
title = doc.at('title')&.inner_text
@@ -44,8 +47,8 @@ module RetrieveTitle
return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
- # default is 10k
- 10
+ # default is 20k
+ 20
end
# Fetch the beginning of a HTML document at a url
diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb
index b7a5b3f8822..f760a6a1f0f 100644
--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@@ -101,4 +101,24 @@ describe RetrieveTitle do
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
end
end
+
+ context 'fetch_title' do
+ it "does not parse broken title tag" do
+ # WebMock does not deliver the stubbed response in chunks
+ stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
+ to_return(status: 200, body: "Internet - WikipediaInternet - Wikipedia" , headers: {})
+
+ title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
+ expect(title).to eq("Internet - Wikipedia")
+ end
+ end
end