FIX: increase chunk size to fetch title tag correctly (#14144)

2021-09-03 13:15:58 +05:30 · 2021-09-03 13:15:58 +05:30 · 763f48abc7
parent b47a4d0207
commit 763f48abc7
2 changed files with 25 additions and 2 deletions
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@ -11,6 +11,9 @@ module RetrieveTitle

  def self.extract_title(html, encoding = nil)
    title = nil
+    if html =~ /<title>/ && html !~ /<\/title>/
+      return nil
+    end
    if doc = Nokogiri::HTML5(html, nil, encoding)

      title = doc.at('title')&.inner_text
@ -44,8 +47,8 @@ module RetrieveTitle
    return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
    return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/

-    # default is 10k
-    10
+    # default is 20k
+    20
  end

  # Fetch the beginning of a HTML document at a url
--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@ -101,4 +101,24 @@ describe RetrieveTitle do
      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
    end
  end
+
+  context 'fetch_title' do
+    it "does not parse broken title tag" do
+      # WebMock returns the body whole (no chunking), so this truncated body simulates a chunk that ends mid-<title>
+      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
+        to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti" , headers: {})
+
+      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
+      expect(title).to eq(nil)
+    end
+
+    it "can parse correct title tag" do
+      # WebMock returns the body whole (no chunking); here the body contains a complete <title> element
+      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
+        to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title>" , headers: {})
+
+      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
+      expect(title).to eq("Internet - Wikipedia")
+    end
+  end
 end