FIX: increase chunk size to fetch title tag correctly (#14144)
This commit is contained in:
parent
b47a4d0207
commit
763f48abc7
|
@ -11,6 +11,9 @@ module RetrieveTitle
|
||||||
|
|
||||||
def self.extract_title(html, encoding = nil)
|
def self.extract_title(html, encoding = nil)
|
||||||
title = nil
|
title = nil
|
||||||
|
if html =~ /<title>/ && html !~ /<\/title>/
|
||||||
|
return nil
|
||||||
|
end
|
||||||
if doc = Nokogiri::HTML5(html, nil, encoding)
|
if doc = Nokogiri::HTML5(html, nil, encoding)
|
||||||
|
|
||||||
title = doc.at('title')&.inner_text
|
title = doc.at('title')&.inner_text
|
||||||
|
@ -44,8 +47,8 @@ module RetrieveTitle
|
||||||
return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
|
return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
|
||||||
return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
|
return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
|
||||||
|
|
||||||
# default is 10k
|
# default is 20k
|
||||||
10
|
20
|
||||||
end
|
end
|
||||||
|
|
||||||
# Fetch the beginning of a HTML document at a url
|
# Fetch the beginning of a HTML document at a url
|
||||||
|
|
|
@ -101,4 +101,24 @@ describe RetrieveTitle do
|
||||||
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
|
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'fetch_title' do
|
||||||
|
it "does not parse broken title tag" do
|
||||||
|
# webmock does not do chunks
|
||||||
|
stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
|
||||||
|
to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti" , headers: {})
|
||||||
|
|
||||||
|
title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
|
||||||
|
expect(title).to eq(nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "can parse correct title tag" do
|
||||||
|
# webmock does not do chunks
|
||||||
|
stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
|
||||||
|
to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title>" , headers: {})
|
||||||
|
|
||||||
|
title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
|
||||||
|
expect(title).to eq("Internet - Wikipedia")
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue