# Patch summary (commentary only; everything from the `diff --git` header onwards is unchanged).
#
# lib/retrieve_title.rb
#   * RetrieveTitle.extract_title gains an optional second parameter, `encoding = nil`,
#     which is forwarded as the third argument of `Nokogiri::HTML5(html, nil, encoding)`.
#     Existing one-argument callers keep working because of the default.
#   * Inside the `fd.get` block a local `encoding` (initialised to nil before the block)
#     is filled in from the response's `content-type` header: the header value is
#     stripped and downcased, then matched against /charset="?([a-z0-9_-]+)"?/, so both
#     quoted and unquoted charset values are accepted.
#   * The captured label is kept only if it equals the downcased name of an entry in
#     `Encoding.list`; otherwise `encoding` is reset to nil, and nil is what gets
#     passed to extract_title.
#   * The header is only inspected while `encoding` is still nil.
#
# spec/components/retrieve_title_spec.rb
#   * Adds the example "detects and uses encoding from Content-Type header", which stubs
#     two responses whose bodies are forced to ASCII-8BIT: one served with
#     charset="utf-8" (quoted) and one EUC-JP encoded body served with charset=euc-jp
#     (unquoted), and expects `RetrieveTitle.crawl` to return the decoded title text.
#
# NOTE(review): `Encoding.list.map(&:name)` appears to yield primary encoding names only,
#   so a charset label that Ruby knows only as an alias would be discarded here.
#   Confirm whether `Encoding.name_list` or `Encoding.find` was intended.
# NOTE(review): when the header carries a charset that fails the check, `encoding` stays
#   nil, so the header is re-parsed and the name list rebuilt for every chunk.
# NOTE(review): `_response` is now read even though its leading underscore suggests an
#   unused block parameter; consider renaming it.
# NOTE(review): as shown here the stubbed bodies contain no <title> element and the
#   patch's line breaks are collapsed; this looks like an extraction artifact.
#   TODO confirm against the original commit before applying.
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 227da9f0cbd..fd652b33902 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -9,9 +9,9 @@ module RetrieveTitle # If there was a connection error, do nothing end - def self.extract_title(html) + def self.extract_title(html, encoding = nil) title = nil - if doc = Nokogiri::HTML5(html) + if doc = Nokogiri::HTML5(html, nil, encoding) title = doc.at('title')&.inner_text @@ -54,6 +54,7 @@ module RetrieveTitle current = nil title = nil + encoding = nil fd.get do |_response, chunk, uri| @@ -62,9 +63,17 @@ module RetrieveTitle else current = chunk end + if !encoding && content_type = _response['content-type']&.strip&.downcase + if content_type =~ /charset="?([a-z0-9_-]+)"?/ + encoding = Regexp.last_match(1) + if !Encoding.list.map(&:name).map(&:downcase).include?(encoding) + encoding = nil + end + end + end max_size = max_chunk_size(uri) * 1024 - title = extract_title(current) + title = extract_title(current, encoding) throw :done if title || max_size < current.length end title diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb index 84ca621ef0c..56e80a41301 100644 --- a/spec/components/retrieve_title_spec.rb +++ b/spec/components/retrieve_title_spec.rb @@ -67,6 +67,27 @@ describe RetrieveTitle do expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing") end - end + it "detects and uses encoding from Content-Type header" do + stub_request(:get, "https://brelksdjflaskfj.com/amazing") + .to_return( + status: 200, + body: "fancy apostrophes ’’’".dup.force_encoding('ASCII-8BIT'), + headers: { 'Content-Type' => 'text/html; charset="utf-8"' } + ) + + IPSocket.stubs(:getaddress).returns('100.2.3.4') + expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("fancy apostrophes ’’’") + + stub_request(:get, "https://brelksdjflaskfj.com/amazing") + .to_return( + status: 200, + body: "japanese こんにちは 
website".encode('EUC-JP').force_encoding('ASCII-8BIT'), + headers: { 'Content-Type' => 'text/html;charset=euc-jp' } + ) + + IPSocket.stubs(:getaddress).returns('100.2.3.4') + expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("japanese こんにちは website") + end + end end