diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 227da9f0cbd..fd652b33902 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -9,9 +9,9 @@ module RetrieveTitle # If there was a connection error, do nothing end - def self.extract_title(html) + def self.extract_title(html, encoding = nil) title = nil - if doc = Nokogiri::HTML5(html) + if doc = Nokogiri::HTML5(html, nil, encoding) title = doc.at('title')&.inner_text @@ -54,6 +54,7 @@ module RetrieveTitle current = nil title = nil + encoding = nil fd.get do |_response, chunk, uri| @@ -62,9 +63,17 @@ module RetrieveTitle else current = chunk end + if !encoding && content_type = _response['content-type']&.strip&.downcase + if content_type =~ /charset="?([a-z0-9_-]+)"?/ + encoding = Regexp.last_match(1) + if !Encoding.list.map(&:name).map(&:downcase).include?(encoding) + encoding = nil + end + end + end max_size = max_chunk_size(uri) * 1024 - title = extract_title(current) + title = extract_title(current, encoding) throw :done if title || max_size < current.length end title diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb index 84ca621ef0c..56e80a41301 100644 --- a/spec/components/retrieve_title_spec.rb +++ b/spec/components/retrieve_title_spec.rb @@ -67,6 +67,27 @@ describe RetrieveTitle do expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing") end - end + it "detects and uses encoding from Content-Type header" do + stub_request(:get, "https://brelksdjflaskfj.com/amazing") + .to_return( + status: 200, + body: "