DEV: Update nokogiri to 1.18.1 (#30554)
Nokogiri/libxml is now stricter about the params it receives. It uses kwargs instead of an options object (I fixed an issue there in #30545), doesn't accept nil/blank HTML (fixed here), and, most importantly, handles encoding in a different way: it seems to require explicitly specifying UTF-8. * Build(deps): Bump nokogiri from 1.16.8 to 1.18.1 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.8 to 1.18.1. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.8...v1.18.1) --- updated-dependencies: - dependency-name: nokogiri dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
This commit is contained in:
parent
c1a46995a7
commit
affe26f0dd
|
@ -273,13 +273,13 @@ GEM
|
||||||
net-smtp (0.5.0)
|
net-smtp (0.5.0)
|
||||||
net-protocol
|
net-protocol
|
||||||
nio4r (2.7.4)
|
nio4r (2.7.4)
|
||||||
nokogiri (1.16.8-aarch64-linux)
|
nokogiri (1.18.1-aarch64-linux-gnu)
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
nokogiri (1.16.8-arm64-darwin)
|
nokogiri (1.18.1-arm64-darwin)
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
nokogiri (1.16.8-x86_64-darwin)
|
nokogiri (1.18.1-x86_64-darwin)
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
nokogiri (1.16.8-x86_64-linux)
|
nokogiri (1.18.1-x86_64-linux-gnu)
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
oauth (1.1.0)
|
oauth (1.1.0)
|
||||||
oauth-tty (~> 1.0, >= 1.0.1)
|
oauth-tty (~> 1.0, >= 1.0.1)
|
||||||
|
|
|
@ -365,7 +365,7 @@ class SearchIndexer
|
||||||
return +"" if html.blank?
|
return +"" if html.blank?
|
||||||
|
|
||||||
begin
|
begin
|
||||||
document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
|
document = Nokogiri.HTML5("<div>#{html}</div>", encoding: Encoding::UTF_8)
|
||||||
rescue ArgumentError
|
rescue ArgumentError
|
||||||
return +""
|
return +""
|
||||||
end
|
end
|
||||||
|
@ -401,7 +401,7 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
|
|
||||||
html_scrubber = new
|
html_scrubber = new
|
||||||
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
|
Nokogiri::HTML4::SAX::Parser.new(html_scrubber, Encoding::UTF_8).parse(document.to_html)
|
||||||
html_scrubber.scrubbed.squish
|
html_scrubber.scrubbed.squish
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -277,7 +277,7 @@ class DiscourseDiff
|
||||||
|
|
||||||
def self.tokenize(html)
|
def self.tokenize(html)
|
||||||
me = new
|
me = new
|
||||||
parser = Nokogiri::HTML::SAX::Parser.new(me)
|
parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
|
||||||
parser.parse("<html><body>#{html}</body></html>")
|
parser.parse("<html><body>#{html}</body></html>")
|
||||||
me.tokens
|
me.tokens
|
||||||
end
|
end
|
||||||
|
|
|
@ -27,10 +27,11 @@ class ExcerptParser < Nokogiri::XML::SAX::Document
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.get_excerpt(html, length, options)
|
def self.get_excerpt(html, length, options)
|
||||||
html ||= ""
|
return "" if html.blank?
|
||||||
|
|
||||||
length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX === html
|
length = html.length if html.include?("excerpt") && CUSTOM_EXCERPT_REGEX === html
|
||||||
me = self.new(length, options)
|
me = self.new(length, options)
|
||||||
parser = Nokogiri::HTML::SAX::Parser.new(me)
|
parser = Nokogiri::HTML4::SAX::Parser.new(me, Encoding::UTF_8)
|
||||||
catch(:done) { parser.parse(html) }
|
catch(:done) { parser.parse(html) }
|
||||||
excerpt = me.excerpt.strip
|
excerpt = me.excerpt.strip
|
||||||
excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] ||
|
excerpt = excerpt.gsub(/\s*\n+\s*/, "\n\n") if options[:keep_onebox_source] ||
|
||||||
|
|
|
@ -483,6 +483,8 @@ module PrettyText
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.excerpt(html, max_length, options = {})
|
def self.excerpt(html, max_length, options = {})
|
||||||
|
return "" if html.blank?
|
||||||
|
|
||||||
# TODO: properly fix this HACK in ExcerptParser without introducing XSS
|
# TODO: properly fix this HACK in ExcerptParser without introducing XSS
|
||||||
doc = Nokogiri::HTML5.fragment(html)
|
doc = Nokogiri::HTML5.fragment(html)
|
||||||
DiscourseEvent.trigger(:reduce_excerpt, doc, options)
|
DiscourseEvent.trigger(:reduce_excerpt, doc, options)
|
||||||
|
|
|
@ -24,7 +24,7 @@ module RetrieveTitle
|
||||||
|
|
||||||
doc = nil
|
doc = nil
|
||||||
begin
|
begin
|
||||||
doc = Nokogiri.HTML5(html, nil, encoding)
|
doc = Nokogiri.HTML5(html, encoding:)
|
||||||
rescue ArgumentError
|
rescue ArgumentError
|
||||||
# invalid HTML (Eg: too many attributes, status tree too deep) - ignore
|
# invalid HTML (Eg: too many attributes, status tree too deep) - ignore
|
||||||
# Error in nokogumbo is not specialized, uses generic ArgumentError
|
# Error in nokogumbo is not specialized, uses generic ArgumentError
|
||||||
|
|
|
@ -17,7 +17,7 @@ class ImportScripts::Disqus < ImportScripts::Base
|
||||||
abort("Category #{IMPORT_CATEGORY} not found") if @category.blank?
|
abort("Category #{IMPORT_CATEGORY} not found") if @category.blank?
|
||||||
|
|
||||||
@parser = DisqusSAX.new
|
@parser = DisqusSAX.new
|
||||||
doc = Nokogiri::XML::SAX::Parser.new(@parser)
|
doc = Nokogiri::XML::SAX::Parser.new(@parser, Encoding::UTF_8)
|
||||||
doc.parse_file(IMPORT_FILE)
|
doc.parse_file(IMPORT_FILE)
|
||||||
@parser.normalize
|
@parser.normalize
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue