FIX: Make HTML scrubber work with deep HTML (#12619)
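SearchIndexer and ReindexSearch used to raise an exception for posts with very deep or invalid HTML content; the scrubber now rescues the HTML parser's ArgumentError and returns an empty string for such posts.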
This commit is contained in:
parent
cdd5b60447
commit
c10df4b58d
|
@ -284,7 +284,11 @@ class SearchIndexer
|
||||||
def self.scrub(html, strip_diacritics: false)
|
def self.scrub(html, strip_diacritics: false)
|
||||||
return +"" if html.blank?
|
return +"" if html.blank?
|
||||||
|
|
||||||
|
begin
|
||||||
document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
|
document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
|
||||||
|
rescue ArgumentError
|
||||||
|
return +""
|
||||||
|
end
|
||||||
|
|
||||||
nodes = document.css(
|
nodes = document.css(
|
||||||
"div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
|
"div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
|
||||||
|
|
|
@ -144,6 +144,19 @@ describe SearchIndexer do
|
||||||
.to change { post.reload.post_search_data.search_data }
|
.to change { post.reload.post_search_data.search_data }
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Regression example with no explicit expectation: it passes as long as
# SearchIndexer.update_posts_index does not raise for the cooked HTML below.
it 'should work with invalid HTML' do
|
||||||
|
# Cooked HTML is an unclosed "<FD>" tag repeated Nokogumbo::DEFAULT_MAX_TREE_DEPTH times;
# presumably this nests deeply enough to hit the parser's tree-depth limit — TODO confirm.
post.update!(cooked: "<FD>" * Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
|
||||||
|
|
||||||
|
# Index the post directly, passing each field from the post and its topic.
SearchIndexer.update_posts_index(
|
||||||
|
post_id: post.id,
|
||||||
|
topic_title: post.topic.title,
|
||||||
|
category_name: post.topic.category&.name,
|
||||||
|
topic_tags: post.topic.tags.map(&:name).join(' '),
|
||||||
|
cooked: post.cooked,
|
||||||
|
private_message: post.topic.private_message?
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
it 'should not index posts with empty raw' do
|
it 'should not index posts with empty raw' do
|
||||||
expect do
|
expect do
|
||||||
post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
|
post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
|
||||||
|
|
Loading…
Reference in New Issue