FIX: Make HTML scrubber work with deep HTML (#12619)

SearchIndexer and ReindexSearch used to raise an unhandled ArgumentError for
posts with very deep or otherwise invalid HTML content.
This commit is contained in:
Bianca Nenciu 2021-04-07 10:02:00 +03:00 committed by GitHub
parent cdd5b60447
commit c10df4b58d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 1 deletion

View File

@ -284,7 +284,11 @@ class SearchIndexer
def self.scrub(html, strip_diacritics: false)
return +"" if html.blank?
document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
begin
document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
rescue ArgumentError
return +""
end
nodes = document.css(
"div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"

View File

@ -144,6 +144,19 @@ describe SearchIndexer do
.to change { post.reload.post_search_data.search_data }
end
# Regression spec: updating the posts index must not raise when the cooked
# HTML is nested too deeply for the HTML parser.
it 'should work with invalid HTML' do
  # Repeat an unclosed tag DEFAULT_MAX_TREE_DEPTH times; presumably each tag
  # nests inside the previous one, pushing the tree past the parser's depth
  # limit -- TODO confirm against the Nokogumbo docs.
  post.update!(cooked: "<FD>" * Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
  topic = post.topic

  # State the expectation explicitly instead of relying on the example
  # passing implicitly when nothing raises.
  expect do
    SearchIndexer.update_posts_index(
      post_id: post.id,
      topic_title: topic.title,
      category_name: topic.category&.name,
      topic_tags: topic.tags.map(&:name).join(' '),
      cooked: post.cooked,
      private_message: topic.private_message?
    )
  end.not_to raise_error
end
it 'should not index posts with empty raw' do
expect do
post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])