From c10df4b58d61b66a417b8d9f07b79a78254cffd3 Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Wed, 7 Apr 2021 10:02:00 +0300 Subject: [PATCH] FIX: Make HTML scrubber work with deep HTML (#12619) SearchIndexer and ReindexSearch used to explode for posts with very deep or invalid HTML content. --- app/services/search_indexer.rb | 6 +++++- spec/services/search_indexer_spec.rb | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 12bdcbc4aaf..45d423925db 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -284,7 +284,11 @@ class SearchIndexer def self.scrub(html, strip_diacritics: false) return +"" if html.blank? - document = Nokogiri::HTML5("
#{html}
", nil, Encoding::UTF_8.to_s) + begin + document = Nokogiri::HTML5("
#{html}
", nil, Encoding::UTF_8.to_s) + rescue ArgumentError + return +"" + end nodes = document.css( "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}" diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index 936dca05377..659ef8cd4f6 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -144,6 +144,19 @@ describe SearchIndexer do .to change { post.reload.post_search_data.search_data } end + it 'should work with invalid HTML' do + post.update!(cooked: "" * Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + + SearchIndexer.update_posts_index( + post_id: post.id, + topic_title: post.topic.title, + category_name: post.topic.category&.name, + topic_tags: post.topic.tags.map(&:name).join(' '), + cooked: post.cooked, + private_message: post.topic.private_message? + ) + end + it 'should not index posts with empty raw' do expect do post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])