FIX: Make HTML scrubber work with deep HTML (#12619)

SearchIndexer and ReindexSearch used to fail with an ArgumentError for posts with very deep or invalid HTML content; the scrubber now returns an empty string for such content instead.
2025-02-06 03:18:23 +00:00 · 2021-04-07 10:02:00 +03:00 · 2021-04-07 10:02:00 +03:00 · c10df4b58d
commit c10df4b58d
parent cdd5b60447
2 changed files with 18 additions and 1 deletion
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -284,7 +284,11 @@ class SearchIndexer
    def self.scrub(html, strip_diacritics: false)
      return +"" if html.blank?

-      document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
+      begin
+        document = Nokogiri::HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
+      rescue ArgumentError
+        return +""
+      end

      nodes = document.css(
        "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -144,6 +144,19 @@ describe SearchIndexer do
        .to change { post.reload.post_search_data.search_data }
    end

+    it 'should work with invalid HTML' do
+      post.update!(cooked: "<FD>" * Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
+
+      SearchIndexer.update_posts_index(
+        post_id: post.id,
+        topic_title: post.topic.title,
+        category_name: post.topic.category&.name,
+        topic_tags: post.topic.tags.map(&:name).join(' '),
+        cooked: post.cooked,
+        private_message: post.topic.private_message?
+      )
+    end
+
    it 'should not index posts with empty raw' do
      expect do
        post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])