Add Email::HtmlCleaner for email processing

This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries.
2014-08-26 12:31:47 -07:00 · 2014-08-26 12:31:47 -07:00 · cb55ef4702
parent 0d0225133c
commit cb55ef4702
1 changed files with 120 additions and 0 deletions
--- a/lib/email/html_cleaner.rb
+++ b/lib/email/html_cleaner.rb
@ -0,0 +1,120 @@
+module Email
+  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
+  # generate by stripping out any excess divs or spans, removing styling in
+  # the process (which also makes the html more suitable to be parsed as
+  # Markdown).
+  class HtmlCleaner
+    # Elements to hoist all children out of
+    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
+    # Node types to always delete
+    HTML_DELETE_ELEMENT_TYPES = [Nokogiri::XML::Node::DTD_NODE,
+                                 Nokogiri::XML::Node::COMMENT_NODE,
+                                 ]
+
+    # Private variables:
+    #   @doc - nokogiri document
+    #   @out - same as @doc, but only if trimming has occured
+    def initialize(html)
+      if String === html
+        @doc = Nokogiri::HTML(html)
+      else
+        @doc = html
+      end
+    end
+
+    class << self
+      # Email::HtmlCleaner.trim(inp, opts={})
+      #
+      # Arguments:
+      #   inp - Either a HTML string or a Nokogiri document.
+      # Options:
+      #   :return => :doc, :string
+      #     Specify the desired return type.
+      #     Defaults to the type of the input.
+      #     A value of :string is equivalent to calling get_document_text()
+      #     on the returned document.
+      def trim(inp, opts={})
+        cleaner = HtmlCleaner.new(inp)
+
+        opts[:return] ||= ((String === inp) ? :string : :doc)
+
+        if opts[:return] == :string
+          cleaner.output_html
+        else
+          cleaner.output_document
+        end
+      end
+
+      # Email::HtmlCleaner.get_document_text(doc)
+      #
+      # Get the body portion of the document, including html, as a string.
+      def get_document_text(doc)
+        body = doc.xpath('//body')
+        if body
+          body.inner_html
+        else
+          doc.inner_html
+        end
+      end
+    end
+
+    def output_document
+      @out ||= begin
+                 doc = @doc
+                 trim_process_node doc
+                 add_newlines doc
+                 doc
+      end
+    end
+
+    def output_html
+      HtmlCleaner.get_document_text(output_document)
+    end
+
+    private
+
+    def add_newlines(doc)
+      doc.xpath('//br').each do |br|
+        br.replace(Nokogiri::XML::Text.new("\n", doc))
+      end
+    end
+
+    def trim_process_node(node)
+      if should_hoist?(node)
+        hoisted = trim_hoist_element node
+        hoisted.each { |child| trim_process_node child }
+      elsif should_delete?(node)
+        node.remove
+      else
+        if children = node.children
+          children.each { |child| trim_process_node child }
+        end
+      end
+
+      node
+    end
+
+    def trim_hoist_element(element)
+      hoisted = []
+      element.children.each do |child|
+        element.before(child)
+        hoisted << child
+      end
+      element.remove
+      hoisted
+    end
+
+    def should_hoist?(node)
+      return false unless node.element?
+      HTML_HOIST_ELEMENTS.include? node.name
+    end
+
+    def should_delete?(node)
+      return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
+      return true if node.element? && node.name == 'head'
+      return true if node.text? && node.text.strip.blank?
+
+      false
+    end
+  end
+end