Add Email::HtmlCleaner for email processing

This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries.
2014-08-26 12:31:47 -07:00 · 2014-08-26 12:31:47 -07:00 · cb55ef4702
parent 0d0225133c
commit cb55ef4702
1 changed files with 120 additions and 0 deletions
--- a/lib/email/html_cleaner.rb
+++ b/lib/email/html_cleaner.rb
@ -0,0 +1,120 @@
 module Email
  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
  # generate by stripping out any excess divs or spans, removing styling in
  # the process (which also makes the html more suitable to be parsed as
  # Markdown).
  class HtmlCleaner
    # Elements to hoist all children out of
    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
    # Node types to always delete
    HTML_DELETE_ELEMENT_TYPES = [Nokogiri::XML::Node::DTD_NODE,
                                 Nokogiri::XML::Node::COMMENT_NODE,
                                 ]
    # Private variables:
    #   @doc - nokogiri document
    #   @out - same as @doc, but only if trimming has occured
    def initialize(html)
      if String === html
        @doc = Nokogiri::HTML(html)
      else
        @doc = html
      end
    end
    class << self
      # Email::HtmlCleaner.trim(inp, opts={})
      #
      # Arguments:
      #   inp - Either a HTML string or a Nokogiri document.
      # Options:
      #   :return => :doc, :string
      #     Specify the desired return type.
      #     Defaults to the type of the input.
      #     A value of :string is equivalent to calling get_document_text()
      #     on the returned document.
      def trim(inp, opts={})
        cleaner = HtmlCleaner.new(inp)
        opts[:return] ||= ((String === inp) ? :string : :doc)
        if opts[:return] == :string
          cleaner.output_html
        else
          cleaner.output_document
        end
      end
      # Email::HtmlCleaner.get_document_text(doc)
      #
      # Get the body portion of the document, including html, as a string.
      def get_document_text(doc)
        body = doc.xpath('//body')
        if body
          body.inner_html
        else
          doc.inner_html
        end
      end
    end
    def output_document
      @out ||= begin
                 doc = @doc
                 trim_process_node doc
                 add_newlines doc
                 doc
      end
    end
    def output_html
      HtmlCleaner.get_document_text(output_document)
    end
    private
    def add_newlines(doc)
      doc.xpath('//br').each do |br|
        br.replace(Nokogiri::XML::Text.new("\n", doc))
      end
    end
    def trim_process_node(node)
      if should_hoist?(node)
        hoisted = trim_hoist_element node
        hoisted.each { |child| trim_process_node child }
      elsif should_delete?(node)
        node.remove
      else
        if children = node.children
          children.each { |child| trim_process_node child }
        end
      end
      node
    end
    def trim_hoist_element(element)
      hoisted = []
      element.children.each do |child|
        element.before(child)
        hoisted << child
      end
      element.remove
      hoisted
    end
    def should_hoist?(node)
      return false unless node.element?
      HTML_HOIST_ELEMENTS.include? node.name
    end
    def should_delete?(node)
      return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
      return true if node.element? && node.name == 'head'
      return true if node.text? && node.text.strip.blank?
      false
    end
  end
 end