discourse/lib/email/html_cleaner.rb

module Email
  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
  # generate by stripping out any excess divs or spans, removing styling in
  # the process (which also makes the html more suitable to be parsed as
  # Markdown).
  class HtmlCleaner
    # Elements to hoist all children out of
    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td).freeze
    # Node types to always delete
    HTML_DELETE_ELEMENT_TYPES = [
      Nokogiri::XML::Node::DTD_NODE,
      Nokogiri::XML::Node::COMMENT_NODE,
    ].freeze

    # Private variables:
    #   @doc - nokogiri document
    #   @out - same as @doc, but only if trimming has occurred
    #
    # Arguments:
    #   html - Either a HTML string (parsed with Nokogiri::HTML) or an
    #          already-parsed document, which is stored as-is.
    def initialize(html)
      @doc = html.is_a?(String) ? Nokogiri::HTML(html) : html
    end

    class << self
      # Email::HtmlCleaner.trim(inp, opts={})
      #
      # Arguments:
      #   inp - Either a HTML string or a Nokogiri document.
      # Options:
      #   :return => :doc, :string
      #     Specify the desired return type.
      #     Defaults to the type of the input.
      #     A value of :string is equivalent to calling get_document_text()
      #     on the returned document.
      #
      # The opts hash is only read; the resolved default is never written
      # back into it.
      def trim(inp, opts={})
        cleaner = HtmlCleaner.new(inp)

        # Resolve the return type into a local so the caller's hash is left
        # untouched (and a frozen hash can be passed safely).
        return_type = opts[:return] || (inp.is_a?(String) ? :string : :doc)

        if return_type == :string
          cleaner.output_html
        else
          cleaner.output_document
        end
      end

      # Email::HtmlCleaner.get_document_text(doc)
      #
      # Get the body portion of the document, including html, as a string.
      # When the document contains no <body> element, the inner html of the
      # whole document is returned instead.
      def get_document_text(doc)
        body = doc.xpath('//body')
        # xpath returns a NodeSet, which is truthy even when it is empty, so
        # emptiness has to be tested explicitly for the fallback to be
        # reachable.
        if body.empty?
          doc.inner_html
        else
          body.inner_html
        end
      end
    end

    # Trim the document and return it. The work is done once and memoized in
    # @out; note that @doc itself is modified in place, so @out and @doc are
    # the same object afterwards.
    def output_document
      @out ||= begin
        trim_process_node(@doc)
        add_newlines(@doc)
        @doc
      end
    end

    # Trim the document and return its body text (see get_document_text).
    def output_html
      HtmlCleaner.get_document_text(output_document)
    end

    private

    # Insert newline text nodes so that the resulting text keeps line and
    # paragraph boundaries.
    def add_newlines(doc)
      # Replace <br> tags with a markdown \n
      doc.xpath('//br').each do |br|
        br.replace(new_linebreak_node(doc, 2))
      end
      # Surround <p> tags with newlines, to help with line-wise postprocessing
      # and ensure markdown paragraphs
      doc.xpath('//p').each do |paragraph|
        paragraph.before(new_linebreak_node(doc))
        paragraph.after(new_linebreak_node(doc, 2))
      end
    end

    # Build a text node belonging to doc that holds `count` newline
    # characters.
    def new_linebreak_node(doc, count=1)
      Nokogiri::XML::Text.new("\n" * count, doc)
    end

    # Recursively clean `node`:
    #   * hoistable elements are replaced by their children, which are then
    #     processed in turn;
    #   * deletable nodes are removed together with their subtree;
    #   * anything else is kept and its children are processed.
    #
    # Returns the node that was passed in.
    def trim_process_node(node)
      if should_hoist?(node)
        trim_hoist_element(node).each { |child| trim_process_node(child) }
      elsif should_delete?(node)
        node.remove
      else
        children = node.children
        children.each { |child| trim_process_node(child) } if children
      end

      node
    end

    # Move every child of `element` to just before it (preserving order),
    # remove the now-empty element, and return the moved children as an Array.
    def trim_hoist_element(element)
      hoisted = []
      element.children.each do |child|
        element.before(child)
        hoisted << child
      end
      element.remove
      hoisted
    end

    # True when `node` is an element whose name is in HTML_HOIST_ELEMENTS.
    def should_hoist?(node)
      node.element? && HTML_HOIST_ELEMENTS.include?(node.name)
    end

    # True when `node` should be dropped entirely: DTD and comment nodes,
    # the <head> element, and text nodes that contain only whitespace.
    def should_delete?(node)
      return true if HTML_DELETE_ELEMENT_TYPES.include?(node.type)
      return true if node.element? && node.name == 'head'
      # NOTE(review): String#blank? is not core Ruby - presumably provided by
      # ActiveSupport in the host application; confirm it is loaded.
      return true if node.text? && node.text.strip.blank?

      false
    end
  end
end
Add Email::HtmlCleaner for email processing This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries. 2014-08-26 15:31:47 -04:00			`module Email`
			`# HtmlCleaner cleans up the extremely dirty HTML that many email clients`
			`# generate by stripping out any excess divs or spans, removing styling in`
			`# the process (which also makes the html more suitable to be parsed as`
			`# Markdown).`
			`class HtmlCleaner`
			`# Elements to hoist all children out of`
			`HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)`
			`# Node types to always delete`
FIX: remove links in poll email notification 2015-04-25 18:37:27 -04:00			`HTML_DELETE_ELEMENT_TYPES = [`
			`Nokogiri::XML::Node::DTD_NODE,`
			`Nokogiri::XML::Node::COMMENT_NODE,`
			`]`
Add Email::HtmlCleaner for email processing This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries. 2014-08-26 15:31:47 -04:00
			`# Private variables:`
			`# @doc - nokogiri document`
			`# @out - same as @doc, but only if trimming has occurred`
			`def initialize(html)`
			`if String === html`
			`@doc = Nokogiri::HTML(html)`
			`else`
			`@doc = html`
			`end`
			`end`

			`class << self`
			`# Email::HtmlCleaner.trim(inp, opts={})`
			`#`
			`# Arguments:`
			`# inp - Either a HTML string or a Nokogiri document.`
			`# Options:`
			`# :return => :doc, :string`
			`# Specify the desired return type.`
			`# Defaults to the type of the input.`
			`# A value of :string is equivalent to calling get_document_text()`
			`# on the returned document.`
			`def trim(inp, opts={})`
			`cleaner = HtmlCleaner.new(inp)`

			`opts[:return] \|\|= ((String === inp) ? :string : :doc)`

			`if opts[:return] == :string`
			`cleaner.output_html`
			`else`
			`cleaner.output_document`
			`end`
			`end`

			`# Email::HtmlCleaner.get_document_text(doc)`
			`#`
			`# Get the body portion of the document, including html, as a string.`
			`def get_document_text(doc)`
			`body = doc.xpath('//body')`
			`if body`
			`body.inner_html`
			`else`
			`doc.inner_html`
			`end`
			`end`
			`end`

			`def output_document`
			`@out \|\|= begin`
			`doc = @doc`
			`trim_process_node doc`
			`add_newlines doc`
			`doc`
			`end`
			`end`

			`def output_html`
			`HtmlCleaner.get_document_text(output_document)`
			`end`

			`private`

			`def add_newlines(doc)`
FIX: Add "On day, name wrote:" quote trigger for emails 2014-09-05 21:26:45 -04:00			`# Replace <br> tags with a markdown \n`
Add Email::HtmlCleaner for email processing This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries. 2014-08-26 15:31:47 -04:00			`doc.xpath('//br').each do \|br\|`
FIX: handle multiple paragraphs in email reply when parsing html 2014-10-08 10:15:46 -04:00			`br.replace(new_linebreak_node doc, 2)`
Add Email::HtmlCleaner for email processing This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries. 2014-08-26 15:31:47 -04:00			`end`
FIX: Add "On day, name wrote:" quote trigger for emails 2014-09-05 21:26:45 -04:00			`# Surround <p> tags with newlines, to help with line-wise postprocessing`
			`# and ensure markdown paragraphs`
			`doc.xpath('//p').each do \|p\|`
			`p.before(new_linebreak_node doc)`
			`p.after(new_linebreak_node doc, 2)`
			`end`
			`end`

			`def new_linebreak_node(doc, count=1)`
			`Nokogiri::XML::Text.new("\n" * count, doc)`
Add Email::HtmlCleaner for email processing This class is in charge of stripping out most of the crap from the HTML portion of emails that email clients generate, so that it can be sanely post-processed for signatures and quoting boundaries. 2014-08-26 15:31:47 -04:00			`end`

			`def trim_process_node(node)`
			`if should_hoist?(node)`
			`hoisted = trim_hoist_element node`
			`hoisted.each { \|child\| trim_process_node child }`
			`elsif should_delete?(node)`
			`node.remove`
			`else`
			`if children = node.children`
			`children.each { \|child\| trim_process_node child }`
			`end`
			`end`

			`node`
			`end`

			`def trim_hoist_element(element)`
			`hoisted = []`
			`element.children.each do \|child\|`
			`element.before(child)`
			`hoisted << child`
			`end`
			`element.remove`
			`hoisted`
			`end`

			`def should_hoist?(node)`
			`return false unless node.element?`
			`HTML_HOIST_ELEMENTS.include? node.name`
			`end`

			`def should_delete?(node)`
			`return true if HTML_DELETE_ELEMENT_TYPES.include? node.type`
			`return true if node.element? && node.name == 'head'`
			`return true if node.text? && node.text.strip.blank?`

			`false`
			`end`
			`end`
			`end`