2014-08-26 15:31:47 -04:00
|
|
|
module Email
|
|
|
|
# HtmlCleaner cleans up the extremely dirty HTML that many email clients
|
|
|
|
# generate by stripping out any excess divs or spans, removing styling in
|
|
|
|
# the process (which also makes the html more suitable to be parsed as
|
|
|
|
# Markdown).
|
|
|
|
class HtmlCleaner
|
|
|
|
# Elements to hoist all children out of
|
|
|
|
HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
|
|
|
|
# Node types to always delete
|
2015-04-25 18:37:27 -04:00
|
|
|
HTML_DELETE_ELEMENT_TYPES = [
|
|
|
|
Nokogiri::XML::Node::DTD_NODE,
|
|
|
|
Nokogiri::XML::Node::COMMENT_NODE,
|
|
|
|
]
|
2014-08-26 15:31:47 -04:00
|
|
|
|
|
|
|
# Private variables:
|
|
|
|
# @doc - nokogiri document
|
|
|
|
# @out - same as @doc, but only if trimming has occured
|
|
|
|
def initialize(html)
|
|
|
|
if String === html
|
|
|
|
@doc = Nokogiri::HTML(html)
|
|
|
|
else
|
|
|
|
@doc = html
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
class << self
|
|
|
|
# Email::HtmlCleaner.trim(inp, opts={})
|
|
|
|
#
|
|
|
|
# Arguments:
|
|
|
|
# inp - Either a HTML string or a Nokogiri document.
|
|
|
|
# Options:
|
|
|
|
# :return => :doc, :string
|
|
|
|
# Specify the desired return type.
|
|
|
|
# Defaults to the type of the input.
|
|
|
|
# A value of :string is equivalent to calling get_document_text()
|
|
|
|
# on the returned document.
|
|
|
|
def trim(inp, opts={})
|
|
|
|
cleaner = HtmlCleaner.new(inp)
|
|
|
|
|
|
|
|
opts[:return] ||= ((String === inp) ? :string : :doc)
|
|
|
|
|
|
|
|
if opts[:return] == :string
|
|
|
|
cleaner.output_html
|
|
|
|
else
|
|
|
|
cleaner.output_document
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
# Email::HtmlCleaner.get_document_text(doc)
|
|
|
|
#
|
|
|
|
# Get the body portion of the document, including html, as a string.
|
|
|
|
def get_document_text(doc)
|
|
|
|
body = doc.xpath('//body')
|
|
|
|
if body
|
|
|
|
body.inner_html
|
|
|
|
else
|
|
|
|
doc.inner_html
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
def output_document
|
|
|
|
@out ||= begin
|
|
|
|
doc = @doc
|
|
|
|
trim_process_node doc
|
|
|
|
add_newlines doc
|
|
|
|
doc
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
def output_html
|
|
|
|
HtmlCleaner.get_document_text(output_document)
|
|
|
|
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
|
|
|
def add_newlines(doc)
|
2014-09-05 21:26:45 -04:00
|
|
|
# Replace <br> tags with a markdown \n
|
2014-08-26 15:31:47 -04:00
|
|
|
doc.xpath('//br').each do |br|
|
2014-10-08 10:15:46 -04:00
|
|
|
br.replace(new_linebreak_node doc, 2)
|
2014-08-26 15:31:47 -04:00
|
|
|
end
|
2014-09-05 21:26:45 -04:00
|
|
|
# Surround <p> tags with newlines, to help with line-wise postprocessing
|
|
|
|
# and ensure markdown paragraphs
|
|
|
|
doc.xpath('//p').each do |p|
|
|
|
|
p.before(new_linebreak_node doc)
|
|
|
|
p.after(new_linebreak_node doc, 2)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
def new_linebreak_node(doc, count=1)
|
|
|
|
Nokogiri::XML::Text.new("\n" * count, doc)
|
2014-08-26 15:31:47 -04:00
|
|
|
end
|
|
|
|
|
|
|
|
def trim_process_node(node)
|
|
|
|
if should_hoist?(node)
|
|
|
|
hoisted = trim_hoist_element node
|
|
|
|
hoisted.each { |child| trim_process_node child }
|
|
|
|
elsif should_delete?(node)
|
|
|
|
node.remove
|
|
|
|
else
|
|
|
|
if children = node.children
|
|
|
|
children.each { |child| trim_process_node child }
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
node
|
|
|
|
end
|
|
|
|
|
|
|
|
def trim_hoist_element(element)
|
|
|
|
hoisted = []
|
|
|
|
element.children.each do |child|
|
|
|
|
element.before(child)
|
|
|
|
hoisted << child
|
|
|
|
end
|
|
|
|
element.remove
|
|
|
|
hoisted
|
|
|
|
end
|
|
|
|
|
|
|
|
def should_hoist?(node)
|
|
|
|
return false unless node.element?
|
|
|
|
HTML_HOIST_ELEMENTS.include? node.name
|
|
|
|
end
|
|
|
|
|
|
|
|
def should_delete?(node)
|
|
|
|
return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
|
|
|
|
return true if node.element? && node.name == 'head'
|
|
|
|
return true if node.text? && node.text.strip.blank?
|
|
|
|
|
|
|
|
false
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|