FIX: Extract `div` tags within `span`s

This commit is contained in:
Robin Ward 2017-05-09 12:33:54 -04:00
parent c2829dce22
commit b57b635d30
2 changed files with 20 additions and 2 deletions

View File

@ -8,10 +8,22 @@ class HtmlToMarkdown
# Parses +html+ into a document and prepares it for conversion.
#
# html - the HTML source to parse.
# opts - an options hash; nil is treated as an empty hash.
def initialize(html, opts={})
@opts = opts || {}
# NOTE(review): this hunk appears to show both sides of the change — the
# assignment below looks like the pre-change line, superseded by the one after it.
@doc = Nokogiri::HTML(html)
# Parse the HTML, then hoist any `<div>` nested inside a `<span>`.
@doc = fix_span_elements(Nokogiri::HTML(html))
remove_whitespaces!
end
# A `<div>` inside a `<span>` is invalid HTML, so replace such a `<span>` with
# its own children, hoisting the `<div>` up to where the `<span>` was.
#
# node - the node to fix, together with all of its descendants.
#
# Returns the node that was passed in.
def fix_span_elements(node)
  # Take the child list before swapping: once the span has been replaced by
  # its children they are no longer reachable through the span, and a nested
  # `<span>` that still wraps a `<div>` would otherwise be left unfixed.
  children = node.children
  node.swap(children) if node.name == 'span' && node.at('div')
  children.each { |child| fix_span_elements(child) }
  node
end
def remove_whitespaces!
@doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text
@ -33,7 +45,7 @@ class HtmlToMarkdown
end
# Passes each direct child of +node+ to `visit`.
def traverse(node)
# NOTE(review): this hunk appears to show both sides of the change — the line
# below reuses `node` as its block parameter (shadowing the method argument)
# and looks like the pre-change form of the line after it.
node.children.each { |node| visit(node) }
node.children.each { |n| visit(n) }
end
def visit(node)
@ -197,6 +209,7 @@ class HtmlToMarkdown
end
def format_block
lines = @stack[-1].markdown.each_line.map do |line|
prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
@stack.each { |b| b.opened = true }

View File

@ -219,4 +219,9 @@ describe HtmlToMarkdown do
expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
end
# A `<div>` nested inside a `<span>` should come out as its own line of text.
it "handles divs within spans" do
html = "<div>1st paragraph<span><div>2nd paragraph</div></span></div>"
expect(html_to_markdown(html)).to eq("1st paragraph\n2nd paragraph")
end
end