FIX: Extract `div` tags within `span`s
This commit is contained in:
parent
c2829dce22
commit
b57b635d30
|
@ -8,10 +8,22 @@ class HtmlToMarkdown
|
|||
|
||||
# Parses the given HTML and prepares the document for conversion.
#
# @param html the HTML input handed to `Nokogiri::HTML`
# @param opts [Hash, nil] conversion options; `nil` is treated as an empty hash
def initialize(html, opts = {})
  @opts = opts || {}
  # Parse exactly once; the parsed document is passed straight through
  # `fix_span_elements` so `<div>`s nested in `<span>`s are hoisted before
  # whitespace cleanup runs.
  @doc = fix_span_elements(Nokogiri::HTML(html))

  remove_whitespaces!
end
|
||||
|
||||
|
||||
# A `<div>` inside a `<span>` is invalid HTML, so replace any such `<span>`
# with its own children, hoisting the `<div>` up one level.
#
# The walk is recursive and covers the whole subtree below +node+.
#
# @param node a Nokogiri node (or document) to normalise in place
# @return the same +node+ that was passed in
def fix_span_elements(node)
  # Snapshot the children *before* swapping: `swap` re-parents them, after
  # which `node.children` is empty and the hoisted subtrees (which may hold
  # further `<span><div>` nesting) would otherwise never be visited.
  children = node.children

  node.swap(children) if node.name == 'span' && node.at('div')

  children.each { |child| fix_span_elements(child) }
  node
end
|
||||
|
||||
def remove_whitespaces!
|
||||
@doc.traverse do |node|
|
||||
if node.is_a? Nokogiri::XML::Text
|
||||
|
@ -33,7 +45,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
# Passes each direct child of +node+ to `visit`, once, in the order
# returned by `children`.
#
# @param node an object responding to `children`
def traverse(node)
  # The block parameter is deliberately not called `node`, so it does not
  # shadow the method argument.
  node.children.each { |child| visit(child) }
end
|
||||
|
||||
def visit(node)
|
||||
|
@ -197,6 +209,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
def format_block
|
||||
|
||||
lines = @stack[-1].markdown.each_line.map do |line|
|
||||
prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
|
||||
@stack.each { |b| b.opened = true }
|
||||
|
|
|
@ -219,4 +219,9 @@ describe HtmlToMarkdown do
|
|||
expect(html_to_markdown("<style>* { margin: 0 }</style>")).to eq("")
|
||||
end
|
||||
|
||||
# A `<div>` nested inside a `<span>` should still come out as its own line.
it "handles divs within spans" do
  expect(
    html_to_markdown("<div>1st paragraph<span><div>2nd paragraph</div></span></div>")
  ).to eq("1st paragraph\n2nd paragraph")
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue