FEATURE: Use basic HTML table if it can't be converted to Markdown
Previously `HtmlToMarkdown` always converted HTML tables into Markdown tables. That lead to some badly formatted Markdown tables, e.g. when the table contained `rowspan` or `colspan`. This solves the issue by using very basic HTML tables in those cases.
This commit is contained in:
parent
b8f2cbf41c
commit
52e81582b4
|
@ -5,6 +5,7 @@ require "securerandom"
|
|||
class HtmlToMarkdown
|
||||
def initialize(html, opts = {})
|
||||
@opts = opts
|
||||
@within_html_block = false
|
||||
|
||||
# we're only interested in <body>
|
||||
@doc = Nokogiri.HTML5(html).at("body")
|
||||
|
@ -139,8 +140,16 @@ class HtmlToMarkdown
|
|||
end
|
||||
end
|
||||
|
||||
def traverse(node)
|
||||
node.children.map { |n| visit(n) }.join
|
||||
def traverse(node, within_html_block: false)
|
||||
within_html_block_changed = false
|
||||
if within_html_block
|
||||
within_html_block_changed = true
|
||||
@within_html_block = true
|
||||
end
|
||||
|
||||
text = node.children.map { |n| visit(n) }.join
|
||||
@within_html_block = false if within_html_block_changed
|
||||
text
|
||||
end
|
||||
|
||||
def visit(node)
|
||||
|
@ -251,11 +260,8 @@ class HtmlToMarkdown
|
|||
|
||||
(1..6).each { |n| define_method("visit_h#{n}") { |node| "#{"#" * n} #{traverse(node)}" } }
|
||||
|
||||
CELLS ||= %w[th td]
|
||||
CELLS.each { |tag| define_method("visit_#{tag}") { |node| "#{traverse(node)} " } }
|
||||
|
||||
def visit_table(node)
|
||||
if rows = extract_rows(node)
|
||||
if (rows = extract_rows(node))
|
||||
headers = rows[0].css("td, th")
|
||||
text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
|
||||
text << "| " + (["-"] * headers.size).join(" | ") + " |\n"
|
||||
|
@ -265,7 +271,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
"\n\n#{text}\n\n"
|
||||
else
|
||||
traverse(node)
|
||||
"<table>\n#{traverse(node, within_html_block: true)}</table>"
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -277,6 +283,27 @@ class HtmlToMarkdown
|
|||
rows
|
||||
end
|
||||
|
||||
def visit_tr(node)
|
||||
text = traverse(node)
|
||||
@within_html_block ? "<tr>\n#{text}</tr>\n" : text
|
||||
end
|
||||
|
||||
TABLE_CELLS ||= %w[th td]
|
||||
TABLE_CELLS.each do |tag|
|
||||
define_method("visit_#{tag}") do |node|
|
||||
text = traverse(node)
|
||||
if @within_html_block
|
||||
element = create_element(tag, "\n\n#{text}\n\n")
|
||||
node.attribute_nodes.each do |a|
|
||||
element[a.name] = a.value if %w[rowspan colspan].include?(a.name)
|
||||
end
|
||||
"#{element.to_html}\n"
|
||||
else
|
||||
text
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
LISTS ||= %w[ul ol]
|
||||
LISTS.each do |tag|
|
||||
define_method("visit_#{tag}") do |node|
|
||||
|
@ -352,7 +379,11 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
def visit_text(node)
|
||||
node.text
|
||||
if @within_html_block
|
||||
node.to_html
|
||||
else
|
||||
node.text
|
||||
end
|
||||
end
|
||||
|
||||
HTML5_BLOCK_ELEMENTS ||= %w[
|
||||
|
@ -372,4 +403,14 @@ class HtmlToMarkdown
|
|||
return false if !node
|
||||
node.description&.block? || HTML5_BLOCK_ELEMENTS.include?(node.name)
|
||||
end
|
||||
|
||||
def fragment_document
|
||||
@fragment_document ||= Nokogiri::HTML5::DocumentFragment.parse("").document
|
||||
end
|
||||
|
||||
def create_element(tag, inner_html = nil, attributes = {})
|
||||
element = fragment_document.create_element(tag, nil, attributes)
|
||||
element.inner_html = inner_html if inner_html
|
||||
element
|
||||
end
|
||||
end
|
||||
|
|
|
@ -544,7 +544,7 @@ RSpec.describe HtmlToMarkdown do
|
|||
)
|
||||
end
|
||||
|
||||
it "doesn't swallow badly formatted <table>" do
|
||||
it "keeps HTML for badly formatted <table>" do
|
||||
html = <<~HTML
|
||||
<table>
|
||||
<tr>
|
||||
|
@ -554,13 +554,153 @@ RSpec.describe HtmlToMarkdown do
|
|||
<th>4</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>One</td>
|
||||
<td>Two</td>
|
||||
<td>Three</td>
|
||||
<td><One></td>
|
||||
<td><strong>Two</strong></td>
|
||||
<td>Three<script>alert("foo")</script></td>
|
||||
</tr>
|
||||
</table>
|
||||
HTML
|
||||
|
||||
expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
|
||||
markdown = <<~MD
|
||||
<table>
|
||||
<tr>
|
||||
<th>
|
||||
|
||||
1
|
||||
|
||||
</th>
|
||||
<th>
|
||||
|
||||
2
|
||||
|
||||
</th>
|
||||
<th>
|
||||
|
||||
3
|
||||
|
||||
</th>
|
||||
<th>
|
||||
|
||||
4
|
||||
|
||||
</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
<One>
|
||||
|
||||
</td>
|
||||
<td>
|
||||
|
||||
**Two**
|
||||
|
||||
</td>
|
||||
<td>
|
||||
|
||||
Three
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
MD
|
||||
|
||||
expect(html_to_markdown(html)).to eq(markdown.strip)
|
||||
end
|
||||
|
||||
it "keeps HTML for <table> with colspan" do
|
||||
html = <<~HTML
|
||||
<table>
|
||||
<tr>
|
||||
<th>1</th>
|
||||
<th>2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2">One / Two</td>
|
||||
</tr>
|
||||
</table>
|
||||
HTML
|
||||
|
||||
markdown = <<~MD
|
||||
<table>
|
||||
<tr>
|
||||
<th>
|
||||
|
||||
1
|
||||
|
||||
</th>
|
||||
<th>
|
||||
|
||||
2
|
||||
|
||||
</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
|
||||
One / Two
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
MD
|
||||
|
||||
expect(html_to_markdown(html)).to eq(markdown.strip)
|
||||
end
|
||||
|
||||
it "keeps HTML for <table> with rowspan" do
|
||||
html = <<~HTML
|
||||
<table>
|
||||
<tr>
|
||||
<th>1</th>
|
||||
<th>2</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td rowspan="2">B</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>C</td>
|
||||
</tr>
|
||||
</table>
|
||||
HTML
|
||||
|
||||
markdown = <<~MD
|
||||
<table>
|
||||
<tr>
|
||||
<th>
|
||||
|
||||
1
|
||||
|
||||
</th>
|
||||
<th>
|
||||
|
||||
2
|
||||
|
||||
</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
A
|
||||
|
||||
</td>
|
||||
<td rowspan="2">
|
||||
|
||||
B
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
C
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
MD
|
||||
|
||||
expect(html_to_markdown(html)).to eq(markdown.strip)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue