HtmlToMarkdown library
Small library to transform HTML to Discourse-flavored markdown (mostly used for imports)
This commit is contained in:
parent
62966b1023
commit
d5630d6160
|
@ -0,0 +1,197 @@
|
|||
require "cgi"
require "nokogiri"
|
||||
|
||||
class HtmlToMarkdown
|
||||
|
||||
# Rendering context for one open block-level element.
#
# name     - tag name of the element ("root" for the document itself)
# head     - prefix used for a line while the block is not yet opened
# body     - prefix used for a line once the block is opened
# opened   - whether the block has already been marked as opened
# markdown - text accumulated for this block so far
class Block < Struct.new(:name, :head, :body, :opened, :markdown)
  def initialize(name, head = "", body = "", opened = false, markdown = "")
    super(name, head, body, opened, markdown)
  end
end
|
||||
|
||||
# Parses +html+ as an HTML fragment and normalizes the whitespace of its
# text nodes (see remove_whitespaces!).
#
# html - String containing the HTML to convert.
#
# The parsed document is no longer dumped to stdout: that was debugging
# output and polluted the output of every caller.
def initialize(html)
  @doc = Nokogiri::HTML.fragment(html)
  remove_whitespaces!
end
|
||||
|
||||
# Trims text nodes in place wherever they touch a block-level element:
#
# * leading whitespace is removed when the previous sibling element is a
#   block, or when there is no previous sibling element and the parent is
#   a block;
# * trailing whitespace is removed under the mirrored conditions for the
#   next sibling element.
#
# Text nodes left empty are removed from the document.
def remove_whitespaces!
  @doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)

    before = node.previous_element
    trim_left = before ? before.description&.block? : node.parent.description&.block?
    node.content = node.content.lstrip if trim_left

    after = node.next_element
    trim_right = after ? after.description&.block? : node.parent.description&.block?
    node.content = node.content.rstrip if trim_right

    node.remove if node.content.empty?
  end
end
|
||||
|
||||
# Renders the parsed document and returns the resulting markdown String.
#
# Rendering state (@stack, @markdown) is reset on every call. Runs of three
# or more newlines in the output are squeezed to a single blank line and
# surrounding whitespace is stripped.
def to_markdown
  @markdown = ""
  @stack = [Block.new("root")]

  traverse(@doc)
  @markdown << format_block # flush what is still pending in the top block

  squeezed = @markdown.gsub(/\n{3,}/, "\n\n")
  squeezed.strip
end
|
||||
|
||||
# Visits each direct child of +node+, in order.
#
# node - any object responding to #children.
#
# The block parameter is named +child+ so it no longer shadows the method's
# own +node+ parameter.
def traverse(node)
  node.children.each { |child| visit(child) }
end
|
||||
|
||||
# Dispatches +node+ to its visit_<tag name> method when one exists;
# otherwise simply walks the node's children.
#
# Before dispatching, when +node+ and its parent are both block-level
# elements and the current block already holds text, that text is flushed
# to the output and rendering continues in a copy of the current block
# that starts empty and is marked as opened.
def visit(node)
  nested_block = node.description&.block? && node.parent&.description&.block?

  if nested_block && !@stack.last.markdown.empty?
    continuation = @stack.last.dup # copied before format_block pops the original
    @markdown << format_block
    continuation.markdown = ""
    continuation.opened = true
    @stack.push(continuation)
  end

  handler = "visit_#{node.name}"
  if respond_to?(handler)
    send(handler, node)
  else
    traverse(node)
  end
end
|
||||
|
||||
# Tags that produce no output at all: their visit_<tag> method returns an
# empty string and never walks the node's children.
BLACKLISTED ||= %w{button datalist fieldset form input label legend meter optgroup option output progress select textarea style script}
BLACKLISTED.each do |tag|
  define_method("visit_#{tag}") { |_node| "" }
end
|
||||
|
||||
# Renders a <pre> element as a fenced code block.
#
# When the <pre> has a direct <code> child whose class attribute contains
# "lang-xxx" or "language-xxx", "xxx" becomes the language of the fence.
# Previously only the "lang-" prefix was recognised, so markup using the
# "language-" prefix lost its language.
def visit_pre(node)
  code = node.children.find { |child| child.name == "code" }
  lang = code && code["class"].to_s[/lang(?:uage)?-(\w+)/, 1]

  @stack << Block.new("pre")
  @markdown << "```#{lang}\n" # a nil lang interpolates as an empty string
  traverse(node)
  @markdown << format_block
  @markdown << "```\n"
end
|
||||
|
||||
# Renders a <blockquote>: its content is collected in a block whose head
# and body prefixes are both "> ", then flushed to the output.
def visit_blockquote(node)
  quote = Block.new("blockquote", "> ", "> ")
  @stack.push(quote)
  traverse(node)
  @markdown << format_block
end
|
||||
|
||||
# Block tags whose rendered content is followed by an extra newline.
BLOCK_WITH_NEWLINE ||= %w{div p}
BLOCK_WITH_NEWLINE.each do |tag|
  define_method("visit_#{tag}") do |node|
    @stack << Block.new(tag.dup) # each block gets its own name string
    traverse(node)
    @markdown << format_block
    @markdown << "\n"
  end
end
|
||||
|
||||
# List containers: each opens a block named after the tag (no prefixes of
# its own), renders its children and flushes the block to the output.
BLOCK_LIST ||= %w{menu ol ul}
BLOCK_LIST.each do |tag|
  define_method("visit_#{tag}") do |node|
    @stack << Block.new(tag.dup) # each block gets its own name string
    traverse(node)
    @markdown << format_block
  end
end
|
||||
|
||||
# Renders a list item.
#
# The closest enclosing list block on the stack (ul, ol or menu) decides
# the marker: "1. " inside an <ol>, "- " otherwise. An <li> that is not
# inside any list used to raise NoMethodError on nil; it is now rendered
# with the "- " marker.
def visit_li(node)
  list = @stack.reverse_each.find { |block| block.name[/ul|ol|menu/] }
  prefix = list&.name == "ol" ? "1. " : "- "

  @stack << Block.new("li", prefix, " ")
  traverse(node)
  @markdown << format_block
end
|
||||
|
||||
# Headings: visit_h1 .. visit_h6 render their content in a block whose head
# prefix is n "#" characters followed by a space.
(1..6).each do |n|
  define_method("visit_h#{n}") do |node|
    @stack << Block.new("h#{n}", "#" * n + " ")
    traverse(node)
    @markdown << format_block
  end
end
|
||||
|
||||
# Tags kept as HTML in the output: the content is rendered between a bare
# opening tag and a closing tag (attributes are not copied).
WHITELISTED ||= %w{del ins kbd s small strike sub sup table tbody td tfoot th thead tr}
WHITELISTED.each do |tag|
  define_method("visit_#{tag}") do |node|
    @stack[-1].markdown << "<#{tag}>"
    traverse(node)
    # the top of the stack is looked up again: traversal may have replaced it
    @stack[-1].markdown << "</#{tag}>"
  end
end
|
||||
|
||||
# Renders <abbr> as HTML, keeping its title attribute when it is not blank.
#
# The title is HTML-escaped before being written into the attribute, so a
# title containing quotes or angle brackets can no longer break out of it.
# The blank check uses plain Ruby instead of ActiveSupport's #present?,
# which this file never required.
def visit_abbr(node)
  title = node["title"].to_s
  opening = title.strip.empty? ? "<abbr>" : %Q[<abbr title="#{CGI.escapeHTML(title)}">]

  @stack[-1].markdown << opening
  traverse(node)
  @stack[-1].markdown << "</abbr>"
end
|
||||
|
||||
# Renders <img> as a markdown image.
#
# The description is the first non-blank value among the alt and title
# attributes (empty when both are blank). An image without a usable src is
# skipped instead of producing a broken "![..]()".
#
# Blank checks are plain Ruby rather than ActiveSupport's #presence, which
# this file never required.
def visit_img(node)
  src = node["src"]
  return if src.nil? || src.strip.empty?

  title = [node["alt"], node["title"]].find { |text| text && !text.strip.empty? }
  @stack[-1].markdown << "![#{title}](#{src})"
end
|
||||
|
||||
# Renders <a> as a markdown link.
#
# An anchor without a usable href (for instance a named anchor) used to
# produce a broken "[text]()"; now only its content is rendered.
def visit_a(node)
  href = node["href"]
  return traverse(node) if href.nil? || href.strip.empty?

  @stack[-1].markdown << "["
  traverse(node)
  @stack[-1].markdown << "](#{href})"
end
|
||||
|
||||
# Renders the node's content as inline code, wrapped in single backticks.
def visit_tt(node)
  tick = "`"
  @stack.last.markdown << tick
  traverse(node)
  # the top of the stack is looked up again: traversal may have replaced it
  @stack.last.markdown << tick
end
|
||||
|
||||
# Renders <code>: when a block whose name contains "pre" is on the stack
# only the content is rendered; otherwise the element is treated like <tt>.
def visit_code(node)
  inside_pre = @stack.reverse.find { |block| block.name["pre"] }

  if inside_pre
    traverse(node)
  else
    visit_tt(node)
  end
end
|
||||
|
||||
# Renders <br> as a newline appended to the current block.
def visit_br(node)
  @stack.last.markdown.concat("\n")
end
|
||||
|
||||
# Renders <hr> as a "---" rule with a blank line on either side, appended
# to the current block.
def visit_hr(node)
  @stack.last.markdown.concat("\n\n---\n\n")
end
|
||||
|
||||
# Renders bold text. The content is wrapped in "**", or in "__" when the
# node's text itself contains an asterisk.
def visit_strong(node)
  marker = node.text.include?("*") ? "__" : "**"
  @stack.last.markdown << marker
  traverse(node)
  # the top of the stack is looked up again: traversal may have replaced it
  @stack.last.markdown << marker
end

alias :visit_b :visit_strong
|
||||
|
||||
# Renders emphasized text. The content is wrapped in "*", or in "_" when
# the node's text itself contains an asterisk.
def visit_em(node)
  marker = node.text.include?("*") ? "_" : "*"
  @stack.last.markdown << marker
  traverse(node)
  # the top of the stack is looked up again: traversal may have replaced it
  @stack.last.markdown << marker
end

alias :visit_i :visit_em
|
||||
|
||||
# Appends the node's text to the current block.
#
# Runs of two or more whitespace characters are collapsed to one space,
# except while a "pre" block is on the stack: there the text is copied
# verbatim, because collapsing would destroy the line breaks and the
# indentation of code blocks.
def visit_text(node)
  text = node.text
  text = text.gsub(/\s{2,}/, " ") unless @stack.any? { |block| block.name == "pre" }
  @stack[-1].markdown << text
end
|
||||
|
||||
# Renders the block on top of the stack, pops it and returns the text.
#
# Every line of the block is right-stripped and prefixed with the
# concatenated prefixes of all blocks on the stack: a block contributes its
# head while it is not opened and its body afterwards. All blocks on the
# stack are marked as opened as soon as one line has been rendered.
# Each rendered line ends with a newline; an empty block renders as "".
def format_block
  rendered = @stack.last.markdown.each_line.map do |raw|
    indent = @stack.map { |blk| blk.opened ? blk.body : blk.head }.join
    @stack.each { |blk| blk.opened = true }
    "#{indent}#{raw.rstrip}"
  end

  @stack.pop
  rendered.push("").join("\n")
end
|
||||
|
||||
end
|
|
@ -0,0 +1,169 @@
|
|||
require 'rails_helper'
|
||||
require 'html_to_markdown'
|
||||
|
||||
describe HtmlToMarkdown do

  # Converts +html+ with a fresh HtmlToMarkdown instance.
  def html_to_markdown(html)
    HtmlToMarkdown.new(html).to_markdown
  end

  # Defines one example per table entry. Each entry maps the example's
  # description to a list of [html, expected markdown] pairs.
  define_examples = lambda do |table|
    table.each do |description, cases|
      it description do
        cases.each do |html, markdown|
          expect(html_to_markdown(html)).to eq(markdown)
        end
      end
    end
  end

  define_examples.call(
    "converts <strong>" => [
      ["<strong>Strong</strong>", "**Strong**"],
      ["<strong>Str*ng</strong>", "__Str*ng__"],
    ],
    "converts <b>" => [
      ["<b>Bold</b>", "**Bold**"],
      ["<b>B*ld</b>", "__B*ld__"],
    ],
    "converts <em>" => [
      ["<em>Emphasis</em>", "*Emphasis*"],
      ["<em>Emph*sis</em>", "_Emph*sis_"],
    ],
    "converts <i>" => [
      ["<i>Italic</i>", "*Italic*"],
      ["<i>It*lic</i>", "_It*lic_"],
    ],
    "converts <a>" => [
      [%Q{<a href="https://www.discourse.org">Discourse</a>}, "[Discourse](https://www.discourse.org)"],
    ],
    "converts <img>" => [
      [%Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}, "![Discourse Logo](https://www.discourse.org/logo.svg)"],
    ],
  )

  (1..6).each do |n|
    it "converts <h#{n}>" do
      expect(html_to_markdown("<h#{n}>Header #{n}</h#{n}>")).to eq("#" * n + " Header #{n}")
    end
  end

  define_examples.call(
    "converts <br>" => [
      ["Before<br>Inside<br>After", "Before\nInside\nAfter"],
    ],
    "converts <hr>" => [
      ["Before<hr>Inside<hr>After", "Before\n\n---\n\nInside\n\n---\n\nAfter"],
    ],
    "converts <tt>" => [
      ["<tt>Teletype</tt>", "`Teletype`"],
    ],
    "converts <code>" => [
      ["<code>Code</code>", "`Code`"],
    ],
    "supports <ins>" => [
      ["This is an <ins>insertion</ins>", "This is an <ins>insertion</ins>"],
    ],
    "supports <del>" => [
      ["This is a <del>deletion</del>", "This is a <del>deletion</del>"],
    ],
    "supports <sub>" => [
      ["H<sub>2</sub>O", "H<sub>2</sub>O"],
    ],
    "supports <sup>" => [
      ["<sup>Super Script!</sup>", "<sup>Super Script!</sup>"],
    ],
    "supports <small>" => [
      ["<small>Small</small>", "<small>Small</small>"],
    ],
    "supports <kbd>" => [
      ["<kbd>CTRL</kbd>+<kbd>C</kbd>", "<kbd>CTRL</kbd>+<kbd>C</kbd>"],
    ],
    "supports <abbr>" => [
      [%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>}, %Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>}],
    ],
    "supports <s>" => [
      ["<s>Strike Through</s>", "<s>Strike Through</s>"],
    ],
    "supports <strike>" => [
      ["<strike>Strike Through</strike>", "<strike>Strike Through</strike>"],
    ],
    "supports <blockquote>" => [
      ["<blockquote>Quote</blockquote>", "> Quote"],
    ],
    "supports <ul>" => [
      ["<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>", "- 🍏\n- 🍐\n- 🍌"],
      ["<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>", "- 🍏\n- 🍐\n- 🍌"],
    ],
    "supports <ol>" => [
      ["<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>", "1. 🍆\n1. 🍅\n1. 🍄"],
    ],
    "supports <p> inside <li>" => [
      ["<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>", "- 🍏\n\n- 🍐\n\n- 🍌"],
    ],
  )

  it "supports <ul> inside <ul>" do
    nested_lists = <<-HTML
<ul>
<li>Fruits
<ul>
<li>🍏</li>
<li>🍐</li>
<li>🍌</li>
</ul>
</li>
<li>Vegetables
<ul>
<li>🍆</li>
<li>🍅</li>
<li>🍄</li>
</ul>
</li>
</ul>
    HTML

    expect(html_to_markdown(nested_lists)).to eq("- Fruits\n - 🍏\n - 🍐\n - 🍌\n- Vegetables\n - 🍆\n - 🍅\n - 🍄")
  end

  define_examples.call(
    "supports <pre>" => [
      ["<pre>var foo = 'bar';</pre>", "```\nvar foo = 'bar';\n```"],
      ["<pre><code>var foo = 'bar';</code></pre>", "```\nvar foo = 'bar';\n```"],
      [%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>}, "```javascript\nvar foo = 'bar';\n```"],
    ],
    "works" => [
      ["<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>", "- A list item with a blockquote:\n\n > This is a **blockquote**\n > inside a list item."],
    ],
    "handles <p>" => [
      ["<p>1st paragraph</p><p>2nd paragraph</p>", "1st paragraph\n\n2nd paragraph"],
    ],
    "handles <div>" => [
      ["<div>1st div</div><div>2nd div</div>", "1st div\n\n2nd div"],
    ],
    "swallows <span>" => [
      ["<span>Span</span>", "Span"],
    ],
    "swallows <u>" => [
      ["<u>Underline</u>", "Underline"],
    ],
    "removes <script>" => [
      ["<script>var foo = 'bar'</script>", ""],
    ],
    "removes <style>" => [
      ["<style>* { margin: 0 }</style>", ""],
    ],
  )

end
|
Loading…
Reference in New Issue