2017-04-24 16:01:41 -04:00
|
|
|
require "nokogiri"
|
|
|
|
|
|
|
|
# Converts an HTML fragment/document into Markdown.
#
# The HTML is parsed with Nokogiri, lightly normalised (see
# #fix_span_elements and #remove_whitespaces!) and then walked node by node.
# Every node named "foo" is handled by a public `visit_foo` method when one
# exists; otherwise its children are simply traversed.
#
# Supported options (read from the `opts` hash):
#   :keep_img_tags - emit the original `<img>` HTML instead of `![alt](src)`
#   :keep_cid_imgs - also accept image sources starting with "cid:"
class HtmlToMarkdown

  # One level of the rendering stack.
  #
  #   name     - name of the element that opened the block ("root", "li", ...)
  #   head     - prefix used for the first line rendered while the block is
  #              not yet opened
  #   body     - prefix used for every line rendered once the block is opened
  #   opened   - whether a line has already been rendered for this block
  #   markdown - text accumulated for the block so far
  class Block < Struct.new(:name, :head, :body, :opened, :markdown)
    def initialize(name, head = "", body = "", opened = false, markdown = "")
      # Bare `super` forwards all five (possibly defaulted) arguments.
      super
    end
  end

  # html - String of HTML to convert.
  # opts - Hash of options (see the class comment); nil is treated as {}.
  def initialize(html, opts = {})
    @opts = opts || {}
    @doc = fix_span_elements(Nokogiri::HTML(html))
    remove_whitespaces!
  end

  # If a `<div>` (or `<p>`) is within a `<span>` (or `<font>`) that's invalid,
  # so let's hoist the block element up by replacing the inline wrapper with
  # its own children.
  INLINE_ELEMENTS ||= %w{span font}
  BLOCK_ELEMENTS ||= %w{div p}

  # Recursively replaces inline wrappers that contain a block element with
  # their children. Returns the node it was given.
  #
  # NOTE(review): the recursion below iterates `node.children` after the swap
  # has happened - confirm that nested inline wrappers are still reached.
  def fix_span_elements(node)
    if INLINE_ELEMENTS.include?(node.name) && BLOCK_ELEMENTS.any? { |e| node.at(e) }
      node.swap(node.children)
    end

    node.children.each { |c| fix_span_elements(c) }
    node
  end

  # Normalises every text node of the document:
  #   * leading whitespace is removed when the text follows a block element,
  #     or is the first element-level content of a block parent;
  #   * trailing whitespace is removed when the text precedes a block element,
  #     or is the last element-level content of a block parent;
  #   * "\r\n" and lone "\r" become "\n";
  #   * text nodes left empty are removed from the document.
  def remove_whitespaces!
    @doc.traverse do |node|
      next unless node.is_a?(Nokogiri::XML::Text)

      before = node.previous_element
      after = node.next_element

      # The parent is only consulted when there is no sibling element on
      # that side, exactly like the original short-circuit conditions.
      strip_leading = before ? before.description&.block? : node.parent.description&.block?
      strip_trailing = after ? after.description&.block? : node.parent.description&.block?

      text = node.content
      text = text.sub(/\A[[:space:]]+/, "") if strip_leading
      text = text.sub(/[[:space:]]+\z/, "") if strip_trailing

      node.content = text.gsub(/\r\n?/, "\n")
      node.remove if node.content.empty?
    end
  end

  # Renders the document and returns the Markdown as a String, with runs of
  # three or more newlines collapsed to two and surrounding whitespace
  # stripped.
  def to_markdown
    @stack = [Block.new("root")]
    @markdown = ""
    traverse(@doc)
    # Flush whatever is still pending in the root block.
    @markdown << format_block
    @markdown.gsub(/\n{3,}/, "\n\n").strip
  end

  # Visits every child of `node`, in order.
  def traverse(node)
    node.children.each { |n| visit(n) }
  end

  # Dispatches `node` to its `visit_<name>` method, or traverses its children
  # when no such public method exists. Nodes whose inline style contains
  # `display: none` are skipped entirely.
  def visit(node)
    return if node["style"] && node["style"][/display\s*:\s*none/]

    # A block element nested directly inside another block element: flush the
    # text accumulated so far and carry on in a fresh copy of the current
    # block, already marked as opened so that its `body` prefix is used.
    if node.description&.block? && node.parent&.description&.block? && @stack[-1].markdown.size > 0
      block = @stack[-1].dup
      @markdown << format_block # pops the original block off the stack
      block.markdown = ""
      block.opened = true
      @stack << block
    end

    visitor = "visit_#{node.name}"
    respond_to?(visitor) ? send(visitor, node) : traverse(node)
  end

  # Elements that are dropped together with all of their content.
  BLACKLISTED ||= %w{button datalist fieldset form input label legend meter optgroup option output progress select textarea style script}
  BLACKLISTED.each do |tag|
    define_method("visit_#{tag}") { |_node| "" }
  end

  # Renders `<pre>` as a fenced code block. The language is taken from a
  # `lang-xxx` class on a direct `<code>` child, when there is one.
  def visit_pre(node)
    code = node.children.find { |c| c.name == "code" }
    code_class = code ? code["class"] : ""
    lang = code_class ? code_class[/lang-(\w+)/, 1] : ""

    pre = Block.new("pre")
    pre.markdown = "```#{lang}\n"
    @stack << pre
    traverse(node)
    pre.markdown << "\n```\n"
    @markdown << format_block
  end

  # Renders `<blockquote>`; every line of its content is prefixed with "> ".
  def visit_blockquote(node)
    @stack << Block.new("blockquote", "> ", "> ")
    traverse(node)
    @markdown << format_block
  end

  # Block elements whose rendered content is followed by an extra newline.
  BLOCK_WITH_NEWLINE ||= %w{div p}
  BLOCK_WITH_NEWLINE.each do |tag|
    define_method("visit_#{tag}") do |node|
      @stack << Block.new(tag)
      traverse(node)
      @markdown << format_block
      @markdown << "\n"
    end
  end

  # List containers; the list markers themselves are produced by #visit_li.
  BLOCK_LIST ||= %w{menu ol ul}
  BLOCK_LIST.each do |tag|
    define_method("visit_#{tag}") do |node|
      @stack << Block.new(tag)
      traverse(node)
      @markdown << format_block
    end
  end

  # Renders a list item. Items whose nearest enclosing list block is an `ol`
  # get the "1. " marker, every other item gets "- ".
  def visit_li(node)
    list = @stack.reverse_each.find { |b| b.name[/ul|ol|menu/] }
    prefix = list&.name == "ol" ? "1. " : "- "

    @stack << Block.new("li", prefix, " ")
    traverse(node)
    @markdown << format_block
  end

  # `<h1>`..`<h6>` become ATX headings ("# " .. "###### ").
  (1..6).each do |level|
    define_method("visit_h#{level}") do |node|
      @stack << Block.new("h#{level}", "#" * level + " ")
      traverse(node)
      @markdown << format_block
    end
  end

  # Inline elements that are kept as (attribute-less) HTML tags.
  WHITELISTED ||= %w{del ins kbd s small strike sub sup}
  WHITELISTED.each do |tag|
    define_method("visit_#{tag}") do |node|
      @stack[-1].markdown << "<#{tag}>"
      traverse(node)
      @stack[-1].markdown << "</#{tag}>"
    end
  end

  # Keeps `<abbr>` as HTML, preserving its `title` attribute when present.
  #
  # NOTE(review): the title is interpolated into the tag without escaping, so
  # a title containing a double quote yields malformed markup.
  def visit_abbr(node)
    @stack[-1].markdown << (node["title"].present? ? %Q[<abbr title="#{node["title"]}">] : "<abbr>")
    traverse(node)
    @stack[-1].markdown << "</abbr>"
  end

  # Renders an image, but only when its source is acceptable (see
  # #is_valid_src?) and it is not hidden (see #is_visible_img?).
  def visit_img(node)
    return unless is_valid_src?(node["src"]) && is_visible_img?(node)

    if @opts[:keep_img_tags]
      @stack[-1].markdown << node.to_html
    else
      title = node["alt"].presence || node["title"].presence
      @stack[-1].markdown << "![#{title}](#{node["src"]})"
    end
  end

  # Renders a link as `[text](href)`; anchors without an acceptable href
  # (see #is_valid_href?) only contribute their content.
  def visit_a(node)
    if is_valid_href?(node["href"])
      @stack[-1].markdown << "["
      traverse(node)
      @stack[-1].markdown << "](#{node["href"]})"
    else
      traverse(node)
    end
  end

  # Renders teletype text as inline code.
  def visit_tt(node)
    @stack[-1].markdown << "`"
    traverse(node)
    @stack[-1].markdown << "`"
  end

  # `<code>` inside a `pre` block is already fenced, so only its content is
  # emitted; anywhere else it is rendered as inline code.
  def visit_code(node)
    @stack.reverse_each.find { |b| b.name["pre"] } ? traverse(node) : visit_tt(node)
  end

  # Renders a line break, except when it is the very first node of an
  # emphasis element (it would end up right after the opening delimiter).
  def visit_br(node)
    return if node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name)
    @stack[-1].markdown << "\n"
  end

  # Renders a horizontal rule surrounded by blank lines.
  def visit_hr(node)
    @stack[-1].markdown << "\n\n---\n\n"
  end

  # Emphasis elements: `i`/`em` use a single delimiter, `b`/`strong` a double
  # one. "_" is used instead of "*" when the text itself contains a "*".
  EMPHASIS ||= %w{b strong i em}
  EMPHASIS.each do |tag|
    times = %w{i em}.include?(tag) ? 1 : 2

    define_method("visit_#{tag}") do |node|
      next if node.text.empty?
      # Whitespace-only emphasis collapses to a single space.
      next @stack[-1].markdown << " " if node.text.blank?

      delimiter = (node.text["*"] ? "_" : "*") * times

      # A leading space must stay outside of the delimiters.
      @stack[-1].markdown << " " if node.text[0] == " "
      @stack[-1].markdown << delimiter
      traverse(node)

      # Drop newlines sitting right before the closing delimiter. Anchored
      # with \z (not $) so that blank lines earlier in the block survive.
      @stack[-1].markdown.sub!(/\n+\z/, "")

      # Likewise, move one trailing space outside of the delimiters.
      append_space = false
      if @stack[-1].markdown[-1] == " "
        @stack[-1].markdown.chomp!(" ")
        append_space = true
      end

      @stack[-1].markdown << delimiter
      @stack[-1].markdown << " " if append_space
    end
  end

  # Appends the text of `node` to the current block. The text's own leading
  # whitespace is preserved; whitespace at the start of every other line is
  # removed and remaining runs of two or more whitespace characters collapse
  # to a single space.
  def visit_text(node)
    # Text that opens an emphasis element must not start with whitespace.
    if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
      node.content = node.content.gsub(/\A[[:space:]]+/, "")
    end

    # \A (not ^): only whitespace at the very start of the text is an indent.
    indent = node.text[/\A\s+/] || ""
    text = node.text.gsub(/^\s+/, "").gsub(/\s{2,}/, " ")
    @stack[-1].markdown << [indent, text].join("")
  end

  # Renders the block on top of the stack, pops it, and returns the rendered
  # String. Each line is right-stripped and prefixed with the concatenation
  # of every stacked block's prefix: `head` while a block has not been opened
  # yet, `body` afterwards. Rendering a line marks all stacked blocks as
  # opened. The result ends with "\n" unless the block had no text, in which
  # case it is "".
  def format_block
    lines = @stack[-1].markdown.each_line.map do |line|
      prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
      @stack.each { |b| b.opened = true }
      prefix + line.rstrip
    end
    @stack.pop
    (lines + [""]).join("\n")
  end

  # True when `href` is present and starts with "http" or "www.".
  def is_valid_href?(href)
    href.present? && (href.start_with?("http") || href.start_with?("www."))
  end

  # True when `src` starts with "http" or "www.", or with "cid:" when the
  # :keep_cid_imgs option is set. Blank sources are never valid.
  def is_valid_src?(src)
    return false if src.blank?
    return true if @opts[:keep_cid_imgs] && src.start_with?("cid:")
    src.start_with?("http") || src.start_with?("www.")
  end

  # False when the image declares a zero width or height, either through its
  # `width`/`height` attributes or through its inline style; true otherwise.
  def is_visible_img?(img)
    return false if img["width"].present? && img["width"].to_i == 0
    return false if img["height"].present? && img["height"].to_i == 0
    return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/]
    true
  end

end
|