require "nokogiri"
class HtmlToMarkdown
class Block < Struct.new(:name, :head, :body, :opened, :markdown)
def initialize(name, head="", body="", opened=false, markdown=""); super; end
end
def initialize(html, opts={})
@opts = opts || {}
@doc = Nokogiri::HTML(html)
remove_whitespaces!
end
def remove_whitespaces!
@doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block?
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block?
node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block?
node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element.nil? && node.parent.description&.block?
node.remove if node.content.empty?
end
end
end
def to_markdown
@stack = [Block.new("root")]
@markdown = ""
traverse(@doc)
@markdown << format_block
@markdown.gsub(/\n{3,}/, "\n\n").strip
end
def traverse(node)
node.children.each { |node| visit(node) }
end
def visit(node)
return if node["style"] && node["style"][/display\s*:\s*none/]
if node.description&.block? && node.parent&.description&.block? && @stack[-1].markdown.size > 0
block = @stack[-1].dup
@markdown << format_block
block.markdown = ""
block.opened = true
@stack << block
end
visitor = "visit_#{node.name}"
respond_to?(visitor) ? send(visitor, node) : traverse(node)
end
BLACKLISTED ||= %w{button datalist fieldset form input label legend meter optgroup option output progress select textarea style script}
BLACKLISTED.each do |tag|
class_eval <<-RUBY
def visit_#{tag}(node)
""
end
RUBY
end
def visit_pre(node)
code = node.children.find { |c| c.name == "code" }
code_class = code ? code["class"] : ""
lang = code_class ? code_class[/lang-(\w+)/, 1] : ""
@stack << Block.new("pre")
@markdown << "```#{lang}\n"
traverse(node)
@markdown << format_block
@markdown << "```\n"
end
def visit_blockquote(node)
@stack << Block.new("blockquote", "> ", "> ")
traverse(node)
@markdown << format_block
end
BLOCK_WITH_NEWLINE ||= %w{div p}
BLOCK_WITH_NEWLINE.each do |tag|
class_eval <<-RUBY
def visit_#{tag}(node)
@stack << Block.new("#{tag}")
traverse(node)
@markdown << format_block
@markdown << "\n"
end
RUBY
end
BLOCK_LIST ||= %w{menu ol ul}
BLOCK_LIST.each do |tag|
class_eval <<-RUBY
def visit_#{tag}(node)
@stack << Block.new("#{tag}")
traverse(node)
@markdown << format_block
end
RUBY
end
def visit_li(node)
parent = @stack.reverse.find { |n| n.name[/ul|ol|menu/] }
prefix = parent.name == "ol" ? "1. " : "- "
@stack << Block.new("li", prefix, " ")
traverse(node)
@markdown << format_block
end
(1..6).each do |n|
class_eval <<-RUBY
def visit_h#{n}(node)
@stack << Block.new("h#{n}", "#" * #{n} + " ")
traverse(node)
@markdown << format_block
end
RUBY
end
WHITELISTED ||= %w{del ins kbd s small strike sub sup}
WHITELISTED.each do |tag|
class_eval <<-RUBY
def visit_#{tag}(node)
@stack[-1].markdown << "<#{tag}>"
traverse(node)
@stack[-1].markdown << "#{tag}>"
end
RUBY
end
def visit_abbr(node)
@stack[-1].markdown << (node["title"].present? ? %Q[] : "")
traverse(node)
@stack[-1].markdown << ""
end
def visit_img(node)
if is_valid_src?(node["src"]) && is_visible_img?(node)
if @opts[:keep_img_tags]
@stack[-1].markdown << node.to_html
else
title = node["alt"].presence || node["title"].presence
@stack[-1].markdown << "![#{title}](#{node["src"]})"
end
end
end
def visit_a(node)
if is_valid_href?(node["href"])
@stack[-1].markdown << "["
traverse(node)
@stack[-1].markdown << "](#{node["href"]})"
else
traverse(node)
end
end
def visit_tt(node)
@stack[-1].markdown << "`"
traverse(node)
@stack[-1].markdown << "`"
end
def visit_code(node)
@stack.reverse.find { |n| n.name["pre"] } ? traverse(node) : visit_tt(node)
end
def visit_br(node)
@stack[-1].markdown << "\n"
end
def visit_hr(node)
@stack[-1].markdown << "\n\n---\n\n"
end
def visit_strong(node)
delimiter = node.text["*"] ? "__" : "**"
@stack[-1].markdown << delimiter
traverse(node)
@stack[-1].markdown << delimiter
end
alias :visit_b :visit_strong
def visit_em(node)
delimiter = node.text["*"] ? "_" : "*"
@stack[-1].markdown << delimiter
traverse(node)
@stack[-1].markdown << delimiter
end
alias :visit_i :visit_em
def visit_text(node)
@stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
end
def format_block
lines = @stack[-1].markdown.each_line.map do |line|
prefix = @stack.map { |b| b.opened ? b.body : b.head }.join
@stack.each { |b| b.opened = true }
prefix + line.rstrip
end
@stack.pop
(lines + [""]).join("\n")
end
def is_valid_href?(href)
href.present? && (href.start_with?("http") || href.start_with?("www."))
end
def is_valid_src?(src)
return false if src.blank?
return true if @opts[:keep_cid_imgs] && src.start_with?("cid:")
src.start_with?("http") || src.start_with?("www.")
end
def is_visible_img?(img)
return false if img["width"].present? && img["width"].to_i == 0
return false if img["height"].present? && img["height"].to_i == 0
return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/]
true
end
end