discourse/lib/html_to_markdown.rb

# frozen_string_literal: true

require "nokogumbo"
require "securerandom"

# Converts an HTML document into Markdown.
#
# The HTML is parsed with Nokogiri's HTML5 parser and cleaned up in the
# constructor (unsupported and hidden elements are dropped, line breaks are
# hoisted out of inline elements and redundant white space is collapsed).
# `to_markdown` then renders the remaining tree by dispatching every node to
# the private `visit_<node name>` method of this class.
class HtmlToMarkdown

  # @param html [String] the HTML to convert
  # @param opts [Hash] conversion options; this class reads `:keep_img_tags`
  #   (emit every <img> as raw HTML) and `:keep_cid_imgs` (emit <img> whose
  #   src starts with "cid:" as raw HTML)
  def initialize(html, opts = {})
    @opts = opts

    # we're only interested in <body>
    @doc = Nokogiri::HTML5(html).at("body")

    remove_not_allowed!(@doc)
    remove_hidden!(@doc)
    hoist_line_breaks!(@doc)
    remove_whitespaces!(@doc)
  end

  # @return [String] the Markdown rendering of the document, with runs of
  #   blank lines collapsed to a single blank line and outer white space removed
  def to_markdown
    traverse(@doc)
      .gsub(/\n{2,}/, "\n\n")
      .strip
  end

  private

  # Removes every node that has no matching private `visit_<name>` method,
  # i.e. every node this converter does not know how to render.
  def remove_not_allowed!(doc)
    allowed = Set.new

    HtmlToMarkdown.private_instance_methods.each do |m|
      tag = m.to_s[/^visit_(.+)/, 1]
      allowed << tag if tag
    end

    doc.traverse { |node| node.remove unless allowed.include?(node.name) }
  end

  # Inline styles that make an element invisible. A zero value only counts when
  # it is really zero: the negative lookahead rejects "0.5", "05", ... so that
  # e.g. `opacity: 0.5` or `width: 0.5em` are not mistaken for hidden content.
  HIDDEN_STYLES ||= /(display\s*:\s*none)|(visibility\s*:\s*hidden)|(opacity\s*:\s*0(?![.\d]*[1-9]))|(transform\s*:\s*scale\(0\))|((width|height)\s*:\s*0(?![.\d]*[1-9]))/i

  # Removes elements that are hidden: [hidden] attribute, a style matching
  # HIDDEN_STYLES, or an <img> whose width/height attribute is not positive.
  def remove_hidden!(doc)
    doc.css("[hidden]").remove
    doc.css("[style]").each { |n| n.remove if n["style"][HIDDEN_STYLES] }
    doc.css("img[width]").each { |n| n.remove if n["width"].to_i <= 0 }
    doc.css("img[height]").each { |n| n.remove if n["height"].to_i <= 0 }
  end

  # When there's a <br> inside an inline element, split the inline element around the <br>
  def hoist_line_breaks!(doc)
    # tag every <br> with a random class so we can tell which ones still need processing
    klass = "_" + SecureRandom.hex
    doc.css("br").each { |br| br.add_class(klass) }

    loop do
      changed = false

      doc.css("br.#{klass}").each do |br|
        parent = br.parent

        # `description` is nil for elements Nokogiri has no description for;
        # those are treated like inline elements instead of raising.
        if parent.description&.block?
          # the <br> reached a block element: it is done
          br.remove_class(klass)
        else
          # `before` ends with the <br> itself, `after` holds the following siblings (if any)
          before, after = parent.children.slice_when { |n| n == br }.to_a

          if before.size > 1
            b = Nokogiri::XML::Node.new(parent.name, doc)
            before[0...-1].each { |c| b.add_child(c) }
            parent.previous = b if b.inner_html.present?
          end

          if after.present?
            a = Nokogiri::XML::Node.new(parent.name, doc)
            after.each { |c| a.add_child(c) }
            parent.next = a if a.inner_html.present?
          end

          # the <br> takes the place of its former parent, one level up
          parent.replace(br)

          changed = true
        end
      end

      break if !changed
    end
  end

  # Removes most of the unnecessary white spaces for better markdown conversion
  # Loosely based on the CSS' White Space Processing Rules (https://www.w3.org/TR/css-text-3/#white-space-rules)
  def remove_whitespaces!(node)
    # white space is significant inside <pre>
    return true if "pre" == node.name

    # process each run of consecutive inline children as a unit, recurse into the others
    node.children.chunk { |n| is_inline?(n) }.each do |inline, nodes|
      if inline
        collapse_spaces!(nodes) && remove_trailing_space!(nodes)
      else
        nodes.each { |n| remove_whitespaces!(n) }
      end
    end
  end

  # A node is inline when it is a text node, or an inline element (other than
  # <br>) whose children are all inline too.
  def is_inline?(node)
    node.text? || ("br" != node.name && node.description&.inline? && node.children.all? { |n| is_inline?(n) })
  end

  # Collapses every run of white space in the text nodes of `nodes` (and their
  # descendants) into a single space, dropping it entirely when the previous
  # character was already a space.
  #
  # @param was_space [Boolean] whether the text preceding `nodes` ended with a
  #   space; starts as true so leading white space is removed
  # @return [Boolean] whether the processed text ended with a space
  def collapse_spaces!(nodes, was_space = true)
    nodes.each do |node|
      if node.text?
        text = String.new

        node.text.each_char do |c|
          if c[/[[:space:]]/]
            text << " " if !was_space
            was_space = true
          else
            text << c
            was_space = false
          end
        end

        node.content = text
      else
        node.children.each { |n| was_space = collapse_spaces!([n], was_space) }
      end
    end

    was_space
  end

  # Removes the single trailing space (if any) from the last text node found
  # by following the last child of the last node.
  def remove_trailing_space!(nodes)
    last = nodes[-1]

    if last.text?
      last.content = last.content[0...-1] if last.content[-1] == " "
    elsif last.children.present?
      remove_trailing_space!(last.children)
    end
  end

  # Renders all the children of `node` and concatenates the results.
  def traverse(node)
    node.children.map { |n| visit(n) }.join
  end

  # Renders `node` with its `visit_<name>` method; returns nil when there is none.
  def visit(node)
    visitor = "visit_#{node.name}"
    send(visitor, node) if respond_to?(visitor, true)
  end

  # Prefixes an <img> src must start with to be converted to a Markdown image.
  ALLOWED_IMG_SRCS ||= %w{http:// https:// www.}

  # Prefixes an href must start with to be converted to a Markdown link: the
  # "|"-separated schemes of SiteSetting.allowed_href_schemes, the
  # ALLOWED_IMG_SRCS prefixes and "mailto:". Memoized per instance.
  def allowed_hrefs
    @allowed_hrefs ||= begin
      hrefs = SiteSetting.allowed_href_schemes.split("|").map { |scheme| "#{scheme}:" }.to_set
      ALLOWED_IMG_SRCS.each { |src| hrefs << src }
      hrefs << "mailto:"
      hrefs.to_a
    end
  end

  # Links with an allowed href become `[text](href)`, others keep only their text.
  def visit_a(node)
    if node["href"].present? && node["href"].starts_with?(*allowed_hrefs)
      "[#{traverse(node)}](#{node["href"]})"
    else
      traverse(node)
    end
  end

  # Images are kept as raw HTML when requested through the options, converted
  # to `![title|WxH](src)` when the src is allowed, and dropped (nil) otherwise.
  def visit_img(node)
    return if node["src"].blank?

    if @opts[:keep_img_tags]
      node.to_html
    elsif @opts[:keep_cid_imgs] && node["src"].starts_with?("cid:")
      node.to_html
    elsif node["src"].starts_with?(*ALLOWED_IMG_SRCS)
      title = node["alt"].presence || node["title"].presence
      width = node["width"].to_i
      height = node["height"].to_i
      # dimensions are only emitted when both are known
      dimensions = "|#{width}x#{height}" if width > 0 && height > 0
      "![#{title}#{dimensions}](#{node["src"]})"
    end
  end

  # Tags that are passed through as HTML (without their attributes).
  ALLOWED ||= %w{kbd del ins small big sub sup dl dd dt mark}
  ALLOWED.each do |tag|
    define_method("visit_#{tag}") do |node|
      "<#{tag}>#{traverse(node)}</#{tag}>"
    end
  end

  # Every line of the quoted content is prefixed with "> ".
  def visit_blockquote(node)
    text = traverse(node)
    text.strip!
    text.gsub!(/\n{2,}/, "\n\n")
    text.gsub!(/^/, "> ")
    "\n\n#{text}\n\n"
  end

  # Tags whose content goes on its own line.
  BLOCKS ||= %w{div tr}
  BLOCKS.each do |tag|
    define_method("visit_#{tag}") do |node|
      # no leading line break when the previous element is a block
      prefix = node.previous_element&.description&.block? ? "" : "\n"
      "#{prefix}#{traverse(node)}\n"
    end
  end

  def visit_p(node)
    "\n\n#{traverse(node)}\n\n"
  end

  # Tags that are dropped while their content is kept. "tfoot" is the actual
  # HTML element; "tfooter" is kept for backward compatibility.
  TRAVERSABLES ||= %w{aside font span thead tbody tfoot tfooter u}
  TRAVERSABLES.each do |tag|
    define_method("visit_#{tag}") do |node|
      traverse(node)
    end
  end

  def visit_tt(node)
    "`#{traverse(node)}`"
  end

  # <code> is inline code, unless it is inside a <pre> which renders the fence itself.
  def visit_code(node)
    node.ancestors("pre").present? ? traverse(node) : visit_tt(node)
  end

  # Renders a fenced code block. The fence is "~~~" when the code contains a
  # backtick, and the language comes from a `lang-xxx` class on the first <code>.
  def visit_pre(node)
    text = traverse(node)
    fence = text["`"] ? "~~~" : "```"
    code = node.at("code")
    lang = code && code["class"].to_s[/lang-(\w+)/, 1]
    "\n\n#{fence}#{lang}\n#{text}\n#{fence}\n\n"
  end

  def visit_br(node)
    "\n"
  end

  def visit_hr(node)
    "\n\n---\n\n"
  end

  # Characters escaped when a value is written inside a double-quoted HTML attribute.
  HTML_ATTRIBUTE_ESCAPES ||= { "&" => "&amp;", '"' => "&quot;", "<" => "&lt;", ">" => "&gt;" }

  # Escapes `value` so it can be safely embedded in a double-quoted HTML attribute.
  def escape_attribute(value)
    value.gsub(/[&"<>]/, HTML_ATTRIBUTE_ESCAPES)
  end

  # <abbr> is kept as HTML, together with its (escaped) title when it has one.
  def visit_abbr(node)
    title = node["title"].presence
    title_attr = title ? %[ title="#{escape_attribute(title)}"] : ""
    "<abbr#{title_attr}>#{traverse(node)}</abbr>"
  end

  def visit_acronym(node)
    visit_abbr(node)
  end

  # Headings are surrounded by blank lines so they never share a line with
  # whatever comes before or after them.
  (1..6).each do |n|
    define_method("visit_h#{n}") do |node|
      "\n\n#{"#" * n} #{traverse(node)}\n\n"
    end
  end

  # Table cells rendered outside of `visit_table` are separated by a space.
  CELLS ||= %w{th td}
  CELLS.each do |tag|
    define_method("visit_#{tag}") do |node|
      "#{traverse(node)} "
    end
  end

  # Renders a Markdown table when `extract_rows` accepts the table, using the
  # first row as the header; otherwise only the content of the table is rendered.
  def visit_table(node)
    if rows = extract_rows(node)
      headers = rows[0].css("td, th")
      # line breaks are not allowed inside a Markdown table cell
      text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
      text << "| " + (["-"] * headers.size).join(" | ") + " |\n"
      rows[1..-1].each do |row|
        text << "| " + row.css("td").map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
      end
      "\n\n#{text}\n\n"
    else
      traverse(node)
    end
  end

  # @return the rows of `table`, or nil when the table is nested in another
  #   table, has no row, or has a row whose number of <td> differs from the
  #   number of cells in the first row
  def extract_rows(table)
    return if table.ancestors("table").present?
    return if (rows = table.css("tr")).empty?
    headers_count = rows[0].css("td, th").size
    return if rows[1..-1].any? { |row| row.css("td").size != headers_count }
    rows
  end

  LISTS ||= %w{ul ol}
  LISTS.each do |tag|
    define_method("visit_#{tag}") do |node|
      prefix = node.previous_element&.description&.block? ? "" : "\n"
      # nested lists don't get a trailing line break
      suffix = node.ancestors("ul, ol, li").size > 0 ? "" : "\n"
      "#{prefix}#{traverse(node)}#{suffix}"
    end
  end

  # Renders a list item with the marker of the closest enclosing list and
  # indents its continuation lines according to the nesting depth.
  def visit_li(node)
    text = traverse(node)

    lists = node.ancestors("ul, ol")
    marker = "ol" == lists[0]&.name ? "1. " : "- "
    indent = (" " * marker.size) * [1, lists.size].max
    suffix = node == node.parent.elements[-1] ? "" : "\n"

    text.gsub!(/\n{2,}/, "\n\n")
    # indent every line that is not blank
    text.gsub!(/^(?!\s*$)/, indent)
    text.lstrip!

    "#{marker}#{text}#{suffix}"
  end

  # Shared rendering of the emphasis-like inline tags.
  #
  # @param tag [String] the HTML tag, used when falling back to HTML
  # @param text [String] the already rendered content of the tag
  # @param marker [String] the Markdown marker put on both sides of the text
  # @param use_html [Object] truthy when the text can't be wrapped with the
  #   marker; a text containing a line break always falls back to HTML
  # @return [String] "" for an empty text, " " for a blank one, otherwise the
  #   wrapped text with the markers moved inside the outer spaces
  def wrap_inline(tag, text, marker, use_html)
    return ""  if text.empty?
    return " " if text.blank?
    return "<#{tag}>#{text}</#{tag}>" if text["\n"] || use_html

    # a marker must touch the text, so outer spaces are moved outside of it
    prefix = text[0][" "]
    suffix = text[-1][" "] if text.size > 1

    "#{prefix}#{marker}#{text.strip}#{marker}#{suffix}"
  end

  EMPHASES ||= %w{i em}
  EMPHASES.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      # use the marker that is not already present in the text
      wrap_inline(tag, text, text["*"] ? "_" : "*", text["*"] && text["_"])
    end
  end

  STRONGS ||= %w{b strong}
  STRONGS.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      # use the marker that is not already present in the text
      wrap_inline(tag, text, text["*"] ? "__" : "**", text["*"] && text["_"])
    end
  end

  STRIKES ||= %w{s strike}
  STRIKES.each do |tag|
    define_method("visit_#{tag}") do |node|
      text = traverse(node)
      wrap_inline(tag, text, "~~", text["~~"])
    end
  end

  def visit_text(node)
    node.text
  end

end