discourse/lib/plain_text_to_markdown.rb

class PlainTextToMarkdown
  SIGNATURE_SEPARATOR ||= "-- ".freeze

  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

  def initialize(plaintext, opts = {})
    @plaintext = plaintext
    @lines = []

    @format_flowed = opts[:format_flowed] || false
    @delete_flowed_space = opts[:delete_flowed_space] || false
  end

  def to_markdown
    prepare_lines
    classify_lines

    markdown = ""
    last_quote_level = 0
    last_line_blank = false

    @lines.each do |line|
      current_line_blank = line.text.blank?

      unless last_line_blank && current_line_blank
        if line.quote_level > 0
          quote_identifiers = ">" * line.quote_level
          markdown << quote_identifiers << "\n" unless line.quote_level >= last_quote_level || current_line_blank
          markdown << quote_identifiers
          markdown << " " unless current_line_blank
        else
          markdown << "\n" unless last_quote_level == 0 || current_line_blank
        end

        markdown << convert_text(line)
        markdown << "\n"
      end

      last_line_blank = current_line_blank
      last_quote_level = line.quote_level
    end

    markdown.rstrip!
    markdown
  end

  private

  class CodeBlock < Struct.new(:start_line, :end_line)
    def initialize(start_line, end_line = nil)
      super
    end

    def valid?
      start_line.present? && end_line.present?
    end
  end

  class Line < Struct.new(:text, :quote_level, :code_block)
    def initialize(text, quote_level = 0, code_block = nil)
      super
    end

    def valid_code_block?
      code_block&.valid?
    end
  end

  def prepare_lines
    previous_line = nil

    @plaintext.each_line do |text|
      text.chomp!
      line = Line.new(text)

      remove_quote_level_indicators!(line)

      if @format_flowed
        line = merge_lines(line, previous_line)
        @lines << line unless line == previous_line
      else
        @lines << line
      end

      previous_line = line
    end
  end

  def classify_lines
    previous_line = nil

    @lines.each do |line|
      classify_line_as_code!(line, previous_line)

      previous_line = line
    end
  end

  # @param line [Line]
  def remove_quote_level_indicators!(line)
    match_data = line.text.match(/^(?<indicators>>+)\s?(?<text>.*)/)

    if match_data
      line.text = match_data[:text]
      line.quote_level = match_data[:indicators].length
    end
  end

  # @param line [Line]
  # @param previous_line [Line]
  # @return [Line]
  def merge_lines(line, previous_line)
    return line if previous_line.nil? || line.text.blank?
    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR
    return line unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ")

    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
    previous_line.text += line.text
    previous_line
  end

  # @param line [Line]
  # @param previous_line [Line]
  def classify_line_as_code!(line, previous_line)
    line.code_block = previous_line.code_block unless previous_line.nil? || previous_line.valid_code_block?
    return unless line.text =~ /^\s{0,3}```/

    if line.code_block.present?
      line.code_block.end_line = line
    else
      line.code_block = CodeBlock.new(line)
    end
  end

  # @param line [Line]
  # @return [string]
  def convert_text(line)
    text = line.text

    if line.valid_code_block?
      code_block = line.code_block
      return code_block.start_line == line || code_block.end_line == line ? text.lstrip : text
    end

    converted_text = replace_duplicate_links(text)
    converted_text = escape_special_characters(converted_text)
    converted_text = indent_with_non_breaking_spaces(converted_text)
    converted_text
  end

  def replace_duplicate_links(text)
    text.to_enum(:scan, URL_REGEX)
      .map { $& }
      .group_by { |url| url }
      .keep_if { |_, urls | urls.length > 1 }
      .keys.each do |url|

      text.gsub!(Regexp.new(%Q|#{url}(\s*[()\\[\\]<>«»'"“”‘’]?#{url}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
    end

    text
  end

  def indent_with_non_breaking_spaces(text)
    text.sub(/^\s+/) do |s|
      # replace tabs with 2 spaces
      s.gsub!("\t", "  ")

      # replace indentation with non-breaking spaces
      s.length > 1 ? "&nbsp;" * s.length : s
    end
  end

  def escape_special_characters(text)
    escaped_text = ""

    text.split(URL_REGEX).each do |text_part|
      if text_part =~ URL_REGEX
        # no escaping withing URLs
        escaped_text << text_part
      else
        # escape Markdown and HTML
        text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
        escaped_text << CGI.escapeHTML(text_part)
      end
    end

    escaped_text
  end
end