# discourse/spec/lib/html_to_markdown_spec.rb

# frozen_string_literal: true

require 'html_to_markdown'

describe HtmlToMarkdown do

  # Spec helper: builds an HtmlToMarkdown for +html+ and returns its Markdown.
  #
  # html - HTML source string to convert.
  # opts - options Hash passed through to HtmlToMarkdown.new (defaults to {}).
  #
  # Returns the value of HtmlToMarkdown#to_markdown.
  def html_to_markdown(html, opts = {})
    converter = HtmlToMarkdown.new(html, opts)
    converter.to_markdown
  end

  it "remove whitespaces" do
    html = <<-HTML
      <div dir="auto">Hello,
        <div dir="auto"><br></div>
        <div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp; </div>
        <div dir="auto"><br></div>
        <div dir="auto">
          &nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
        </div>
      </div>
    HTML

    expect(html_to_markdown(html)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph")

    html = <<~HTML
      <body text="#000000" bgcolor="#FFFFFF">
          <p>Let me see if it happens by answering your message through
            Thunderbird.</p>
          <p>Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
            Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
            Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
            Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
            Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
            Long sentence 1
          </p>
      </body>
    HTML

    markdown = <<~MD
      Let me see if it happens by answering your message through Thunderbird.

      Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1
    MD

    expect(html_to_markdown(html)).to eq(markdown.strip)

    html = <<~HTML
      <p>    This     post
            has             lots<br>           of
                  space
      </p>
      <pre>    This     space    was   left untouched     !</pre>
    HTML

    markdown = <<~MD
      This post has lots
      of space

      ```
          This     space    was   left untouched     !
      ```
    MD

    expect(html_to_markdown(html)).to eq(markdown.strip)
  end

  # A block-level wrapper (<aside>) around a <blockquote> must convert cleanly;
  # the expected output is just the quoted lines.
  it "doesn't error on non-inline elements like (aside, section)" do
    quoted_html = <<~HTML
      <aside class="quote no-group">
      <blockquote>
      <p>Hello,<br>is it me you're looking for?</p>
      </blockquote>
      <br>
      </aside>
    HTML
    expected = "> Hello,\n> is it me you're looking for?"

    expect(html_to_markdown(quoted_html)).to eq(expected)
  end

  # Content of an element carrying the `hidden` attribute is expected to be dropped.
  it "skips hidden tags" do
    html = %Q{<p>Hello <span hidden>cruel </span>World!</p>}

    expect(html_to_markdown(html)).to eq("Hello World!")
  end

  # Bold is expected as `**`, or as `__` when the text itself contains `*`.
  it "converts <strong>" do
    {
      "<strong>Strong</strong>" => "**Strong**",
      "<strong>Str*ng</strong>" => "__Str*ng__",
    }.each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  it "converts <b>" do
    {
      "<b>Bold</b>" => "**Bold**",
      "<b>B*ld</b>" => "__B*ld__",
    }.each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end

    # Trailing <br>s inside the <b> must not end up inside the bold markers.
    multiline_html = "Before\n<p><b>Bold\n<br>\n<br>\n</b>\n</p>\nAfter\n"
    expect(html_to_markdown(multiline_html)).to eq("Before\n\n**Bold**\n\nAfter")
  end

  # Emphasis is expected as `*`, or as `_` when the text itself contains `*`.
  it "converts <em>" do
    {
      "<em>Emphasis</em>" => "*Emphasis*",
      "<em>Emph*sis</em>" => "_Emph*sis_",
    }.each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  it "converts <i>" do
    {
      "<i>Italic</i>" => "*Italic*",
      "<i>It*lic</i>" => "_It*lic_",
    }.each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  it "converts <a>" do
    link_html = %Q{<a href="https://www.discourse.org">Discourse</a>}

    expect(html_to_markdown(link_html)).to eq("[Discourse](https://www.discourse.org)")
  end

  # With "steam" listed in the setting, a steam:// href is expected to be kept.
  it "supports SiteSetting.allowed_href_schemes" do
    SiteSetting.allowed_href_schemes = "tel|steam"
    steam_link = %Q{<a href="steam://store/48000">LIMBO</a>}

    expect(html_to_markdown(steam_link)).to eq("[LIMBO](steam://store/48000)")
  end

  # Missing, empty or scheme-less hrefs: only the link text is expected to remain.
  it "removes empty & invalid <a>" do
    [
      %Q{<a>Discourse</a>},
      %Q{<a href="">Discourse</a>},
      %Q{<a href="foo.bar">Discourse</a>},
    ].each do |anchor|
      expect(html_to_markdown(anchor)).to eq("Discourse")
    end
  end

  # <img> fixtures shared by the image examples below.
  # `||=` only assigns when the constant is not already set.
  HTML_WITH_IMG     ||= '<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">'
  HTML_WITH_CID_IMG ||= '<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">'

  it "converts <img>" do
    markdown = html_to_markdown(HTML_WITH_IMG)

    expect(markdown).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
  end

  # With keep_img_tags the <img> markup is expected back unchanged.
  it "keeps <img> with 'keep_img_tags'" do
    markdown = html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)

    expect(markdown).to eq(HTML_WITH_IMG)
  end

  it "removes empty & invalid <img>" do
    [
      %Q{<img>},
      %Q{<img src="">},
      %Q{<img src="foo.bar">},
    ].each do |img|
      expect(html_to_markdown(img)).to eq("")
    end
  end

  # With keep_cid_imgs a cid: image is expected back unchanged.
  it "keeps <img> with src='cid:' with 'keep_cid_imgs'" do
    markdown = html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)

    expect(markdown).to eq(HTML_WITH_CID_IMG)
  end

  # A zero width or height (quoted or not) means the image is expected to be dropped.
  it "skips hidden <img>" do
    [
      %Q{<img src="https://www.discourse.org/logo.svg" width=0>},
      %Q{<img src="https://www.discourse.org/logo.svg" height="0">},
    ].each do |img|
      expect(html_to_markdown(img)).to eq("")
    end
  end

  # Dimensions are only expected in the output when both width and height are given.
  it "supports width/height on <img>" do
    {
      %Q{<img src="https://www.discourse.org/logo.svg" height=100>} => "![](https://www.discourse.org/logo.svg)",
      %Q{<img src="https://www.discourse.org/logo.svg" width=200>} => "![](https://www.discourse.org/logo.svg)",
      %Q{<img src="https://www.discourse.org/logo.svg" height=100 width=200>} => "![|200x100](https://www.discourse.org/logo.svg)",
    }.each do |img, markdown|
      expect(html_to_markdown(img)).to eq(markdown)
    end
  end

  # One example per heading level: <hN> is expected as N "#" characters plus the text.
  1.upto(6) do |level|
    tag = "h#{level}"

    it "converts <#{tag}>" do
      html = "<#{tag}>Header #{level}</#{tag}>"

      expect(html_to_markdown(html)).to eq("#{'#' * level} Header #{level}")
    end
  end

  it "converts <br>" do
    html = "Before<br>Inside<br>After"

    expect(html_to_markdown(html)).to eq("Before\nInside\nAfter")
  end

  # A <br> directly followed by a newline inside <p> is expected to produce a
  # single line break, not two.
  # The backslash in the description is escaped so the example name reads
  # "...is \n" instead of containing a raw line break.
  it "skips <br> inside <p> if next character is \\n" do
    expect(html_to_markdown("<p>Before<br>\nInside<br>After</p>")).to eq("Before\nInside\nAfter")
  end

  # <hr> is expected as "---" separated from its neighbours by blank lines.
  it "converts <hr>" do
    html = "Before<hr>Inside<hr>After"
    expected = ["Before", "---", "Inside", "---", "After"].join("\n\n")

    expect(html_to_markdown(html)).to eq(expected)
  end

  it "converts <tt>" do
    html = "<tt>Teletype</tt>"

    expect(html_to_markdown(html)).to eq("`Teletype`")
  end

  it "converts <code>" do
    html = "<code>Code</code>"

    expect(html_to_markdown(html)).to eq("`Code`")
  end

  # Tags that are expected to pass through untouched: for each of them the
  # output must equal the input HTML.
  {
    "ins" => "This is an <ins>insertion</ins>",
    "del" => "This is a <del>deletion</del>",
    "sub" => "H<sub>2</sub>O",
    "mark" => "<mark>This is highlighted!</mark>",
    "sup" => "<sup>Super Script!</sup>",
    "small" => "<small>Small</small>",
    "kbd" => "<kbd>CTRL</kbd>+<kbd>C</kbd>",
    "abbr" => %Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>},
  }.each do |tag, html|
    it "supports <#{tag}>" do
      expect(html_to_markdown(html)).to eq(html)
    end
  end

  # Both strike-through tags are expected to become "~~".
  %w[s strike].each do |tag|
    it "supports <#{tag}>" do
      html = "<#{tag}>Strike Through</#{tag}>"

      expect(html_to_markdown(html)).to eq("~~Strike Through~~")
    end
  end

  it "supports <blockquote>" do
    html = "<blockquote>Quote</blockquote>"

    expect(html_to_markdown(html)).to eq("> Quote")
  end

  # The same list with and without newlines between tags must give the same bullets.
  it "supports <ul>" do
    expected = "- 🍏\n- 🍐\n- 🍌"

    [
      "<ul><li>🍏</li><li>🍐</li><li>🍌</li></ul>",
      "<ul>\n<li>🍏</li>\n<li>🍐</li>\n<li>🍌</li>\n</ul>",
    ].each do |list_html|
      expect(html_to_markdown(list_html)).to eq(expected)
    end
  end

  # Every ordered item is expected to be numbered "1.".
  it "supports <ol>" do
    list_html = "<ol><li>🍆</li><li>🍅</li><li>🍄</li></ol>"

    expect(html_to_markdown(list_html)).to eq("1. 🍆\n1. 🍅\n1. 🍄")
  end

  # Paragraphs inside items are expected to separate the bullets with blank lines.
  it "supports <p> inside <li>" do
    list_html = "<ul><li><p>🍏</p></li><li><p>🍐</p></li><li><p>🍌</p></li></ul>"

    expect(html_to_markdown(list_html)).to eq("- 🍏\n\n- 🍐\n\n- 🍌")
  end

  # Nested lists are expected to be indented by two spaces per level.
  it "supports <ul> inside <ul>" do
    nested_html = <<-HTML
      <ul>
        <li>Fruits
            <ul>
                <li>🍏</li>
                <li>🍐</li>
                <li>🍌</li>
            </ul>
        </li>
        <li>Vegetables
            <ul>
                <li>🍆</li>
                <li>🍅</li>
                <li>🍄</li>
            </ul>
        </li>
      </ul>
    HTML

    expected = [
      "- Fruits",
      "  - 🍏",
      "  - 🍐",
      "  - 🍌",
      "- Vegetables",
      "  - 🍆",
      "  - 🍅",
      "  - 🍄",
    ].join("\n")

    expect(html_to_markdown(nested_html)).to eq(expected)
  end

  # An <li> without a surrounding list is still expected to become a bullet.
  it "supports bare <li>" do
    html = "<li>I'm alone</li>"

    expect(html_to_markdown(html)).to eq("- I'm alone")
  end

  # <pre> is expected as a fenced block; a "lang-*" class on the inner <code>
  # supplies the fence language, and indentation inside <pre> is kept.
  it "supports <pre>" do
    [
      ["<pre>var foo = 'bar';</pre>", "```\nvar foo = 'bar';\n```"],
      ["<pre><code>var foo = 'bar';</code></pre>", "```\nvar foo = 'bar';\n```"],
      [%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>}, "```javascript\nvar foo = 'bar';\n```"],
      ["<pre>    function f() {\n        console.log('Hello world!');\n    }</pre>", "```\n    function f() {\n        console.log('Hello world!');\n    }\n```"],
    ].each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  # Each line of a fenced block inside a quote is expected to carry the "> " prefix.
  it "supports <pre> inside <blockquote>" do
    html = "<blockquote><pre><code>var foo = 'bar';</code></pre></blockquote>"

    expect(html_to_markdown(html)).to eq("> ```\n> var foo = 'bar';\n> ```")
  end

  # A blockquote nested in a list item: the quote is expected indented under the bullet.
  it "works" do
    html = "<ul><li><p>A list item with a blockquote:</p><blockquote><p>This is a <strong>blockquote</strong><br>inside a list item.</p></blockquote></li></ul>"
    expected = "- A list item with a blockquote:\n\n  > This is a **blockquote**\n  > inside a list item."

    expect(html_to_markdown(html)).to eq(expected)
  end

  it "supports html document" do
    document = "<html><body>Hello<div>World</div></body></html>"

    expect(html_to_markdown(document)).to eq("Hello\nWorld")
  end

  # Paragraphs are expected to be separated by a blank line, with inner line
  # wraps folded into spaces.
  it "handles <p>" do
    {
      "<p>1st paragraph</p><p>2nd paragraph</p>" => "1st paragraph\n\n2nd paragraph",
      "<body><p>1st paragraph</p>\n    <p>    2nd paragraph\n    2nd paragraph</p>\n<p>3rd paragraph</p></body>" => "1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph",
    }.each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  # Adjacent <div>s are expected on consecutive lines (single newline).
  it "handles <div>" do
    html = "<div>1st div</div><div>2nd div</div>"

    expect(html_to_markdown(html)).to eq("1st div\n2nd div")
  end

  it "swallows <span>" do
    html = "<span>Span</span>"

    expect(html_to_markdown(html)).to eq("Span")
  end

  it "swallows <u>" do
    html = "<u>Underline</u>"

    expect(html_to_markdown(html)).to eq("Underline")
  end

  it "removes <script>" do
    html = "<script>var foo = 'bar'</script>"

    expect(html_to_markdown(html)).to eq("")
  end

  it "removes <style>" do
    html = "<style>* { margin: 0 }</style>"

    expect(html_to_markdown(html)).to eq("")
  end

  # Block elements nested inside an inline <span> must still break lines.
  it "handles <p> and <div> within <span>" do
    nested = "<div>1st paragraph<span><div>2nd paragraph</div><p>3rd paragraph</p></span></div>"
    expected = "1st paragraph\n2nd paragraph\n\n3rd paragraph"

    expect(html_to_markdown(nested)).to eq(expected)
  end

  it "handles <p> and <div> within <font>" do
    nested = "<font>1st paragraph<br><span>2nd paragraph</span><div>3rd paragraph</div><p>4th paragraph</p></font>"
    expected = "1st paragraph\n2nd paragraph\n3rd paragraph\n\n4th paragraph"

    expect(html_to_markdown(nested)).to eq(expected)
  end

  # A <br> inside an emphasis tag: the expected Markdown closes the markers
  # before the line break and reopens them after it when text follows.
  context "with an oddly placed <br>" do
    it "handles <strong>" do
      {
        "Hello <strong><br>Bold</strong> World" => "Hello\n**Bold** World",
        "Hello <strong>Bold<br></strong> World" => "Hello **Bold**\nWorld",
        "Hello <strong>Bold<br>text</strong> World" => "Hello **Bold**\n**text** World",
      }.each do |html, markdown|
        expect(html_to_markdown(html)).to eq(markdown)
      end
    end

    it "handles <em>" do
      {
        "Hello <em><br>Italic</em> World" => "Hello\n*Italic* World",
        "Hello <em>Italic<br></em> World" => "Hello *Italic*\nWorld",
        "Hello <em>Italic<br>text</em> World" => "Hello *Italic*\n*text* World",
      }.each do |html, markdown|
        expect(html_to_markdown(html)).to eq(markdown)
      end
    end

    # Nested bold/italic with <br>s at both nesting levels.
    it "works" do
      html = "<div>A <b> B <i> C <br> D </i> E <br> F </b> G</div>"
      expected = "A __B *C*__\n__*D* E__\n**F** G"

      expect(html_to_markdown(html)).to eq(expected)
    end
  end

  # Emphasis tags with no text are expected to emit no markers; whitespace-only
  # content between words is expected to collapse to one space.
  context "with an empty tag" do
    it "handles <strong>" do
      {
        "<strong></strong>" => "",
        "<strong>   </strong>" => "",
        "Some<strong> </strong>text" => "Some text",
        "Some<strong>    </strong>text" => "Some text",
      }.each do |html, markdown|
        expect(html_to_markdown(html)).to eq(markdown)
      end
    end

    it "handles <em>" do
      {
        "<em></em>" => "",
        "<em>   </em>" => "",
        "Some<em> </em>text" => "Some text",
        "Some<em>    </em>text" => "Some text",
      }.each do |html, markdown|
        expect(html_to_markdown(html)).to eq(markdown)
      end
    end
  end

  # Leading/trailing spaces inside an emphasis tag are expected outside the
  # markers (or dropped at the edges of the output).
  context "with spaces around text" do
    it "handles <strong>" do
      [
        "<strong> Bold</strong>",
        "<strong>     Bold</strong>",
        "<strong>Bold </strong>",
        "<strong>Bold     </strong>",
      ].each do |html|
        expect(html_to_markdown(html)).to eq("**Bold**")
      end

      [
        "Some<strong> bold</strong> text",
        "Some<strong>     bold</strong> text",
        "Some <strong>bold </strong>text",
        "Some <strong>bold     </strong>text",
      ].each do |html|
        expect(html_to_markdown(html)).to eq("Some **bold** text")
      end
    end

    it "handles <em>" do
      [
        "<em> Italic</em>",
        "<em>     Italic</em>",
        "<em>Italic </em>",
        "<em>Italic     </em>",
      ].each do |html|
        expect(html_to_markdown(html)).to eq("*Italic*")
      end

      [
        "Some<em> italic</em> text",
        "Some<em>     italic</em> text",
        "Some <em>italic </em>text",
        "Some <em>italic     </em>text",
      ].each do |html|
        expect(html_to_markdown(html)).to eq("Some *italic* text")
      end
    end
  end

  # A well-formed <table> is expected as a pipe table: header row, a "| - |"
  # separator row, then the body rows, with inline markup inside cells converted.
  it "supports <table>" do
    # Full table with <thead>/<tbody> and inline formatting inside cells.
    html = <<~HTML
      <table>
        <thead>
          <tr>
            <th>This</th>
            <th>is</th>
            <th>the</th>
            <th><i>headers</i></th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td>I am</td>
            <td>the</td>
            <td><b>first</b></td>
            <td>row</td>
          </tr>
          <tr>
            <td>And this</td>
            <td>is the</td>
            <td>2<sup>nd</sup></td>
            <td>line</td>
          </tr>
        </tbody>
      </table>
    HTML

    markdown = <<~MD
      | This | is | the | *headers* |
      | - | - | - | - |
      | I am | the | **first** | row |
      | And this | is the | 2<sup>nd</sup> | line |
    MD

    # The heredoc ends with a newline, hence the strip before comparing.
    expect(html_to_markdown(html)).to eq(markdown.strip)

    # Single-row table without <th>: the only row is expected in the header
    # position, followed by the separator row.
    expect(html_to_markdown("<table><tr><td>Hello</td><td>World</td></tr></table>")).to eq("| Hello | World |\n| - | - |")
  end

  # Rows with differing cell counts (4 header cells vs 3 data cells): the cell
  # text is expected as plain space-separated lines rather than being dropped.
  it "doesn't swallow badly formatted <table>" do
    html = <<~HTML
      <table>
        <tr>
          <th>1</th>
          <th>2</th>
          <th>3</th>
          <th>4</th>
        </tr>
        <tr>
          <td>One</td>
          <td>Two</td>
          <td>Three</td>
        </tr>
      </table>
    HTML

    # Note the space after "4" in the expected first line.
    expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three")
  end

end