FIX: Remove newlines from img alt & title in HTML to markdown parser (#25473)

We were having a minor issue with emails with embedded images that had newlines in the alt string; for example: ``` <p class="MsoNormal"><span style="font-size:11.0pt"><img width="898" height="498" style="width:9.3541in;height:5.1875in" id="Picture_x0020_5" src="cid:image003.png@01DA4EBA.0400B610" alt="A screenshot of a computer program Description automatically generated"></span><span style="font-size:11.0pt"><o:p></o:p></span></p> ``` Once this was parsed and converted to markdown (or directly to HTML in some cases), this caused an issue in the composer and the post UI, where the markdown parser didn't know how to deal with this, making the HTML show directly instead of showing an image. The easiest way to deal with this is to just strip \n from image alt and title attrs in the HTMLToMarkdown class.
2024-01-31 10:23:09 +10:00 · 2024-01-31 10:23:09 +10:00 · 575bc4af73
parent 72eb29e5c3
commit 575bc4af73
2 changed files with 23 additions and 2 deletions
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@ -21,6 +21,10 @@ class HtmlToMarkdown

  private

+  def strip_newlines(string)
+    string.gsub(/\n/, " ")&.squeeze(" ")
+  end
+
  def remove_not_allowed!(doc)
    allowed = Set.new

@ -167,16 +171,18 @@ class HtmlToMarkdown
  def visit_img(node)
    return if node["src"].blank?

+    node["alt"] = strip_newlines(node["alt"]) if node["alt"].present?
+    node["title"] = strip_newlines(node["title"]) if node["title"].present?
+
    if @opts[:keep_img_tags]
      node.to_html
    elsif @opts[:keep_cid_imgs] && node["src"].start_with?("cid:")
      node.to_html
    elsif node["src"].start_with?(*ALLOWED_IMG_SRCS)
-      title = node["alt"].presence || node["title"].presence
      width = node["width"].to_i
      height = node["height"].to_i
      dimensions = "|#{width}x#{height}" if width > 0 && height > 0
-      "![#{title}#{dimensions}](#{node["src"]})"
+      "![#{node["alt"] || node["title"]}#{dimensions}](#{node["src"]})"
    end
  end

--- a/spec/lib/html_to_markdown_spec.rb
+++ b/spec/lib/html_to_markdown_spec.rb
@ -150,6 +150,14 @@ RSpec.describe HtmlToMarkdown do
    expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
  end

+  it "removes newlines from img alt text" do
+    html_with_alt_newlines =
+      %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse\n\nLogo">}
+    expect(html_to_markdown(html_with_alt_newlines)).to eq(
+      "![Discourse Logo](https://www.discourse.org/logo.svg)",
+    )
+  end
+
  it "removes empty & invalid <img>" do
    expect(html_to_markdown("<img>")).to eq("")
    expect(html_to_markdown(%Q{<img src="">})).to eq("")
@ -160,6 +168,13 @@ RSpec.describe HtmlToMarkdown do
    expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG)
  end

+  it "removes newlines from img alt text with cid images" do
+    html_with_cid_alt_newlines = %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse\n\nLogo">}
+    expect(html_to_markdown(html_with_cid_alt_newlines, keep_cid_imgs: true)).to eq(
+      %Q{<img src="cid:ii_1525434659ddb4cb" title="Discourse Logo">},
+    )
+  end
+
  it "skips hidden <img>" do
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
    expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq(