FIX: `HtmlToMarkdown` should keep HTML entities for <, > and & within HTML elements

Not all HTML elements are converted into Markdown. Some are kept as HTML.
Without this fix, XML/HTML entities that are formatted as text instead of code are swallowed by Discourse.
This also fixes quotes in the `title` attribute of the `<abbr>` tag.
This commit is contained in:
Gerhard Schlager 2024-06-09 14:17:32 +02:00 committed by Gerhard Schlager
parent 3c9d61d302
commit 7bdf47b864
2 changed files with 67 additions and 3 deletions

View File

@ -197,7 +197,9 @@ class HtmlToMarkdown
# Tags that are emitted as raw HTML in the Markdown output rather than being
# converted; a `visit_<tag>` method is generated for each of them below.
ALLOWED ||= %w[kbd del ins small big sub sup dl dd dt mark]
ALLOWED.each do |tag|
# NOTE(review): this listing shows both sides of the diff without +/- markers.
# The one-line definition below looks like the removed version; read as plain
# Ruby it is immediately overridden by the second `define_method` call.
define_method("visit_#{tag}") { |node| "<#{tag}>#{traverse(node)}</#{tag}>" }
# Wraps the traversed children in the same tag and passes
# `within_html_block: true` to `traverse` -- presumably so that <, > and &
# stay HTML entities inside the kept element (confirm in `traverse`).
define_method("visit_#{tag}") do |node|
"<#{tag}>#{traverse(node, within_html_block: true)}</#{tag}>"
end
end
def visit_blockquote(node)
@ -250,8 +252,8 @@ class HtmlToMarkdown
# Renders an <abbr> node as HTML, carrying over its `title` attribute when the
# node has a non-blank one.
def visit_abbr(node)
title = node["title"].presence
# NOTE(review): the next two lines look like the removed side of the diff. They
# interpolate the title into the attribute verbatim (no quote escaping), and
# read as plain Ruby the resulting string is discarded.
title_attr = title ? %[ title="#{title}"] : ""
"<abbr#{title_attr}>#{traverse(node)}</abbr>"
# `attributes` stays nil when there is no title, so no attribute hash is passed.
attributes = { title: } if title
# Builds the element via `create_element` and serializes it with `to_html`;
# presumably this is what escapes quotes in the title -- confirm in `create_element`.
create_element("abbr", traverse(node, within_html_block: true), attributes).to_html
end
def visit_acronym(node)

View File

@ -259,6 +259,35 @@ RSpec.describe HtmlToMarkdown do
expect(html_to_markdown("<code>Code</code>")).to eq("`Code`")
end
# Generates one example per tag in HtmlToMarkdown::ALLOWED. Within such a tag,
# &lt;, &gt; and &amp; are expected to stay entities, while &quot;, &apos;,
# &copy; and &euro; are expected to come out as literal characters.
describe "when HTML is used within Markdown" do
HtmlToMarkdown::ALLOWED.each do |tag|
it "keeps mandatory HTML entities in text of <#{tag}>" do
expect(html_to_markdown("<#{tag}>Less than: &lt;</#{tag}>")).to eq(
"<#{tag}>Less than: &lt;</#{tag}>",
)
# The next two inputs omit the closing tag; the expected output still has it.
expect(html_to_markdown("<#{tag}>Greater than: &gt;")).to eq(
"<#{tag}>Greater than: &gt;</#{tag}>",
)
expect(html_to_markdown("<#{tag}>Ampersand: &amp;")).to eq(
"<#{tag}>Ampersand: &amp;</#{tag}>",
)
expect(html_to_markdown("<#{tag}>Double Quote: &quot;</#{tag}>")).to eq(
"<#{tag}>Double Quote: \"</#{tag}>",
)
expect(html_to_markdown("<#{tag}>Single Quote: &apos;</#{tag}>")).to eq(
"<#{tag}>Single Quote: '</#{tag}>",
)
expect(html_to_markdown("<#{tag}>Copyright Symbol: &copy;</#{tag}>")).to eq(
"<#{tag}>Copyright Symbol: ©</#{tag}>",
)
expect(html_to_markdown("<#{tag}>Euro Symbol: &euro;</#{tag}>")).to eq(
"<#{tag}>Euro Symbol: €</#{tag}>",
)
end
end
end
it "supports <ins>" do
expect(html_to_markdown("This is an <ins>insertion</ins>")).to eq(
"This is an <ins>insertion</ins>",
@ -285,16 +314,37 @@ RSpec.describe HtmlToMarkdown do
# <small> is kept as HTML on its own, nested inside another kept tag (<mark>),
# inside converted Markdown (**...**), and around converted Markdown whose text
# contains &lt; / &gt; entities.
it "supports <small>" do
expect(html_to_markdown("<small>Small</small>")).to eq("<small>Small</small>")
expect(html_to_markdown("<mark><small>Small</small></mark>")).to eq(
"<mark><small>Small</small></mark>",
)
expect(html_to_markdown("<strong><small>Small</small></strong>")).to eq(
"**<small>Small</small>**",
)
# Entities must survive even when the text sits inside **bold** within <small>.
expect(html_to_markdown("<small><strong>&lt;small&gt;</strong></small>")).to eq(
"<small>**&lt;small&gt;**</small>",
)
end
# <big> is kept as HTML, and &lt; / &gt; inside it stay entities.
it "supports <big>" do
expect(html_to_markdown("<big>Big</big>")).to eq("<big>Big</big>")
expect(html_to_markdown("<big>&lt;big&gt;</big>")).to eq("<big>&lt;big&gt;</big>")
end
# <kbd> is kept as HTML (including adjacent elements joined by text), and a
# lone &lt; inside it stays an entity.
it "supports <kbd>" do
expect(html_to_markdown("<kbd>CTRL</kbd>+<kbd>C</kbd>")).to eq("<kbd>CTRL</kbd>+<kbd>C</kbd>")
expect(html_to_markdown("<kbd>&lt;</kbd>")).to eq("<kbd>&lt;</kbd>")
end
# <abbr> is kept as HTML together with its title attribute.
it "supports <abbr>" do
expect(
html_to_markdown(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>}),
).to eq(%Q{<abbr title="Civilized Discourse Construction Kit, Inc.">CDCK</abbr>})
# Quotes in the title must stay &quot; and the element text must keep &lt; / &gt;.
expect(
html_to_markdown(
%Q{<abbr title="&quot;abbr&quot;: The Abbreviation element">&lt;abbr&gt;</abbr>},
),
).to eq(%Q{<abbr title="&quot;abbr&quot;: The Abbreviation element">&lt;abbr&gt;</abbr>})
end
it "supports <s>" do
@ -366,6 +416,18 @@ RSpec.describe HtmlToMarkdown do
"<pre> function f() {\n console.log('Hello world!');\n }</pre>",
),
).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```")
html = <<~HTML
<pre data-code-wrap="plaintext"><code class="lang-plaintext">Reported-and-tested-by: A &lt;a@example.com&gt;
Reviewed-by: B &lt;b@example.com&gt;</code></pre>
HTML
md = <<~MD
```plaintext
Reported-and-tested-by: A <a@example.com>
Reviewed-by: B <b@example.com>
```
MD
expect(html_to_markdown(html)).to eq(md.strip)
end
it "supports <pre> inside <blockquote>" do