DEV: Add `additional_allowed_tags` to `HtmlToMarkdown`

Import scripts often use subclasses of `HtmlToMarkdown` and might need to allow additional tags that can be used within the custom class.
This commit is contained in:
Gerhard Schlager 2024-06-09 12:51:23 +02:00 committed by Gerhard Schlager
parent 1a42249bd8
commit b8f2cbf41c
2 changed files with 38 additions and 1 deletions

View File

@ -26,7 +26,7 @@ class HtmlToMarkdown
end
def remove_not_allowed!(doc)
allowed = Set.new
allowed = Set.new(@opts[:additional_allowed_tags] || [])
HtmlToMarkdown.private_instance_methods.each do |m|
if tag = m.to_s[/^visit_(.+)/, 1]

View File

@ -65,6 +65,43 @@ RSpec.describe HtmlToMarkdown do
expect(html_to_markdown(html)).to eq(markdown.strip)
end
it "removes tags that aren't allowed" do
  # The <custom> element (including its nested <span>) is expected to vanish
  # from the output, while the <div> content is kept.
  input = <<~HTML
    <custom>Text withing custom <span>tag</span></custom>
    <div>Text within allowed tag</div>
  HTML

  converted = html_to_markdown(input)

  expect(converted).to eq("Text within allowed tag")
end
it "allows additional tags that can be consumed by subclasses" do
  # Use an anonymous subclass held in a local variable. A `class Name < Base`
  # statement inside an example block would define a top-level constant that
  # stays defined for every other spec in the suite.
  extended_html_to_markdown =
    Class.new(HtmlToMarkdown) do
      # Yields @doc to the caller's block before running the regular conversion.
      def to_markdown
        yield @doc
        super
      end
    end

  html = <<~HTML
    <custom-image image-id="42">Image text</custom-image>
    <div>Text within allowed tag</div>
  HTML

  # Without the option, no <custom-image> element is left in the document.
  md =
    extended_html_to_markdown
      .new(html)
      .to_markdown { |doc| expect(doc.css("custom-image")).to be_empty }
  expect(md).to eq("Text within allowed tag")

  # With the tag explicitly allowed, the block can still find the element and
  # replace it with text derived from its attribute.
  md =
    extended_html_to_markdown
      .new(html, { additional_allowed_tags: ["custom-image"] })
      .to_markdown do |doc|
        doc.css("custom-image").each { |img| img.replace("Image #{img["image-id"]}") }
      end
  expect(md).to eq("Image 42\nText within allowed tag")
end
it "doesn't error on non-inline elements like (aside, section)" do
html = <<~HTML
<aside class="quote no-group">