DEV: Add `additional_allowed_tags` to `HtmlToMarkdown`
Import scripts often use subclasses of `HtmlToMarkdown` and might need to allow additional tags that can be used within the custom class.
This commit is contained in:
parent
1a42249bd8
commit
b8f2cbf41c
|
@ -26,7 +26,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
def remove_not_allowed!(doc)
|
||||
allowed = Set.new
|
||||
allowed = Set.new(@opts[:additional_allowed_tags] || [])
|
||||
|
||||
HtmlToMarkdown.private_instance_methods.each do |m|
|
||||
if tag = m.to_s[/^visit_(.+)/, 1]
|
||||
|
|
|
@ -65,6 +65,43 @@ RSpec.describe HtmlToMarkdown do
|
|||
expect(html_to_markdown(html)).to eq(markdown.strip)
|
||||
end
|
||||
|
||||
it "removes tags that aren't allowed" do
|
||||
html = <<~HTML
|
||||
<custom>Text withing custom <span>tag</span></custom>
|
||||
<div>Text within allowed tag</div>
|
||||
HTML
|
||||
|
||||
expect(html_to_markdown(html)).to eq("Text within allowed tag")
|
||||
end
|
||||
|
||||
it "allows additional tags that can be consumed by subclasses" do
|
||||
class ExtendedHtmlToMarkdown < HtmlToMarkdown
|
||||
def to_markdown
|
||||
yield @doc
|
||||
super
|
||||
end
|
||||
end
|
||||
|
||||
html = <<~HTML
|
||||
<custom-image image-id="42">Image text</custom-image>
|
||||
<div>Text within allowed tag</div>
|
||||
HTML
|
||||
|
||||
md =
|
||||
ExtendedHtmlToMarkdown
|
||||
.new(html)
|
||||
.to_markdown { |doc| expect(doc.css("custom-image")).to be_empty }
|
||||
expect(md).to eq("Text within allowed tag")
|
||||
|
||||
md =
|
||||
ExtendedHtmlToMarkdown
|
||||
.new(html, { additional_allowed_tags: ["custom-image"] })
|
||||
.to_markdown do |doc|
|
||||
doc.css("custom-image").each { |img| img.replace("Image #{img["image-id"]}") }
|
||||
end
|
||||
expect(md).to eq("Image 42\nText within allowed tag")
|
||||
end
|
||||
|
||||
it "doesn't error on non-inline elements like (aside, section)" do
|
||||
html = <<~HTML
|
||||
<aside class="quote no-group">
|
||||
|
|
Loading…
Reference in New Issue