From b8f2cbf41c2007fcec4676e53e6a4e95590126f6 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Sun, 9 Jun 2024 12:51:23 +0200 Subject: [PATCH] DEV: Add `additional_allowed_tags` to `HtmlToMarkdown` Import scripts often use subclasses of `HtmlToMarkdown` and might need to allow additional tags that can be used within the custom class. --- lib/html_to_markdown.rb | 2 +- spec/lib/html_to_markdown_spec.rb | 37 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index d302f6f2380..3052cedd318 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -26,7 +26,7 @@ class HtmlToMarkdown end def remove_not_allowed!(doc) - allowed = Set.new + allowed = Set.new(@opts[:additional_allowed_tags] || []) HtmlToMarkdown.private_instance_methods.each do |m| if tag = m.to_s[/^visit_(.+)/, 1] diff --git a/spec/lib/html_to_markdown_spec.rb b/spec/lib/html_to_markdown_spec.rb index 34b2cd30869..821d2ad1f15 100644 --- a/spec/lib/html_to_markdown_spec.rb +++ b/spec/lib/html_to_markdown_spec.rb @@ -65,6 +65,43 @@ RSpec.describe HtmlToMarkdown do expect(html_to_markdown(html)).to eq(markdown.strip) end + it "removes tags that aren't allowed" do + html = <<~HTML + Text withing custom tag +
Text within allowed tag
+ HTML + + expect(html_to_markdown(html)).to eq("Text within allowed tag") + end + + it "allows additional tags that can be consumed by subclasses" do + class ExtendedHtmlToMarkdown < HtmlToMarkdown + def to_markdown + yield @doc + super + end + end + + html = <<~HTML + Image text +
Text within allowed tag
+ HTML + + md = + ExtendedHtmlToMarkdown + .new(html) + .to_markdown { |doc| expect(doc.css("custom-image")).to be_empty } + expect(md).to eq("Text within allowed tag") + + md = + ExtendedHtmlToMarkdown + .new(html, { additional_allowed_tags: ["custom-image"] }) + .to_markdown do |doc| + doc.css("custom-image").each { |img| img.replace("Image #{img["image-id"]}") } + end + expect(md).to eq("Image 42\nText within allowed tag") + end + it "doesn't error on non-inline elements like (aside, section)" do html = <<~HTML