Add 'keep_cid_imgs' option to HTML to Markdown converter to improve incoming email parsing
This commit is contained in:
parent
9d7917f79d
commit
768c63c103
|
@ -136,7 +136,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
def visit_img(node)
|
||||
if is_valid_url?(node["src"]) && is_visible_img?(node)
|
||||
if is_valid_src?(node["src"]) && is_visible_img?(node)
|
||||
if @opts[:keep_img_tags]
|
||||
@stack[-1].markdown << node.to_html
|
||||
else
|
||||
|
@ -147,7 +147,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
def visit_a(node)
|
||||
if is_valid_url?(node["href"])
|
||||
if is_valid_href?(node["href"])
|
||||
@stack[-1].markdown << "["
|
||||
traverse(node)
|
||||
@stack[-1].markdown << "](#{node["href"]})"
|
||||
|
@ -206,14 +206,20 @@ class HtmlToMarkdown
|
|||
(lines + [""]).join("\n")
|
||||
end
|
||||
|
||||
def is_valid_url?(url)
|
||||
url.present? && (url.start_with?("http") || url.start_with?("www."))
|
||||
# Decides whether an <a> href is kept as a Markdown link target.
# Returns true only when +href+ is non-blank and begins with "http" or "www.".
# NOTE(review): the "http" test is a plain prefix match, so "https://..." passes,
# but so does any other string that merely starts with "http".
# NOTE(review): `present?` is not core Ruby — presumably ActiveSupport; confirm it is loaded.
def is_valid_href?(href)
|
||||
href.present? && (href.start_with?("http") || href.start_with?("www."))
|
||||
end
|
||||
|
||||
# Decides whether an <img> src is acceptable for conversion.
# - blank src            => false
# - src starts with "cid:" and @opts[:keep_cid_imgs] is truthy => true
# - otherwise            => true only if src starts with "http" or "www."
# The "cid:" branch is what the 'keep_cid_imgs' option enables; per the commit
# title it exists to improve incoming email parsing (presumably "cid:" sources
# refer to inline email attachments — confirm against the email receiver).
def is_valid_src?(src)
|
||||
# Guard first so the start_with? calls below never run on nil/empty input.
return false if src.blank?
|
||||
# Opt-in only: without keep_cid_imgs a "cid:" src falls through and is rejected.
return true if @opts[:keep_cid_imgs] && src.start_with?("cid:")
|
||||
src.start_with?("http") || src.start_with?("www.")
|
||||
end
|
||||
|
||||
# Returns false for images that are explicitly sized to zero, true otherwise.
# An image counts as hidden when:
#   - its "width" attribute is present and converts to 0 via to_i,
#   - its "height" attribute is present and converts to 0 via to_i, or
#   - its "style" attribute matches /(width|height)\s*:\s*0/.
# NOTE(review): the width and style guards each appear twice below; these look
# like the removed/added sides of a diff rather than intentional repetition.
# NOTE(review): assuming the attribute values are Strings, to_i yields 0 for
# non-numeric text (e.g. "auto"), which would also be treated as hidden — confirm.
# NOTE(review): the style regex is unanchored, so values that merely begin with
# 0 (e.g. "width: 0.5em") and properties ending in width/height also match.
def is_visible_img?(img)
|
||||
return false if img["width"].present? && img["width"].to_i == 0
|
||||
return false if img["width"].present? && img["width"].to_i == 0
|
||||
return false if img["height"].present? && img["height"].to_i == 0
|
||||
return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/]
|
||||
return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/]
|
||||
true
|
||||
end
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@ require 'html_to_markdown'
|
|||
|
||||
describe HtmlToMarkdown do
|
||||
|
||||
def html_to_markdown(html)
|
||||
HtmlToMarkdown.new(html).to_markdown
|
||||
# Spec helper: converts +html+ with HtmlToMarkdown and returns the Markdown.
# +opts+ is passed straight through to HtmlToMarkdown.new and defaults to {},
# so existing one-argument calls keep working.
def html_to_markdown(html, opts={})
|
||||
HtmlToMarkdown.new(html, opts).to_markdown
|
||||
end
|
||||
|
||||
it "remove whitespaces" do
|
||||
|
@ -55,14 +55,15 @@ describe HtmlToMarkdown do
|
|||
expect(html_to_markdown(%Q{<a href="foo.bar">Discourse</a>})).to eq("Discourse")
|
||||
end
|
||||
|
||||
# Fixtures shared by the <img> examples: one image with an https src and one
# with a "cid:" src.
# NOTE(review): HTML_WITH_IMG appears twice; this looks like the removed/added
# sides of a diff (likely a whitespace-only realignment), not a real redefinition.
# NOTE(review): `||=` presumably avoids "already initialized constant" warnings
# when the spec file is loaded more than once — confirm.
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
|
||||
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
|
||||
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" alt="Discourse Logo">}
|
||||
|
||||
# With no options, an <img> with an https src becomes Markdown image syntax:
# the alt text goes in the brackets and the src in the parentheses.
it "converts <img>" do
|
||||
expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
|
||||
end
|
||||
|
||||
# With keep_img_tags: true the original <img> HTML is emitted unchanged instead
# of being converted to Markdown image syntax.
# NOTE(review): the two expectations assert the same thing; they look like the
# removed side (direct HtmlToMarkdown.new call) and the added side (html_to_markdown
# helper with opts) of a diff.
it "keeps <img> with 'keep_img_tags'" do
|
||||
expect(HtmlToMarkdown.new(HTML_WITH_IMG, keep_img_tags: true).to_markdown).to eq(HTML_WITH_IMG)
|
||||
expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
|
||||
end
|
||||
|
||||
it "removes empty & invalid <img>" do
|
||||
|
@ -71,6 +72,11 @@ describe HtmlToMarkdown do
|
|||
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
|
||||
end
|
||||
|
||||
# Covers the keep_cid_imgs option for images whose src starts with "cid:".
# This example only exercises the option being enabled; it does not assert what
# happens to a "cid:" image when keep_cid_imgs is absent.
it "keeps <img> with src='cid:' whith 'keep_cid_imgs'" do
|
||||
# keep_cid_imgs alone: the image is converted to Markdown image syntax.
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)")
|
||||
# Combined with keep_img_tags: the raw <img> tag is preserved verbatim.
expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("<img src=\"cid:ii_1525434659ddb4cb\" alt=\"Discourse Logo\">")
|
||||
end
|
||||
|
||||
it "skips hidden <img>" do
|
||||
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
|
||||
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")
|
||||
|
|
Loading…
Reference in New Issue