Add 'keep_cid_imgs' option to HTML to Markdown converter to improve incoming email parsing

This commit is contained in:
Régis Hanol 2017-05-03 22:53:47 +02:00
parent 9d7917f79d
commit 768c63c103
2 changed files with 22 additions and 10 deletions

View File

@ -136,7 +136,7 @@ class HtmlToMarkdown
end
def visit_img(node)
if is_valid_url?(node["src"]) && is_visible_img?(node)
if is_valid_src?(node["src"]) && is_visible_img?(node)
if @opts[:keep_img_tags]
@stack[-1].markdown << node.to_html
else
@ -147,7 +147,7 @@ class HtmlToMarkdown
end
def visit_a(node)
if is_valid_url?(node["href"])
if is_valid_href?(node["href"])
@stack[-1].markdown << "["
traverse(node)
@stack[-1].markdown << "](#{node["href"]})"
@ -206,8 +206,14 @@ class HtmlToMarkdown
(lines + [""]).join("\n")
end
def is_valid_url?(url)
url.present? && (url.start_with?("http") || url.start_with?("www."))
# Decides whether a link target is worth converting to a Markdown link.
# Returns true only when +href+ is present and begins with "http" or "www.";
# returns false for nil, blank, or any other kind of value.
def is_valid_href?(href)
  return false unless href.present?

  href.start_with?("http", "www.")
end
# Decides whether an <img> source should be kept by the converter.
#
# - A blank +src+ is never valid.
# - A +src+ starting with "cid:" is valid only when the :keep_cid_imgs
#   option is set in @opts.
# - Otherwise +src+ must begin with "http" or "www.".
def is_valid_src?(src)
  if src.blank?
    false
  elsif @opts[:keep_cid_imgs] && src.start_with?("cid:")
    true
  else
    src.start_with?("http", "www.")
  end
end
def is_visible_img?(img)

View File

@ -3,8 +3,8 @@ require 'html_to_markdown'
describe HtmlToMarkdown do
def html_to_markdown(html)
HtmlToMarkdown.new(html).to_markdown
# Spec helper: builds an HtmlToMarkdown converter for +html+, forwarding
# +opts+ (empty by default) to it, and returns the result of #to_markdown.
def html_to_markdown(html, opts = {})
  converter = HtmlToMarkdown.new(html, opts)
  converter.to_markdown
end
it "remove whitespaces" do
@ -56,13 +56,14 @@ describe HtmlToMarkdown do
end
HTML_WITH_IMG ||= %Q{<img src="https://www.discourse.org/logo.svg" alt="Discourse Logo">}
HTML_WITH_CID_IMG ||= %Q{<img src="cid:ii_1525434659ddb4cb" alt="Discourse Logo">}
it "converts <img>" do
expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)")
end
it "keeps <img> with 'keep_img_tags'" do
expect(HtmlToMarkdown.new(HTML_WITH_IMG, keep_img_tags: true).to_markdown).to eq(HTML_WITH_IMG)
expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG)
end
it "removes empty & invalid <img>" do
@ -71,6 +72,11 @@ describe HtmlToMarkdown do
expect(html_to_markdown(%Q{<img src="foo.bar">})).to eq("")
end
# With :keep_cid_imgs set, an <img> whose src starts with "cid:" is kept:
# as Markdown image syntax by default, or as the original tag when
# :keep_img_tags is also set.
it "keeps <img> with src='cid:' with 'keep_cid_imgs'" do
  expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)")
  expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("<img src=\"cid:ii_1525434659ddb4cb\" alt=\"Discourse Logo\">")
end
it "skips hidden <img>" do
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" width=0>})).to eq("")
expect(html_to_markdown(%Q{<img src="https://www.discourse.org/logo.svg" height="0">})).to eq("")