Some more HTML to Markdown fixes (#5046)

* FIX: handle spaces better within emphasis tags in html_to_markdown

* FIX: handle line breaks at beginning of emphasis tags in html_to_markdown
This commit is contained in:
Leo McArdle 2017-08-14 21:13:24 +01:00 committed by Régis Hanol
parent 43c0111ca1
commit 0ef7a969f2
2 changed files with 54 additions and 20 deletions

View File

@ -178,6 +178,7 @@ class HtmlToMarkdown
end
# Appends a line break to the markdown buffer on top of @stack for a <br> node.
# A <br> that has no previous sibling and whose parent is one of the EMPHASIS
# tags is skipped, so the emphasis delimiter is not followed by a newline.
# Returns nil when skipped, otherwise the buffer that was appended to.
def visit_br(node)
  opens_emphasis = node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name)
  @stack[-1].markdown << "\n" unless opens_emphasis
end
@ -185,29 +186,30 @@ class HtmlToMarkdown
@stack[-1].markdown << "\n\n---\n\n"
end
# NOTE(review): this span appears to be a diff hunk shown without +/- markers, so
# lines of the hand-written visit_strong / visit_em methods (and their aliases) seem
# to be interleaved with the lines of the EMPHASIS loop. It is not valid Ruby as
# shown; confirm against the real file before editing anything here.
def visit_strong(node)
return if node.text.blank?
delimiter = node.text["*"] ? "__" : "**"
# Tag names for which a visit_<tag> method is generated by the loop below.
# visit_br and visit_text also consult this list.
EMPHASIS ||= %w{b strong i em}
# One method per tag is defined by evaluating the heredoc with #{tag} interpolated.
# Per the template: "i"/"em" use a single delimiter character and the other tags use
# two; the delimiter character is "_" when the node text contains "*", otherwise "*".
EMPHASIS.each do |tag|
class_eval <<-RUBY
def visit_#{tag}(node)
return if node.text.empty?
return @stack[-1].markdown << " " if node.text.blank?
times = "#{tag}" == "i" || "#{tag}" == "em" ? 1 : 2
delimiter = (node.text["*"] ? "_" : "*") * times
@stack[-1].markdown << " " if node.text[0] == " "
@stack[-1].markdown << delimiter
traverse(node)
@stack[-1].markdown.chomp!
@stack[-1].markdown << delimiter
if @stack[-1].markdown[-1] == " "
@stack[-1].markdown.chomp!(" ")
append_space = true
end
alias :visit_b :visit_strong
def visit_em(node)
return if node.text.blank?
delimiter = node.text["*"] ? "_" : "*"
@stack[-1].markdown << delimiter
traverse(node)
@stack[-1].markdown.chomp!
@stack[-1].markdown << delimiter
@stack[-1].markdown << " " if append_space
end
RUBY
end
alias :visit_i :visit_em
# Appends a text node to the markdown buffer on top of @stack, squeezing every run
# of two or more whitespace characters into a single space.
# When the node has no previous element and its parent is one of the EMPHASIS tags,
# its leading whitespace is first removed from the node itself (node.content is
# rewritten in place), so the text starts right after the emphasis delimiter.
def visit_text(node)
  leads_emphasis = node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
  # The pattern is anchored with \A, so it can match at most once.
  node.content = node.content.sub(/\A[[:space:]]+/, "") if leads_emphasis
  @stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
end

View File

@ -231,11 +231,13 @@ describe HtmlToMarkdown do
context "with an oddly placed <br>" do
# A <br> at the very start or end of <strong> is dropped; one between text
# becomes a newline inside the bold span.
it "handles <strong>" do
  [
    ["<strong><br>Bold</strong>", "**Bold**"],
    ["<strong>Bold<br></strong>", "**Bold**"],
    ["<strong>Bold<br>text</strong>", "**Bold\ntext**"],
  ].each do |html, markdown|
    expect(html_to_markdown(html)).to eq(markdown)
  end
end
# A <br> at the very start or end of <em> is dropped; one between text
# becomes a newline inside the italic span.
it "handles <em>" do
  [
    ["<em><br>Italic</em>", "*Italic*"],
    ["<em>Italic<br></em>", "*Italic*"],
    ["<em>Italic<br>text</em>", "*Italic\ntext*"],
  ].each do |html, markdown|
    expect(html_to_markdown(html)).to eq(markdown)
  end
end
@ -247,11 +249,41 @@ describe HtmlToMarkdown do
# Empty or blank <strong> produces no delimiters; between words it leaves one space.
# NOTE(review): the last two rows are identical in this view. The second may have
# held several spaces before whitespace was collapsed; confirm against the real spec.
it "handles <strong>" do
  [
    ["<strong></strong>", ""],
    ["<strong> </strong>", ""],
    ["Some<strong> </strong>text", "Some text"],
    ["Some<strong> </strong>text", "Some text"],
  ].each do |html, markdown|
    expect(html_to_markdown(html)).to eq(markdown)
  end
end
# Empty or blank <em> produces no delimiters; between words it leaves one space.
# NOTE(review): the last two rows are identical in this view. The second may have
# held several spaces before whitespace was collapsed; confirm against the real spec.
it "handles <em>" do
  [
    ["<em></em>", ""],
    ["<em> </em>", ""],
    ["Some<em> </em>text", "Some text"],
    ["Some<em> </em>text", "Some text"],
  ].each do |html, markdown|
    expect(html_to_markdown(html)).to eq(markdown)
  end
end
end
# Spaces just inside an emphasis tag must end up outside the markdown delimiters.
# NOTE(review): every case appears twice in a row in this view. The second of each
# pair may have used several spaces before whitespace was collapsed; confirm
# against the real spec. The pairs are kept exactly as shown.
context "with spaces around text" do
  it "handles <strong>" do
    [
      ["<strong> Bold</strong>", "**Bold**"],
      ["<strong> Bold</strong>", "**Bold**"],
      ["<strong>Bold </strong>", "**Bold**"],
      ["<strong>Bold </strong>", "**Bold**"],
      ["Some<strong> bold</strong> text", "Some **bold** text"],
      ["Some<strong> bold</strong> text", "Some **bold** text"],
      ["Some <strong>bold </strong>text", "Some **bold** text"],
      ["Some <strong>bold </strong>text", "Some **bold** text"],
    ].each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end

  it "handles <em>" do
    [
      ["<em> Italic</em>", "*Italic*"],
      ["<em> Italic</em>", "*Italic*"],
      ["<em>Italic </em>", "*Italic*"],
      ["<em>Italic </em>", "*Italic*"],
      ["Some<em> italic</em> text", "Some *italic* text"],
      ["Some<em> italic</em> text", "Some *italic* text"],
      ["Some <em>italic </em>text", "Some *italic* text"],
      ["Some <em>italic </em>text", "Some *italic* text"],
    ].each do |html, markdown|
      expect(html_to_markdown(html)).to eq(markdown)
    end
  end
end