Some more HTML to Markdown fixes (#5046)
* FIX: handle spaces better within emphasis tags in html_to_markdown
* FIX: handle line breaks at beginning of emphasis tags in html_to_markdown
This commit is contained in:
parent
43c0111ca1
commit
0ef7a969f2
|
@ -178,6 +178,7 @@ class HtmlToMarkdown
|
|||
end
|
||||
|
||||
# Renders a <br> as a newline in the current stack frame's markdown.
#
# A <br> that is the very first child of an emphasis tag (one of EMPHASIS)
# is skipped, so the opening delimiter is not followed by a line break.
def visit_br(node)
  opens_emphasis = node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name)
  @stack[-1].markdown << "\n" unless opens_emphasis
end
|
||||
|
||||
|
@ -185,29 +186,30 @@ class HtmlToMarkdown
|
|||
@stack[-1].markdown << "\n\n---\n\n"
|
||||
end
|
||||
|
||||
def visit_strong(node)
|
||||
return if node.text.blank?
|
||||
delimiter = node.text["*"] ? "__" : "**"
|
||||
@stack[-1].markdown << delimiter
|
||||
traverse(node)
|
||||
@stack[-1].markdown.chomp!
|
||||
@stack[-1].markdown << delimiter
|
||||
# Tag names rendered as Markdown emphasis. Also consulted by visit_br and
# visit_text to special-case content at the start of an emphasis tag.
EMPHASIS ||= %w{b strong i em}.freeze

# Generates one visit_<tag> method per emphasis tag.
#
# Each generated method:
#   * emits nothing for an element with no text at all;
#   * emits a single space for an element whose text is only blank;
#   * otherwise wraps the traversed children in a delimiter, using "_"
#     instead of "*" when the text itself contains "*", and moving one
#     leading / trailing space from inside the tag to outside the delimiters.
#
# NOTE(review): only a literal " " is relocated; other whitespace (tab,
# newline) at the edges is not -- confirm that is intended, given that
# visit_text strips any [[:space:]] at the start of an emphasis tag.
EMPHASIS.each do |tag|
  # <i>/<em> use a single delimiter character, <b>/<strong> use two.
  # Known at definition time, so it is computed once per tag here.
  times = %w{i em}.include?(tag) ? 1 : 2

  # File and line are passed so backtraces from the generated methods point
  # at this source instead of "(eval)".
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
    def visit_#{tag}(node)
      return if node.text.empty?
      return @stack[-1].markdown << " " if node.text.blank?
      delimiter = (node.text["*"] ? "_" : "*") * #{times}
      @stack[-1].markdown << " " if node.text[0] == " "
      @stack[-1].markdown << delimiter
      traverse(node)
      # drop a trailing line break left by the children
      @stack[-1].markdown.chomp!
      # append_space stays nil unless a trailing space was moved outside
      if @stack[-1].markdown[-1] == " "
        @stack[-1].markdown.chomp!(" ")
        append_space = true
      end
      @stack[-1].markdown << delimiter
      @stack[-1].markdown << " " if append_space
    end
  RUBY
end
|
||||
|
||||
alias :visit_b :visit_strong
|
||||
|
||||
# Renders <em> (and, via the alias below, <i>) as single-delimiter emphasis.
# Blank elements produce no output. "_" is used as the delimiter when the
# element's text contains "*", otherwise "*".
#
# NOTE(review): this file also generates visit_em / visit_i from the
# EMPHASIS list; confirm which definition is meant to be live.
def visit_em(node)
  inner = node.text
  return if inner.blank?

  marker = inner["*"] ? "_" : "*"
  @stack[-1].markdown << marker
  traverse(node)
  # remove a trailing line break before closing the emphasis
  @stack[-1].markdown.chomp!
  @stack[-1].markdown << marker
end

alias :visit_i :visit_em
|
||||
|
||||
# Appends a text node to the current stack frame's markdown.
#
# When the node sits inside an emphasis tag (one of EMPHASIS) and has no
# preceding element sibling, its leading whitespace is removed from the node
# itself first. Runs of two or more whitespace characters are then collapsed
# to a single space in the emitted text.
def visit_text(node)
  starts_emphasis = node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
  node.content = node.content.sub(/\A[[:space:]]+/, "") if starts_emphasis
  @stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
end
|
||||
|
||||
|
|
|
@ -231,11 +231,13 @@ describe HtmlToMarkdown do
|
|||
context "with an oddly placed <br>" do
|
||||
|
||||
# A <br> at the very start or very end of <strong> is dropped from the output;
# a <br> between text is kept as a newline inside the delimiters.
it "handles <strong>" do
|
||||
expect(html_to_markdown("<strong><br>Bold</strong>")).to eq("**Bold**")
|
||||
expect(html_to_markdown("<strong>Bold<br></strong>")).to eq("**Bold**")
|
||||
expect(html_to_markdown("<strong>Bold<br>text</strong>")).to eq("**Bold\ntext**")
|
||||
end
|
||||
|
||||
# A <br> at the very start or very end of <em> is dropped from the output;
# a <br> between text is kept as a newline inside the delimiters.
it "handles <em>" do
|
||||
expect(html_to_markdown("<em><br>Italic</em>")).to eq("*Italic*")
|
||||
expect(html_to_markdown("<em>Italic<br></em>")).to eq("*Italic*")
|
||||
expect(html_to_markdown("<em>Italic<br>text</em>")).to eq("*Italic\ntext*")
|
||||
end
|
||||
|
@ -247,11 +249,41 @@ describe HtmlToMarkdown do
|
|||
# Empty or whitespace-only <strong> yields no delimiters; between two words
# it leaves a single separating space.
it "handles <strong>" do
|
||||
expect(html_to_markdown("<strong></strong>")).to eq("")
|
||||
expect(html_to_markdown("<strong> </strong>")).to eq("")
|
||||
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
|
||||
# NOTE(review): identical to the expectation above as shown here; the two
# inputs may originally have differed in whitespace -- confirm.
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
|
||||
end
|
||||
|
||||
# Empty or whitespace-only <em> yields no delimiters; between two words it
# leaves a single separating space.
it "handles <em>" do
|
||||
expect(html_to_markdown("<em></em>")).to eq("")
|
||||
expect(html_to_markdown("<em> </em>")).to eq("")
|
||||
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
|
||||
# NOTE(review): identical to the expectation above as shown here; the two
# inputs may originally have differed in whitespace -- confirm.
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
# Spaces just inside an emphasis tag must end up outside the Markdown
# delimiters (or be dropped at the edges of the output), never between the
# delimiter and the text.
# NOTE(review): several adjacent expectations below are identical as shown
# here; the inputs may originally have differed in the amount of whitespace
# -- confirm against the repository.
context "with spaces around text" do
|
||||
|
||||
it "handles <strong>" do
|
||||
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
|
||||
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
|
||||
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
|
||||
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
|
||||
# leading space inside the tag becomes a space before the opening delimiter
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
|
||||
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
|
||||
# trailing space inside the tag becomes a space after the closing delimiter
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
|
||||
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
|
||||
end
|
||||
|
||||
it "handles <em>" do
|
||||
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
|
||||
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
|
||||
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
|
||||
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
|
||||
# leading space inside the tag becomes a space before the opening delimiter
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
|
||||
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
|
||||
# trailing space inside the tag becomes a space after the closing delimiter
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
|
||||
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
|
||||
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue