Some more HTML to Markdown fixes (#5046)
* FIX: handle spaces better within emphasis tags in html_to_markdown
* FIX: handle line breaks at beginning of emphasis tags in html_to_markdown
This commit is contained in:
parent
43c0111ca1
commit
0ef7a969f2
|
@ -178,6 +178,7 @@ class HtmlToMarkdown
|
||||||
end
|
end
|
||||||
|
|
||||||
def visit_br(node)
|
def visit_br(node)
|
||||||
|
return if node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name)
|
||||||
@stack[-1].markdown << "\n"
|
@stack[-1].markdown << "\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -185,29 +186,30 @@ class HtmlToMarkdown
|
||||||
@stack[-1].markdown << "\n\n---\n\n"
|
@stack[-1].markdown << "\n\n---\n\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
def visit_strong(node)
|
EMPHASIS ||= %w{b strong i em}
|
||||||
return if node.text.blank?
|
EMPHASIS.each do |tag|
|
||||||
delimiter = node.text["*"] ? "__" : "**"
|
class_eval <<-RUBY
|
||||||
|
def visit_#{tag}(node)
|
||||||
|
return if node.text.empty?
|
||||||
|
return @stack[-1].markdown << " " if node.text.blank?
|
||||||
|
times = "#{tag}" == "i" || "#{tag}" == "em" ? 1 : 2
|
||||||
|
delimiter = (node.text["*"] ? "_" : "*") * times
|
||||||
|
@stack[-1].markdown << " " if node.text[0] == " "
|
||||||
@stack[-1].markdown << delimiter
|
@stack[-1].markdown << delimiter
|
||||||
traverse(node)
|
traverse(node)
|
||||||
@stack[-1].markdown.chomp!
|
@stack[-1].markdown.chomp!
|
||||||
@stack[-1].markdown << delimiter
|
if @stack[-1].markdown[-1] == " "
|
||||||
|
@stack[-1].markdown.chomp!(" ")
|
||||||
|
append_space = true
|
||||||
end
|
end
|
||||||
|
|
||||||
alias :visit_b :visit_strong
|
|
||||||
|
|
||||||
def visit_em(node)
|
|
||||||
return if node.text.blank?
|
|
||||||
delimiter = node.text["*"] ? "_" : "*"
|
|
||||||
@stack[-1].markdown << delimiter
|
|
||||||
traverse(node)
|
|
||||||
@stack[-1].markdown.chomp!
|
|
||||||
@stack[-1].markdown << delimiter
|
@stack[-1].markdown << delimiter
|
||||||
|
@stack[-1].markdown << " " if append_space
|
||||||
|
end
|
||||||
|
RUBY
|
||||||
end
|
end
|
||||||
|
|
||||||
alias :visit_i :visit_em
|
|
||||||
|
|
||||||
def visit_text(node)
|
def visit_text(node)
|
||||||
|
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
|
||||||
@stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
|
@stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -231,11 +231,13 @@ describe HtmlToMarkdown do
|
||||||
context "with an oddly placed <br>" do
|
context "with an oddly placed <br>" do
|
||||||
|
|
||||||
it "handles <strong>" do
|
it "handles <strong>" do
|
||||||
|
expect(html_to_markdown("<strong><br>Bold</strong>")).to eq("**Bold**")
|
||||||
expect(html_to_markdown("<strong>Bold<br></strong>")).to eq("**Bold**")
|
expect(html_to_markdown("<strong>Bold<br></strong>")).to eq("**Bold**")
|
||||||
expect(html_to_markdown("<strong>Bold<br>text</strong>")).to eq("**Bold\ntext**")
|
expect(html_to_markdown("<strong>Bold<br>text</strong>")).to eq("**Bold\ntext**")
|
||||||
end
|
end
|
||||||
|
|
||||||
it "handles <em>" do
|
it "handles <em>" do
|
||||||
|
expect(html_to_markdown("<em><br>Italic</em>")).to eq("*Italic*")
|
||||||
expect(html_to_markdown("<em>Italic<br></em>")).to eq("*Italic*")
|
expect(html_to_markdown("<em>Italic<br></em>")).to eq("*Italic*")
|
||||||
expect(html_to_markdown("<em>Italic<br>text</em>")).to eq("*Italic\ntext*")
|
expect(html_to_markdown("<em>Italic<br>text</em>")).to eq("*Italic\ntext*")
|
||||||
end
|
end
|
||||||
|
@ -247,11 +249,41 @@ describe HtmlToMarkdown do
|
||||||
it "handles <strong>" do
|
it "handles <strong>" do
|
||||||
expect(html_to_markdown("<strong></strong>")).to eq("")
|
expect(html_to_markdown("<strong></strong>")).to eq("")
|
||||||
expect(html_to_markdown("<strong> </strong>")).to eq("")
|
expect(html_to_markdown("<strong> </strong>")).to eq("")
|
||||||
|
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
|
||||||
|
expect(html_to_markdown("Some<strong> </strong>text")).to eq("Some text")
|
||||||
end
|
end
|
||||||
|
|
||||||
it "handles <em>" do
|
it "handles <em>" do
|
||||||
expect(html_to_markdown("<em></em>")).to eq("")
|
expect(html_to_markdown("<em></em>")).to eq("")
|
||||||
expect(html_to_markdown("<em> </em>")).to eq("")
|
expect(html_to_markdown("<em> </em>")).to eq("")
|
||||||
|
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
|
||||||
|
expect(html_to_markdown("Some<em> </em>text")).to eq("Some text")
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
context "with spaces around text" do
|
||||||
|
|
||||||
|
it "handles <strong>" do
|
||||||
|
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
|
||||||
|
expect(html_to_markdown("<strong> Bold</strong>")).to eq("**Bold**")
|
||||||
|
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
|
||||||
|
expect(html_to_markdown("<strong>Bold </strong>")).to eq("**Bold**")
|
||||||
|
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
|
||||||
|
expect(html_to_markdown("Some<strong> bold</strong> text")).to eq("Some **bold** text")
|
||||||
|
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
|
||||||
|
expect(html_to_markdown("Some <strong>bold </strong>text")).to eq("Some **bold** text")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles <em>" do
|
||||||
|
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
|
||||||
|
expect(html_to_markdown("<em> Italic</em>")).to eq("*Italic*")
|
||||||
|
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
|
||||||
|
expect(html_to_markdown("<em>Italic </em>")).to eq("*Italic*")
|
||||||
|
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
|
||||||
|
expect(html_to_markdown("Some<em> italic</em> text")).to eq("Some *italic* text")
|
||||||
|
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
|
||||||
|
expect(html_to_markdown("Some <em>italic </em>text")).to eq("Some *italic* text")
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue