FIX: properly trim whitespaces (including those pesky &nbsp; html entities)

This commit is contained in:
Régis Hanol 2017-05-03 18:04:31 +02:00
parent dbb6e461aa
commit c880af8120
2 changed files with 18 additions and 4 deletions

View File

@ -15,10 +15,10 @@ class HtmlToMarkdown
def remove_whitespaces! def remove_whitespaces!
@doc.traverse do |node| @doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text if node.is_a? Nokogiri::XML::Text
node.content = node.content.lstrip if node.previous_element&.description&.block? node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block?
node.content = node.content.lstrip if node.previous_element.nil? && node.parent.description&.block? node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block?
node.content = node.content.rstrip if node.next_element&.description&.block? node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block?
node.content = node.content.rstrip if node.next_element.nil? && node.parent.description&.block? node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element.nil? && node.parent.description&.block?
node.remove if node.content.empty? node.remove if node.content.empty?
end end
end end

View File

@ -7,6 +7,20 @@ describe HtmlToMarkdown do
HtmlToMarkdown.new(html).to_markdown HtmlToMarkdown.new(html).to_markdown
end end
# Checks that whitespace at the edges of the text inside <div> elements is
# dropped from the generated Markdown. The fixture mixes ordinary spaces and
# newlines with &nbsp; entities, both before and after the text.
it "remove whitespaces" do
# The expected string has no leading or trailing whitespace on any paragraph;
# paragraphs are separated by a single blank line.
expect(html_to_markdown(<<-HTML
<div dir="auto">Hello,
<div dir="auto"><br></div>
<div dir="auto">&nbsp; &nbsp; This is the 1st paragraph.&nbsp; &nbsp; </div>
<div dir="auto"><br></div>
<div dir="auto">
&nbsp; &nbsp; &nbsp; &nbsp; This is another paragraph
</div>
</div>
HTML
)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph")
end
it "converts <strong>" do it "converts <strong>" do
expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**") expect(html_to_markdown("<strong>Strong</strong>")).to eq("**Strong**")
expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__") expect(html_to_markdown("<strong>Str*ng</strong>")).to eq("__Str*ng__")