From 0ef7a969f2274285699080409b5901a2b6cd108d Mon Sep 17 00:00:00 2001
From: Leo McArdle
Date: Mon, 14 Aug 2017 21:13:24 +0100
Subject: [PATCH] Some more HTML to Markdown fixes (#5046)

* FIX: handle spaces better within emphasis tags in html_to_markdown

* FIX: handle line breaks at beginning of emphasis tags in html_to_markdown
---
 lib/html_to_markdown.rb                  | 42 +++++++++++++-----------
 spec/components/html_to_markdown_spec.rb | 32 ++++++++++++++++++
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb
index 6edf93555e3..d21179b708c 100644
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@@ -178,6 +178,7 @@ class HtmlToMarkdown
   end
 
   def visit_br(node)
+    return if node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name)
     @stack[-1].markdown << "\n"
   end
 
@@ -185,29 +186,30 @@ class HtmlToMarkdown
     @stack[-1].markdown << "\n\n---\n\n"
   end
 
-  def visit_strong(node)
-    return if node.text.blank?
-    delimiter = node.text["*"] ? "__" : "**"
-    @stack[-1].markdown << delimiter
-    traverse(node)
-    @stack[-1].markdown.chomp!
-    @stack[-1].markdown << delimiter
+  EMPHASIS ||= %w{b strong i em}
+  EMPHASIS.each do |tag|
+    class_eval <<-RUBY
+      def visit_#{tag}(node)
+        return if node.text.empty?
+        return @stack[-1].markdown << " " if node.text.blank?
+        times = "#{tag}" == "i" || "#{tag}" == "em" ? 1 : 2
+        delimiter = (node.text["*"] ? "_" : "*") * times
+        @stack[-1].markdown << " " if node.text[0] == " "
+        @stack[-1].markdown << delimiter
+        traverse(node)
+        @stack[-1].markdown.chomp!
+        if @stack[-1].markdown[-1] == " "
+          @stack[-1].markdown.chomp!(" ")
+          append_space = true
+        end
+        @stack[-1].markdown << delimiter
+        @stack[-1].markdown << " " if append_space
+      end
+    RUBY
   end
 
-  alias :visit_b :visit_strong
-
-  def visit_em(node)
-    return if node.text.blank?
-    delimiter = node.text["*"] ? "_" : "*"
-    @stack[-1].markdown << delimiter
-    traverse(node)
-    @stack[-1].markdown.chomp!
-    @stack[-1].markdown << delimiter
-  end
-
-  alias :visit_i :visit_em
-
   def visit_text(node)
+    node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
     @stack[-1].markdown << node.text.gsub(/\s{2,}/, " ")
   end
 
diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb
index 9b425035273..352547215dd 100644
--- a/spec/components/html_to_markdown_spec.rb
+++ b/spec/components/html_to_markdown_spec.rb
@@ -231,11 +231,13 @@ describe HtmlToMarkdown do
   context "with an oddly placed
" do it "handles " do + expect(html_to_markdown("
Bold
")).to eq("**Bold**") expect(html_to_markdown("Bold
")).to eq("**Bold**") expect(html_to_markdown("Bold
text
")).to eq("**Bold\ntext**") end it "handles " do + expect(html_to_markdown("
Italic
")).to eq("*Italic*") expect(html_to_markdown("Italic
")).to eq("*Italic*") expect(html_to_markdown("Italic
text
")).to eq("*Italic\ntext*") end @@ -247,11 +249,41 @@ describe HtmlToMarkdown do it "handles " do expect(html_to_markdown("")).to eq("") expect(html_to_markdown(" ")).to eq("") + expect(html_to_markdown("Some text")).to eq("Some text") + expect(html_to_markdown("Some text")).to eq("Some text") end it "handles " do expect(html_to_markdown("")).to eq("") expect(html_to_markdown(" ")).to eq("") + expect(html_to_markdown("Some text")).to eq("Some text") + expect(html_to_markdown("Some text")).to eq("Some text") + end + + end + + context "with spaces around text" do + + it "handles " do + expect(html_to_markdown(" Bold")).to eq("**Bold**") + expect(html_to_markdown(" Bold")).to eq("**Bold**") + expect(html_to_markdown("Bold ")).to eq("**Bold**") + expect(html_to_markdown("Bold ")).to eq("**Bold**") + expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") + expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") + expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") + expect(html_to_markdown("Some bold text")).to eq("Some **bold** text") + end + + it "handles " do + expect(html_to_markdown(" Italic")).to eq("*Italic*") + expect(html_to_markdown(" Italic")).to eq("*Italic*") + expect(html_to_markdown("Italic ")).to eq("*Italic*") + expect(html_to_markdown("Italic ")).to eq("*Italic*") + expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") + expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") + expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") + expect(html_to_markdown("Some italic text")).to eq("Some *italic* text") end end