diff --git a/Gemfile b/Gemfile index 39b714d5c77..a0f43d3313f 100644 --- a/Gemfile +++ b/Gemfile @@ -118,7 +118,6 @@ gem 'rake' gem 'thor', require: false gem 'diffy', require: false gem 'rinku' -gem 'sanitize' gem 'sidekiq' gem 'mini_scheduler' diff --git a/Gemfile.lock b/Gemfile.lock index b059691d9d4..b4284976574 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -533,7 +533,6 @@ DEPENDENCIES ruby-prof ruby-readability rubyzip - sanitize sassc (= 2.0.1) sassc-rails seed-fu diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index a3eb5e8aaeb..448a0cd7257 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -1,267 +1,365 @@ # frozen_string_literal: true -require "nokogiri" +require "nokogumbo" +require "securerandom" class HtmlToMarkdown - class Block < Struct.new(:name, :head, :body, :opened, :markdown) - def initialize(name, head = "", body = "", opened = false, markdown = +"") - super - end - end - def initialize(html, opts = {}) - @opts = opts || {} - @doc = fix_span_elements(Nokogiri::HTML(html)) + @opts = opts - remove_whitespaces! - end + # we're only interested in + @doc = Nokogiri::HTML5(html).at("body") - # If a `
` is within a `` that's invalid, so let's hoist the `
` up - INLINE_ELEMENTS ||= %w{span font} - BLOCK_ELEMENTS ||= %w{div p} - def fix_span_elements(node) - if (INLINE_ELEMENTS.include?(node.name) && BLOCK_ELEMENTS.any? { |e| node.at(e) }) - node.swap(node.children) - end - - node.children.each { |c| fix_span_elements(c) } - node - end - - def remove_whitespaces! - @doc.traverse do |node| - if node.is_a?(Nokogiri::XML::Text) && node.parent.name != "pre" - node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block? - node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block? - node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block? - node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element.nil? && node.parent.description&.block? - node.content = node.content.gsub(/\r\n?/, "\n") - node.remove if node.content.empty? - end - end + remove_not_allowed!(@doc) + remove_hidden!(@doc) + hoist_line_breaks!(@doc) + remove_whitespaces!(@doc) end def to_markdown - @stack = [Block.new("root")] - @markdown = +"" traverse(@doc) - @markdown << format_block - @markdown.gsub!(/\n{3,}/, "\n\n") - @markdown.strip! - @markdown + .gsub(/\n{2,}/, "\n\n") + .strip + end + + private + + def remove_not_allowed!(doc) + allowed = Set.new + + HtmlToMarkdown.private_instance_methods.each do |m| + if tag = m.to_s[/^visit_(.+)/, 1] + allowed << tag + end + end + + @doc.traverse { |node| node.remove if !allowed.include?(node.name) } + end + + HIDDEN_STYLES ||= /(display\s*:\s*none)|(visibility\s*:\s*hidden)|(opacity\s*:\s*0)|(transform\s*:\s*scale\(0\))|((width|height)\s*:\s*0)/i + + def remove_hidden!(doc) + @doc.css("[hidden]").remove + @doc.css("[style]").each { |n| n.remove if n["style"][HIDDEN_STYLES] } + @doc.css("img[width]").each { |n| n.remove if n["width"].to_i <= 0 } + @doc.css("img[height]").each { |n| n.remove if n["height"].to_i <= 0 } + end + + # When there's a
inside an inline element, split the inline element around the
+ def hoist_line_breaks!(doc) + klass = "_" + SecureRandom.hex + doc.css("br").each { |br| br.add_class(klass) } + + loop do + changed = false + + doc.css("br.#{klass}").each do |br| + parent = br.parent + + if parent.description.block? + br.remove_class(klass) + else + before, after = parent.children.slice_when { |n| n == br }.to_a + + if before.size > 1 + b = Nokogiri::XML::Node.new(parent.name, doc) + before[0...-1].each { |c| b.add_child(c) } + parent.previous = b if b.inner_html.present? + end + + if after.present? + a = Nokogiri::XML::Node.new(parent.name, doc) + after.each { |c| a.add_child(c) } + parent.next = a if a.inner_html.present? + end + + parent.replace(br) + + changed = true + end + end + + break if !changed + end + end + + # Removes most of the unnecessary white spaces for better markdown conversion + # Loosely based on the CSS' White Space Processing Rules (https://www.w3.org/TR/css-text-3/#white-space-rules) + def remove_whitespaces!(node) + return true if "pre" == node.name + + node.children.chunk { |n| is_inline?(n) }.each do |inline, nodes| + if inline + collapse_spaces!(nodes) && remove_trailing_space!(nodes) + else + nodes.each { |n| remove_whitespaces!(n) } + end + end + end + + def is_inline?(node) + node.text? || ("br" != node.name && node.description.inline? && node.children.all? { |n| is_inline?(n) }) + end + + def collapse_spaces!(nodes, was_space = true) + nodes.each do |node| + if node.text? + text = String.new + + node.text.chars.each do |c| + if c[/[[:space:]]/] + text << " " if !was_space + was_space = true + else + text << c + was_space = false + end + end + + node.content = text + else + node.children.each { |n| was_space = collapse_spaces!([n], was_space) } + end + end + + was_space + end + + def remove_trailing_space!(nodes) + last = nodes[-1] + + if last.text? + last.content = last.content[0...-1] if last.content[-1] == " " + elsif last.children.present? + remove_trailing_space!(last.children) + end end def traverse(node) - node.children.each { |n| visit(n) } + node.children.map { |n| visit(n) }.join end def visit(node) - return if node["style"] && node["style"][/display\s*:\s*none/] - - if node.description&.block? && node.parent&.description&.block? && @stack[-1].markdown.size > 0 - block = @stack[-1].dup - @markdown << format_block - block.markdown = +"" - block.opened = true - @stack << block - end - visitor = "visit_#{node.name}" - respond_to?(visitor) ? send(visitor, node) : traverse(node) + send(visitor, node) if respond_to?(visitor, true) end - BLACKLISTED ||= %w{button datalist fieldset form input label legend meter optgroup option output progress select textarea style script} - BLACKLISTED.each do |tag| - class_eval <<-RUBY - def visit_#{tag}(node) - "" - end - RUBY - end + ALLOWED_IMG_SRCS ||= %w{http:// https:// www.} - def visit_pre(node) - code = node.children.find { |c| c.name == "code" } - code_class = code ? code["class"] : "" - lang = code_class ? code_class[/lang-(\w+)/, 1] : "" - pre = Block.new("pre") - pre.markdown = +"```#{lang}\n" - @stack << pre - traverse(node) - pre.markdown << "\n```\n" - @markdown << format_block - end - - def visit_blockquote(node) - @stack << Block.new("blockquote", "> ", "> ") - traverse(node) - @markdown << format_block - end - - BLOCK_WITH_NEWLINE ||= %w{div p} - BLOCK_WITH_NEWLINE.each do |tag| - class_eval <<-RUBY - def visit_#{tag}(node) - @stack << Block.new("#{tag}") - traverse(node) - @markdown << format_block - @markdown << "\n" - end - RUBY - end - - BLOCK_LIST ||= %w{menu ol ul} - BLOCK_LIST.each do |tag| - class_eval <<-RUBY - def visit_#{tag}(node) - @stack << Block.new("#{tag}") - traverse(node) - @markdown << format_block - end - RUBY - end - - def visit_li(node) - parent = @stack.reverse.find { |n| n.name[/ul|ol|menu/] } - prefix = parent&.name == "ol" ? "1. " : "- " - @stack << Block.new("li", prefix, " ") - traverse(node) - @markdown << format_block - end - - (1..6).each do |n| - class_eval <<-RUBY - def visit_h#{n}(node) - @stack << Block.new("h#{n}", "#" * #{n} + " ") - traverse(node) - @markdown << format_block - end - RUBY - end - - WHITELISTED ||= %w{del ins kbd s small strike sub sup} - WHITELISTED.each do |tag| - class_eval <<-RUBY - def visit_#{tag}(node) - @stack[-1].markdown << "<#{tag}>" - traverse(node) - @stack[-1].markdown << "" - end - RUBY - end - - def visit_abbr(node) - @stack[-1].markdown << (node["title"].present? ? %Q[] : "") - traverse(node) - @stack[-1].markdown << "" - end - - def visit_img(node) - if is_valid_src?(node["src"]) && is_visible_img?(node) - if @opts[:keep_img_tags] - @stack[-1].markdown << node.to_html - else - title = node["alt"].presence || node["title"].presence - @stack[-1].markdown << "![#{title}](#{node["src"]})" - end + def allowed_hrefs + @allowed_hrefs ||= begin + hrefs = SiteSetting.allowed_href_schemes.split("|").map { |scheme| "#{scheme}:" }.to_set + ALLOWED_IMG_SRCS.each { |src| hrefs << src } + hrefs << "mailto:" + hrefs.to_a end end def visit_a(node) - if is_valid_href?(node["href"]) - @stack[-1].markdown << "[" - traverse(node) - @stack[-1].markdown << "](#{node["href"]})" + if node["href"].present? && node["href"].starts_with?(*allowed_hrefs) + "[#{traverse(node)}](#{node["href"]})" else traverse(node) end end + def visit_img(node) + return if node["src"].blank? + + if @opts[:keep_img_tags] + node.to_html + elsif @opts[:keep_cid_imgs] && node["src"].starts_with?("cid:") + node.to_html + elsif node["src"].starts_with?(*ALLOWED_IMG_SRCS) + title = node["alt"].presence || node["title"].presence + width = node["width"].to_i + height = node["height"].to_i + dimensions = "|#{width}x#{height}" if width > 0 && height > 0 + "![#{title}#{dimensions}](#{node["src"]})" + end + end + + ALLOWED ||= %w{kbd del ins small big sub sup dl dd dt} + ALLOWED.each do |tag| + define_method("visit_#{tag}") do |node| + "<#{tag}>#{traverse(node)}" + end + end + + def visit_blockquote(node) + text = traverse(node) + text.strip! + text.gsub!(/\n{2,}/, "\n\n") + text.gsub!(/^/, "> ") + "\n\n#{text}\n\n" + end + + BLOCKS ||= %w{div tr} + BLOCKS.each do |tag| + define_method("visit_#{tag}") do |node| + prefix = node.previous_element&.description&.block? ? "" : "\n" + "#{prefix}#{traverse(node)}\n" + end + end + + def visit_p(node) + "\n\n#{traverse(node)}\n\n" + end + + TRAVERSABLES ||= %w{aside font span thead tbody tfooter u} + TRAVERSABLES.each do |tag| + define_method("visit_#{tag}") do |node| + traverse(node) + end + end + def visit_tt(node) - @stack[-1].markdown << "`" - traverse(node) - @stack[-1].markdown << "`" + "`#{traverse(node)}`" end def visit_code(node) - @stack.reverse.find { |n| n.name["pre"] } ? traverse(node) : visit_tt(node) + node.ancestors("pre").present? ? traverse(node) : visit_tt(node) + end + + def visit_pre(node) + text = traverse(node) + fence = text["`"] ? "~~~" : "```" + code = node.at("code") + code_class = code ? code["class"] : "" + lang = code_class ? code_class[/lang-(\w+)/, 1] : "" + "\n\n#{fence}#{lang}\n#{traverse(node)}\n#{fence}\n\n" end def visit_br(node) - return if node.previous_sibling.nil? && EMPHASIS.include?(node.parent.name) - return if node.parent.name == "p" && (node.next_sibling&.text || "").start_with?("\n") - @stack[-1].markdown << "\n" + "\n" end def visit_hr(node) - @stack[-1].markdown << "\n\n---\n\n" + "\n\n---\n\n" end - EMPHASIS ||= %w{b strong i em} - EMPHASIS.each do |tag| - class_eval <<-RUBY - def visit_#{tag}(node) - return if node.text.empty? - return @stack[-1].markdown << " " if node.text.blank? - times = "#{tag}" == "i" || "#{tag}" == "em" ? 1 : 2 - delimiter = (node.text["*"] ? "_" : "*") * times - @stack[-1].markdown << " " if node.text[0] == " " - @stack[-1].markdown << delimiter - traverse(node) - @stack[-1].markdown.gsub!(/\n+$/, "") - if @stack[-1].markdown[-1] == " " - @stack[-1].markdown.chomp!(" ") - append_space = true - end - @stack[-1].markdown << delimiter - @stack[-1].markdown << " " if append_space + def visit_abbr(node) + title = node["title"].presence + title_attr = title ? %[ title="#{title}"] : "" + "#{traverse(node)}" + end + + def visit_acronym(node) + visit_abbr(node) + end + + (1..6).each do |n| + define_method("visit_h#{n}") do |node| + "#{"#" * n} #{traverse(node)}" + end + end + + CELLS ||= %w{th td} + CELLS.each do |tag| + define_method("visit_#{tag}") do |node| + "#{traverse(node)} " + end + end + + def visit_table(node) + if rows = extract_rows(node) + headers = rows[0].css("td, th") + text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "
") }.join(" | ") + " |\n" + text << "| " + (["-"] * headers.size).join(" | ") + " |\n" + rows[1..-1].each do |row| + text << "| " + row.css("td").map { |td| traverse(td).gsub(/\n/, "
") }.join(" | ") + " |\n" end - RUBY + "\n\n#{text}\n\n" + else + traverse(node) + end + end + + def extract_rows(table) + return if table.ancestors("table").present? + return if (rows = table.css("tr")).empty? + headers_count = rows[0].css("td, th").size + return if rows[1..-1].any? { |row| row.css("td").size != headers_count } + rows + end + + LISTS ||= %w{ul ol} + LISTS.each do |tag| + define_method("visit_#{tag}") do |node| + prefix = node.previous_element&.description&.block? ? "" : "\n" + suffix = node.ancestors("ul, ol, li").size > 0 ? "" : "\n" + "#{prefix}#{traverse(node)}#{suffix}" + end + end + + def visit_li(node) + text = traverse(node) + + lists = node.ancestors("ul, ol") + marker = "ol" == lists[0]&.name ? "1. " : "- " + indent = (" " * marker.size) * [1, lists.size].max + suffix = node == node.parent.elements[-1] ? "" : "\n" + + text.gsub!(/\n{2,}/, "\n\n") + text.gsub!(/^(?!\s*$)/, indent) + text.lstrip! + + "#{marker}#{text}#{suffix}" + end + + EMPHASES ||= %w{i em} + EMPHASES.each do |tag| + define_method("visit_#{tag}") do |node| + text = traverse(node) + + return "" if text.empty? + return " " if text.blank? + return "<#{tag}>#{text}" if text["\n"] || (text["*"] && text["_"]) + + prefix = text[0][" "] + suffix = text[-1][" "] if text.size > 1 + wrap = text["*"] ? "_" : "*" + + "#{prefix}#{wrap}#{text.strip}#{wrap}#{suffix}" + end + end + + STRONGS ||= %w{b strong} + STRONGS.each do |tag| + define_method("visit_#{tag}") do |node| + text = traverse(node) + + return "" if text.empty? + return " " if text.blank? + return "<#{tag}>#{text}" if text["\n"] || (text["*"] && text["_"]) + + prefix = text[0][" "] + suffix = text[-1][" "] if text.size > 1 + wrap = text["*"] ? "__" : "**" + + "#{prefix}#{wrap}#{text.strip}#{wrap}#{suffix}" + end + end + + STRIKES ||= %w{s strike} + STRIKES.each do |tag| + define_method("visit_#{tag}") do |node| + text = traverse(node) + + return "" if text.empty? + return " " if text.blank? + return "<#{tag}>#{text}" if text["\n"] || text["~~"] + + prefix = text[0][" "] + suffix = text[-1][" "] if text.size > 1 + + "#{prefix}~~#{text.strip}~~#{suffix}" + end end def visit_text(node) - top_block = @stack[-1] - - if top_block.name == "pre" - top_block.markdown << node.text - return - end - - node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name) - - if top_block.markdown.present? && indent = node.text[/^\s+/] - top_block.markdown << indent - end - - text = node.text.gsub(/^\s+/, "").gsub(/\s{2,}/, " ") - top_block.markdown << text - end - - def format_block - lines = @stack[-1].markdown.each_line.map do |line| - prefix = @stack.map { |b| b.opened ? b.body : b.head }.join - @stack.each { |b| b.opened = true } - prefix + line.rstrip - end - @stack.pop - (lines + [""]).join("\n") - end - - def is_valid_href?(href) - href.present? && (href.start_with?("http") || href.start_with?("www.")) - end - - def is_valid_src?(src) - return false if src.blank? - return true if @opts[:keep_cid_imgs] && src.start_with?("cid:") - src.start_with?("http") || src.start_with?("www.") - end - - def is_visible_img?(img) - return false if img["width"].present? && img["width"].to_i == 0 - return false if img["height"].present? && img["height"].to_i == 0 - return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/] - true + node.text end end diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb index 266e875aca6..3d19c66f9a2 100644 --- a/spec/components/html_to_markdown_spec.rb +++ b/spec/components/html_to_markdown_spec.rb @@ -10,7 +10,7 @@ describe HtmlToMarkdown do end it "remove whitespaces" do - expect(html_to_markdown(<<-HTML + html = <<-HTML
Hello,

    This is the 1st paragraph.   
@@ -20,11 +20,54 @@ describe HtmlToMarkdown do
HTML - )).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph") + + expect(html_to_markdown(html)).to eq("Hello,\n\nThis is the 1st paragraph.\n\nThis is another paragraph") + + html = <<~HTML + +

Let me see if it happens by answering your message through + Thunderbird.

+

Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + Long sentence 1 +

+ + HTML + + markdown = <<~MD + Let me see if it happens by answering your message through Thunderbird. + + Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 Long sentence 1 + MD + + expect(html_to_markdown(html)).to eq(markdown.strip) + + html = <<~HTML +

This post + has lots
of + space +

+
    This     space    was   left untouched     !
+ HTML + + markdown = <<~MD + This post has lots + of space + + ``` + This space was left untouched ! + ``` + MD + + expect(html_to_markdown(html)).to eq(markdown.strip) end it "skips hidden tags" do expect(html_to_markdown(%Q{

Hello cruel World!

})).to eq("Hello World!") + expect(html_to_markdown(%Q{

Hello World!

})).to eq("Hello World!") end it "converts " do @@ -37,13 +80,15 @@ describe HtmlToMarkdown do expect(html_to_markdown("B*ld")).to eq("__B*ld__") html = <<~HTML + Before

Bold

+ After HTML - expect(html_to_markdown(html)).to eq("**Bold**") + expect(html_to_markdown(html)).to eq("Before\n\n**Bold**\n\nAfter") end it "converts " do @@ -60,6 +105,11 @@ describe HtmlToMarkdown do expect(html_to_markdown(%Q{Discourse})).to eq("[Discourse](https://www.discourse.org)") end + it "supports SiteSetting.allowed_href_schemes" do + SiteSetting.allowed_href_schemes = "tel|steam" + expect(html_to_markdown(%Q{LIMBO})).to eq("[LIMBO](steam://store/48000)") + end + it "removes empty & invalid " do expect(html_to_markdown(%Q{Discourse})).to eq("Discourse") expect(html_to_markdown(%Q{Discourse})).to eq("Discourse") @@ -67,7 +117,7 @@ describe HtmlToMarkdown do end HTML_WITH_IMG ||= %Q{Discourse Logo} - HTML_WITH_CID_IMG ||= %Q{Discourse Logo} + HTML_WITH_CID_IMG ||= %Q{} it "converts " do expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)") @@ -84,8 +134,7 @@ describe HtmlToMarkdown do end it "keeps with src='cid:' whith 'keep_cid_imgs'" do - expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)") - expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("\"Discourse") + expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq(HTML_WITH_CID_IMG) end it "skips hidden " do @@ -95,6 +144,12 @@ describe HtmlToMarkdown do expect(html_to_markdown(%Q{})).to eq("") end + it "supports width/height on " do + expect(html_to_markdown(%Q{})).to eq("![](https://www.discourse.org/logo.svg)") + expect(html_to_markdown(%Q{})).to eq("![](https://www.discourse.org/logo.svg)") + expect(html_to_markdown(%Q{})).to eq("![|200x100](https://www.discourse.org/logo.svg)") + end + (1..6).each do |n| it "converts " do expect(html_to_markdown("Header #{n}")).to eq("#" * n + " Header #{n}") @@ -150,11 +205,11 @@ describe HtmlToMarkdown do end it "supports " do - expect(html_to_markdown("Strike Through")).to eq("Strike Through") + expect(html_to_markdown("Strike Through")).to eq("~~Strike Through~~") end it "supports " do - expect(html_to_markdown("Strike Through")).to eq("Strike Through") + expect(html_to_markdown("Strike Through")).to eq("~~Strike Through~~") end it "supports
" do @@ -221,11 +276,11 @@ describe HtmlToMarkdown do it "handles

" do expect(html_to_markdown("

1st paragraph

2nd paragraph

")).to eq("1st paragraph\n\n2nd paragraph") - expect(html_to_markdown("

1st paragraph

\n

2nd paragraph\n 2nd paragraph

\n

3rd paragraph

")).to eq("1st paragraph\n\n2nd paragraph\n2nd paragraph\n\n3rd paragraph") + expect(html_to_markdown("

1st paragraph

\n

2nd paragraph\n 2nd paragraph

\n

3rd paragraph

")).to eq("1st paragraph\n\n2nd paragraph 2nd paragraph\n\n3rd paragraph") end it "handles
" do - expect(html_to_markdown("
1st div
2nd div
")).to eq("1st div\n\n2nd div") + expect(html_to_markdown("
1st div
2nd div
")).to eq("1st div\n2nd div") end it "swallows " do @@ -257,15 +312,19 @@ describe HtmlToMarkdown do context "with an oddly placed
" do it "handles " do - expect(html_to_markdown("
Bold
")).to eq("**Bold**") - expect(html_to_markdown("Bold
")).to eq("**Bold**") - expect(html_to_markdown("Bold
text
")).to eq("**Bold\ntext**") + expect(html_to_markdown("Hello
Bold
World")).to eq("Hello\n**Bold** World") + expect(html_to_markdown("Hello Bold
World")).to eq("Hello **Bold**\nWorld") + expect(html_to_markdown("Hello Bold
text
World")).to eq("Hello **Bold**\n**text** World") end it "handles " do - expect(html_to_markdown("
Italic
")).to eq("*Italic*") - expect(html_to_markdown("Italic
")).to eq("*Italic*") - expect(html_to_markdown("Italic
text
")).to eq("*Italic\ntext*") + expect(html_to_markdown("Hello
Italic
World")).to eq("Hello\n*Italic* World") + expect(html_to_markdown("Hello Italic
World")).to eq("Hello *Italic*\nWorld") + expect(html_to_markdown("Hello Italic
text
World")).to eq("Hello *Italic*\n*text* World") + end + + it "works" do + expect(html_to_markdown("
A B C
D
E
F
G
")).to eq("A __B *C*__\n__*D* E__\n**F** G") end end @@ -314,4 +373,64 @@ describe HtmlToMarkdown do end + it "supoorts " do + html = <<~HTML +
+ + + + + + + + + + + + + + + + + + + + + + +
Thisistheheaders
I amthefirstrow
And thisis the2ndline
+ HTML + + markdown = <<~MD + | This | is | the | *headers* | + | - | - | - | - | + | I am | the | **first** | row | + | And this | is the | 2nd | line | + MD + + expect(html_to_markdown(html)).to eq(markdown.strip) + + expect(html_to_markdown("
HelloWorld
")).to eq("| Hello | World |\n| - | - |") + end + + it "doesn't swallow badly formatted " do + html = <<~HTML +
+ + + + + + + + + + + +
1234
OneTwoThree
+ HTML + + expect(html_to_markdown(html)).to eq("1 2 3 4 \nOne Two Three") + end + end