From 16738cfb1b6726f06bf7e203211bcddd96e45440 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Wed, 6 Dec 2017 01:47:31 +0100 Subject: [PATCH] FEATURE: convert plain text emails to markdown --- lib/email/receiver.rb | 25 ++- lib/plain_text_to_markdown.rb | 190 +++++++++++++++++ script/import_scripts/mbox/importer.rb | 21 +- script/import_scripts/mbox/support/indexer.rb | 2 +- spec/components/email/receiver_spec.rb | 4 +- spec/components/email_cook_spec.rb | 4 + .../components/plain_text_to_markdown_spec.rb | 197 ++++++++++++++++++ 7 files changed, 419 insertions(+), 24 deletions(-) create mode 100644 lib/plain_text_to_markdown.rb create mode 100644 spec/components/plain_text_to_markdown_spec.rb diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index 82712eaefbf..78ed3008f35 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -2,6 +2,7 @@ require "digest" require_dependency "new_post_manager" require_dependency "post_action_creator" require_dependency "html_to_markdown" +require_dependency "plain_text_to_markdown" require_dependency "upload_creator" module Email @@ -43,12 +44,13 @@ module Email markdown: 2) end - def initialize(mail_string) + def initialize(mail_string, opts = {}) raise EmptyEmailError if mail_string.blank? @staged_users = [] @raw_email = try_to_encode(mail_string, "UTF-8") || try_to_encode(mail_string, "ISO-8859-1") || mail_string @mail = Mail.new(@raw_email) @message_id = @mail.message_id.presence || Digest::MD5.hexdigest(mail_string) + @opts = opts end def process! @@ -222,19 +224,32 @@ module Email def select_body text = nil html = nil + text_content_type = nil if @mail.multipart? text = fix_charset(@mail.text_part) html = fix_charset(@mail.html_part) + text_content_type = @mail.text_part&.content_type elsif @mail.content_type.to_s["text/html"] html = fix_charset(@mail) else text = fix_charset(@mail) + text_content_type = @mail.content_type end - text, elided_text = if text.present? + if text.present? text = trim_discourse_markers(text) - EmailReplyTrimmer.trim(text, true) + text, elided_text = EmailReplyTrimmer.trim(text, true) + + if @opts[:convert_plaintext] || sent_to_mailinglist_mirror? + text_content_type ||= "" + converter_opts = { + format_flowed: !!(text_content_type =~ /format\s*=\s*["']?flowed["']?/i), + delete_flowed_space: !!(text_content_type =~ /DelSp\s*=\s*["']?yes["']?/i) + } + text = PlainTextToMarkdown.new(text, converter_opts).to_markdown + elided_text = PlainTextToMarkdown.new(elided_text, converter_opts).to_markdown + end end markdown, elided_markdown = if html.present? @@ -755,8 +770,8 @@ module Email def self.elided_html(elided) html = "\n\n" << "
" << "\n" - html << "···" << "\n" - html << elided << "\n" + html << "···" << "\n\n" + html << elided << "\n\n" html << "
" << "\n" html end diff --git a/lib/plain_text_to_markdown.rb b/lib/plain_text_to_markdown.rb new file mode 100644 index 00000000000..ce785ca50f8 --- /dev/null +++ b/lib/plain_text_to_markdown.rb @@ -0,0 +1,190 @@ +class PlainTextToMarkdown + SIGNATURE_SEPARATOR ||= "-- ".freeze + + URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i + + def initialize(plaintext, opts = {}) + @plaintext = plaintext + @lines = [] + + @format_flowed = opts[:format_flowed] || false + @delete_flowed_space = opts[:delete_flowed_space] || false + end + + def to_markdown + prepare_lines + classify_lines + + markdown = "" + last_quote_level = 0 + last_line_blank = false + + @lines.each do |line| + current_line_blank = line.text.blank? + + unless last_line_blank && current_line_blank + if line.quote_level > 0 + quote_identifiers = ">" * line.quote_level + markdown << quote_identifiers << "\n" unless line.quote_level >= last_quote_level || current_line_blank + markdown << quote_identifiers + markdown << " " unless current_line_blank + else + markdown << "\n" unless last_quote_level == 0 || current_line_blank + end + + markdown << convert_text(line) + markdown << "\n" + end + + last_line_blank = current_line_blank + last_quote_level = line.quote_level + end + + markdown.rstrip! + markdown + end + + private + + class CodeBlock < Struct.new(:start_line, :end_line) + def initialize(start_line, end_line = nil) + super + end + + def valid? + start_line.present? && end_line.present? + end + end + + class Line < Struct.new(:text, :quote_level, :code_block) + def initialize(text, quote_level = 0, code_block = nil) + super + end + + def valid_code_block? + code_block&.valid? + end + end + + def prepare_lines + previous_line = nil + + @plaintext.each_line do |text| + text.chomp! + line = Line.new(text) + + remove_quote_level_indicators!(line) + + if @format_flowed + line = merge_lines(line, previous_line) + @lines << line unless line == previous_line + else + @lines << line + end + + previous_line = line + end + end + + def classify_lines + previous_line = nil + + @lines.each do |line| + classify_line_as_code!(line, previous_line) + + previous_line = line + end + end + + # @param line [Line] + def remove_quote_level_indicators!(line) + match_data = line.text.match(/^(?>+)\s?(?.*)/) + + if match_data + line.text = match_data[:text] + line.quote_level = match_data[:indicators].length + end + end + + # @param line [Line] + # @param previous_line [Line] + # @return [Line] + def merge_lines(line, previous_line) + return line if previous_line.nil? || line.text.blank? + return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR + return line unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ") + + previous_line.text = previous_line.text[0...-1] if @delete_flowed_space + previous_line.text += line.text + previous_line + end + + # @param line [Line] + # @param previous_line [Line] + def classify_line_as_code!(line, previous_line) + line.code_block = previous_line.code_block unless previous_line.nil? || previous_line.valid_code_block? + return unless line.text =~ /^\s{0,3}```/ + + if line.code_block.present? + line.code_block.end_line = line + else + line.code_block = CodeBlock.new(line) + end + end + + # @param line [Line] + # @return [string] + def convert_text(line) + text = line.text + + if line.valid_code_block? + code_block = line.code_block + return code_block.start_line == line || code_block.end_line == line ? text.lstrip : text + end + + converted_text = replace_duplicate_links(text) + converted_text = escape_special_characters(converted_text) + converted_text = indent_with_non_breaking_spaces(converted_text) + converted_text + end + + def replace_duplicate_links(text) + text.to_enum(:scan, URL_REGEX) + .map { $& } + .group_by { |url| url } + .keep_if { |_, urls | urls.length > 1 } + .keys.each do |url| + + text.gsub!(Regexp.new(%Q|#{url}(\s*[()\\[\\]<>«»'"“”‘’]?#{url}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url) + end + + text + end + + def indent_with_non_breaking_spaces(text) + text.sub(/^\s+/) do |s| + # replace tabs with 2 spaces + s.gsub!("\t", " ") + + # replace indentation with non-breaking spaces + s.length > 1 ? " " * s.length : s + end + end + + def escape_special_characters(text) + escaped_text = "" + + text.split(URL_REGEX).each do |text_part| + if text_part =~ URL_REGEX + # no escaping withing URLs + escaped_text << text_part + else + # escape Markdown and HTML + text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" } + escaped_text << CGI.escapeHTML(text_part) + end + end + + escaped_text + end +end diff --git a/script/import_scripts/mbox/importer.rb b/script/import_scripts/mbox/importer.rb index 028d7c00ca4..2b2c13c40a7 100644 --- a/script/import_scripts/mbox/importer.rb +++ b/script/import_scripts/mbox/importer.rb @@ -104,30 +104,19 @@ module ImportScripts::Mbox id: row['msg_id'], user_id: user_id, created_at: to_time(row['email_date']), - raw: format_raw(row['body'], attachment_html, row['elided'], row['format']), + raw: format_raw(row['body'], attachment_html, row['elided']), raw_email: row['raw_message'], via_email: true, - cook_method: Post.cook_methods[:email], post_create_action: proc do |post| create_incoming_email(post, row) end } end - def format_raw(email_body, attachment_html, elided, format) - email_body ||= '' - - case format - when Email::Receiver::formats[:markdown] - body = email_body - body << attachment_html if attachment_html.present? - body << Email::Receiver.elided_html(elided) if elided.present? - when Email::Receiver::formats[:plaintext] - body = %|[plaintext]\n#{escape_tags(email_body)}\n[/plaintext]| - body << %|\n[attachments]\n#{escape_tags(attachment_html)}\n[/attachments]| if attachment_html.present? - body << %|\n[elided]\n#{escape_tags(elided)}\n[/elided]| if elided.present? - end - + def format_raw(email_body, attachment_html, elided) + body = email_body || '' + body << attachment_html if attachment_html.present? + body << Email::Receiver.elided_html(elided) if elided.present? body end diff --git a/script/import_scripts/mbox/support/indexer.rb b/script/import_scripts/mbox/support/indexer.rb index b7972f1ce77..251975f07da 100644 --- a/script/import_scripts/mbox/support/indexer.rb +++ b/script/import_scripts/mbox/support/indexer.rb @@ -163,7 +163,7 @@ module ImportScripts::Mbox end def read_mail_from_string(raw_message) - Email::Receiver.new(raw_message) unless raw_message.blank? + Email::Receiver.new(raw_message, convert_plaintext: true) unless raw_message.blank? end def extract_reply_message_ids(mail) diff --git a/spec/components/email/receiver_spec.rb b/spec/components/email/receiver_spec.rb index 8b701a3a776..5b5d91f5caf 100644 --- a/spec/components/email/receiver_spec.rb +++ b/spec/components/email/receiver_spec.rb @@ -344,7 +344,7 @@ describe Email::Receiver do topic.save expect { process(:original_message) }.to change { topic.posts.count } - expect(topic.posts.last.raw).to eq("This is a reply :)\n\n
\n···\n---Original Message---\nThis part should not be included\n
") + expect(topic.posts.last.raw).to eq("This is a reply :)\n\n
\n···\n\n---Original Message---\nThis part should not be included\n\n
") end it "doesn't include the 'elided' part of the original message when always_show_trimmed_content is disabled" do @@ -356,7 +356,7 @@ describe Email::Receiver do it "adds the 'elided' part of the original message for public replies when always_show_trimmed_content is enabled" do SiteSetting.always_show_trimmed_content = true expect { process(:original_message) }.to change { topic.posts.count }.from(1).to(2) - expect(topic.posts.last.raw).to eq("This is a reply :)\n\n
\n···\n---Original Message---\nThis part should not be included\n
") + expect(topic.posts.last.raw).to eq("This is a reply :)\n\n
\n···\n\n---Original Message---\nThis part should not be included\n\n
") end it "supports attached images in TEXT part" do diff --git a/spec/components/email_cook_spec.rb b/spec/components/email_cook_spec.rb index 211b45ff43e..4d3177efa2c 100644 --- a/spec/components/email_cook_spec.rb +++ b/spec/components/email_cook_spec.rb @@ -146,7 +146,9 @@ describe EmailCook do
··· + At vero eos *et accusam* et justo duo dolores et ea rebum.
+
LONG_COOKED @@ -169,7 +171,9 @@ describe EmailCook do
··· + At vero eos *et accusam* et justo duo dolores et ea rebum.
+
LONG_COOKED diff --git a/spec/components/plain_text_to_markdown_spec.rb b/spec/components/plain_text_to_markdown_spec.rb new file mode 100644 index 00000000000..43437ab6cfe --- /dev/null +++ b/spec/components/plain_text_to_markdown_spec.rb @@ -0,0 +1,197 @@ +require 'rails_helper' +require 'plain_text_to_markdown' + +describe PlainTextToMarkdown do + def to_markdown(text, opts = {}) + PlainTextToMarkdown.new(text, opts).to_markdown + end + + let(:nbsp) { " " } + + context "quotes" do + it "uses the correct quote level" do + expect(to_markdown("> foo")).to eq("> foo") + expect(to_markdown(">>> foo")).to eq(">>> foo") + expect(to_markdown(">>>>>>> foo")).to eq(">>>>>>> foo") + end + + it "ignores the first whitespace after the quote identifier" do + expect(to_markdown(">foo")).to eq("> foo") + expect(to_markdown("> foo")).to eq("> foo") + expect(to_markdown(">\tfoo")).to eq("> foo") + + expect(to_markdown("> foo")).to eq("> foo") + expect(to_markdown(">\t foo")).to eq("> foo") + end + + it "adds a blank line after a quote if it is followed by text" do + expect(to_markdown("> foo\nbar")).to eq("> foo\n\nbar") + expect(to_markdown(">> foo\nbar")).to eq(">> foo\n\nbar") + end + + it "ignores multiple consecutive blank lines" do + expect(to_markdown("> foo\n\nbar")).to eq("> foo\n\nbar") + expect(to_markdown("> foo\n\n\nbar")).to eq("> foo\n\nbar") + expect(to_markdown("> foo\n> \n>\n>\n> bar")).to eq("> foo\n>\n> bar") + end + + it "adds an additional line with quote identifier if the quote level is decreasing" do + expect(to_markdown(">> foo\n>bar")).to eq(">> foo\n>\n> bar") + expect(to_markdown(">>>> foo\n>bar")).to eq(">>>> foo\n>\n> bar") + expect(to_markdown(">> foo\nno quote\n>bar")).to eq(">> foo\n\nno quote\n> bar") + end + + it "does not add an additional line with quote identifier if the quote level is decreasing and text is blank" do + expect(to_markdown(">>> foo\n>>\n>> bar")).to eq(">>> foo\n>>\n>> bar") + end + end + + context "special characters" do + it "escapes special Markdown characters" do + expect(to_markdown('\ backslash')).to eq('\\\\ backslash') + expect(to_markdown('` backtick')).to eq('\` backtick') + expect(to_markdown('* asterisk')).to eq('\* asterisk') + expect(to_markdown('_ underscore')).to eq('\_ underscore') + expect(to_markdown('{} curly braces')).to eq('\{\} curly braces') + expect(to_markdown('[] square brackets')).to eq('\[\] square brackets') + expect(to_markdown('() parentheses')).to eq('\(\) parentheses') + expect(to_markdown('# hash mark')).to eq('\# hash mark') + expect(to_markdown('+ plus sign')).to eq('\+ plus sign') + expect(to_markdown('- minus sign')).to eq('\- minus sign') + expect(to_markdown('. dot')).to eq('\. dot') + expect(to_markdown('! exclamation mark')).to eq('\! exclamation mark') + expect(to_markdown('~ tilde')).to eq('\~ tilde') + end + + it "escapes special HTML characters" do + expect(to_markdown("' single quote")).to eq("' single quote") + expect(to_markdown("\" double quote")).to eq("" double quote") + expect(to_markdown("& ampersand")).to eq("& ampersand") + expect(to_markdown("<> less-than and greater-than sign")).to eq("<> less\\-than and greater\\-than sign") + end + + it "escapes special characters but ignores links" do + expect(to_markdown("*some text* https://www.example.com/foo.html?a=1&b=0 & *more text*")) + .to eq("\\*some text\\* https://www.example.com/foo.html?a=1&b=0 & <https://www.example.com/bar.html?a=1&b=0> \\*more text\\*") + end + end + + context "indentation" do + it "does not replace one leading whitespace" do + expect(to_markdown(" foo")).to eq(" foo") + end + + it "replaces leading whitespaces with non-breaking spaces" do + expect(to_markdown(" foo")).to eq("#{nbsp}#{nbsp}foo") + expect(to_markdown(" foo")).to eq("#{nbsp}#{nbsp}#{nbsp}#{nbsp}foo") + end + + it "replaces each leading tabs with two non-breaking spaces" do + expect(to_markdown("\tfoo")).to eq("#{nbsp}#{nbsp}foo") + expect(to_markdown(" \tfoo")).to eq("#{nbsp}#{nbsp}#{nbsp}foo") + expect(to_markdown("\t foo")).to eq("#{nbsp}#{nbsp}#{nbsp}foo") + expect(to_markdown(" \t foo")).to eq("#{nbsp}#{nbsp}#{nbsp}#{nbsp}foo") + expect(to_markdown("\t\tfoo")).to eq("#{nbsp}#{nbsp}#{nbsp}#{nbsp}foo") + end + + it "correctly replaces leading whitespaces within quotes" do + expect(to_markdown("> foo")).to eq("> foo") + expect(to_markdown("> foo")).to eq("> #{nbsp}#{nbsp}foo") + end + + it "does not replace whitespaces within text" do + expect(to_markdown("foo bar")).to eq("foo bar") + expect(to_markdown("foo\t\tbar")).to eq("foo\t\tbar") + end + end + + context "format=flowed" do + it "concats lines ending with a space" do + text = "Lorem ipsum dolor sit amet, consectetur \nadipiscing elit. Quasi vero, inquit, \nperpetua oratio rhetorum solum, non \netiam philosophorum sit." + markdown = "Lorem ipsum dolor sit amet, consectetur adipiscing elit\\. Quasi vero, inquit, perpetua oratio rhetorum solum, non etiam philosophorum sit\\." + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "does not concat lines when there is an empty line between" do + text = "Lorem ipsum dolor sit amet, consectetur \nadipiscing elit. \n\nQuasi vero, inquit, \nperpetua oratio rhetorum solum, non \netiam philosophorum sit." + markdown = "Lorem ipsum dolor sit amet, consectetur adipiscing elit\\. \n\nQuasi vero, inquit, perpetua oratio rhetorum solum, non etiam philosophorum sit\\." + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "concats quoted lines ending with a space" do + text = "> Lorem ipsum dolor sit amet, consectetur \n> adipiscing elit. Quasi vero, inquit, \n> perpetua oratio rhetorum solum, non \n> etiam philosophorum sit." + markdown = "> Lorem ipsum dolor sit amet, consectetur adipiscing elit\\. Quasi vero, inquit, perpetua oratio rhetorum solum, non etiam philosophorum sit\\." + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "does not concat quoted lines ending with a space when the quote level differs" do + text = "> Lorem ipsum dolor sit amet, consectetur \n> adipiscing elit. \n>> Quasi vero, inquit, \n>> perpetua oratio rhetorum solum, non \n> etiam philosophorum sit." + markdown = "> Lorem ipsum dolor sit amet, consectetur adipiscing elit\\. \n>> Quasi vero, inquit, perpetua oratio rhetorum solum, non \n>\n> etiam philosophorum sit\\." + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "does not recognize a signature separator as start of flowed text" do + text = "-- \nsignature line 1\nsignature line 2" + markdown = "\\-\\- \nsignature line 1\nsignature line 2" + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "does not concat lines when there is a signature separator" do + text = "Lorem ipsum \ndolor sit amet \n-- \nsignature line 1\nsignature line 2" + markdown = "Lorem ipsum dolor sit amet \n\\-\\- \nsignature line 1\nsignature line 2" + + expect(to_markdown(text, format_flowed: true)).to eq(markdown) + end + + it "removes the trailing space if DelSp is set to 'yes'" do + text = "Lorem ipsum dolor sit amet, consectetur \nadipiscing elit. \nQuasi vero, inquit" + markdown = "Lorem ipsum dolor sit amet, consecteturadipiscing elit\\.Quasi vero, inquit" + + expect(to_markdown(text, format_flowed: true, delete_flowed_space: true)).to eq(markdown) + end + end + + context "links" do + it "removes duplicate links" do + expect(to_markdown("foo https://www.example.com/foo.html bar")) + .to eq("foo https://www.example.com/foo.html bar") + + expect(to_markdown("foo https://www.example.com/foo.html (https://www.example.com/foo.html) bar")) + .to eq("foo https://www.example.com/foo.html bar") + + expect(to_markdown("foo https://www.example.com/foo.html https://www.example.com/foo.html bar")) + .to eq("foo https://www.example.com/foo.html bar") + end + + it "does not removes duplicate links when there is text between the links" do + expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")) + .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz") + end + end + + context "code" do + it "detects matching Markdown code block within backticks" do + expect(to_markdown("foo\n```\n\n```")).to eq("foo\n```\n\n```") + end + + it "does not detect Markdown code block when backticks are not on new line" do + expect(to_markdown("foo\n```\n ```")).to eq("foo\n\\`\\`\\`\n<this is code> \\`\\`\\`") + end + + it "does not detect Markdown code block when backticks are indented by more than 3 whitespaces" do + expect(to_markdown("foo\n ```\n\n ```")).to include("") + expect(to_markdown("foo\n ```\n\n ```")).to include("") + + expect(to_markdown("foo\n ```\n\n```")).to include("<this is code>") + expect(to_markdown("foo\n```\n\n ```")).to include("<this is code>") + + expect(to_markdown("foo\n ```\n\n```")).to include("<this is code>") + expect(to_markdown("foo\n```\n\n ```")).to include("<this is code>") + end + end +end