FIX: Prefer HTML in incoming emails, heavily refactor email receiver

This commit heavily refactors Email::Receiver to both better handle
different emails and improve testability.

A primary focus of the refactor is reducing the usage of class
variables, in favor of actually passing parameters - making it possible
for multiple tests to use the same Receiver instance.

The EmailLog reported when a topic is created is reflected to put the
user's email in the to_address field, instead of the system address.

The discourse_email_parser function is renamed to
discourse_email_trimmer, and additional stopping conditions are added to
make up for EmailReplyParser's inability to deal with html at the start
of a line.

The force_encoding calls are refactored out to a 'fix_charset' method.

parse_body is renamed to select_body, and the scrub_html method is
dropped in favor of the new HtmlCleaner class.

A new parse_body method is added, which performs the job of the removed
lines of code in the 'process' method.

EmailUnparsableError is redefined again, to be encoding errors (when the
declared encoding is not what was delivered).
This commit is contained in:
riking 2014-08-26 17:31:51 -07:00
parent cb55ef4702
commit 0a09593f3b
1 changed files with 59 additions and 49 deletions

View File

@ -1,3 +1,4 @@
require 'email/html_cleaner'
# #
# Handles an incoming message # Handles an incoming message
# #
@ -26,20 +27,12 @@ module Email
def process def process
raise EmptyEmailError if @raw.blank? raise EmptyEmailError if @raw.blank?
@message = Mail.new(@raw) message = Mail.new(@raw)
# First remove the known discourse stuff. body = parse_body message
parse_body
raise EmptyEmailError if @body.blank?
# Then run the github EmailReplyParser on it in case we didn't catch it
@body = EmailReplyParser.read(@body).visible_text.force_encoding('UTF-8')
discourse_email_parser
raise EmailUnparsableError if @body.blank?
dest_info = {type: :invalid, obj: nil} dest_info = {type: :invalid, obj: nil}
@message.to.each do |to_address| message.to.each do |to_address|
if dest_info[:type] == :invalid if dest_info[:type] == :invalid
dest_info = check_address to_address dest_info = check_address to_address
end end
@ -47,6 +40,10 @@ module Email
raise BadDestinationAddress if dest_info[:type] == :invalid raise BadDestinationAddress if dest_info[:type] == :invalid
# TODO get to a state where we can remove this
@message = message
@body = body
if dest_info[:type] == :category if dest_info[:type] == :category
raise BadDestinationAddress unless SiteSetting.email_in raise BadDestinationAddress unless SiteSetting.email_in
category = dest_info[:obj] category = dest_info[:obj]
@ -74,6 +71,8 @@ module Email
create_reply create_reply
end end
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
raise EmailUnparsableError.new(e)
end end
def check_address(address) def check_address(address)
@ -94,56 +93,63 @@ module Email
{type: :invalid, obj: nil} {type: :invalid, obj: nil}
end end
def parse_body(message)
body = select_body message
raise EmptyEmailError if body.strip.blank?
def parse_body body = discourse_email_trimmer body
raise EmptyEmailError if body.strip.blank?
body = EmailReplyParser.parse_reply body
raise EmptyEmailError if body.strip.blank?
body
end
def select_body(message)
html = nil html = nil
# If the message is multipart, return that part (favor html)
# If the message is multipart, find the best type for our purposes if message.multipart?
if @message.multipart? html = fix_charset message.html_part
if p = @message.text_part text = fix_charset message.text_part
@body = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s # TODO picking text if available may be better
return @body if text && !html
elsif p = @message.html_part return text
html = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
end end
elsif message.content_type =~ /text\/html/
html = fix_charset message
end end
if @message.content_type =~ /text\/html/ if html
if defined? @message.charset body = HtmlCleaner.new(html).output_html
html = @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s else
else body = fix_charset message
html = @message.body.to_s
end
end end
if html.present?
@body = scrub_html(html)
return @body
end
@body = @message.charset ? @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s.strip : @message.body.to_s
# Certain trigger phrases that means we didn't parse correctly # Certain trigger phrases that means we didn't parse correctly
@body = nil if @body =~ /Content\-Type\:/ || if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
@body =~ /multipart\/alternative/ || raise EmptyEmailError
@body =~ /text\/plain/ end
@body body
end end
def scrub_html(html) # Force encoding to UTF-8 on a Mail::Message or Mail::Part
# If we have an HTML message, strip the markup def fix_charset(object)
doc = Nokogiri::HTML(html) return nil if object.nil?
# Blackberry is annoying in that it only provides HTML. We can easily extract it though if object.charset
content = doc.at("#BB10_response_div") object.body.decoded.force_encoding(object.charset).encode("UTF-8").to_s
return content.text if content.present? else
object.body.to_s
doc.xpath("//text()").text end
end end
def discourse_email_parser REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To']
lines = @body.scrub.lines.to_a REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |lbl| "#{lbl}:" })
def discourse_email_trimmer(body)
lines = body.scrub.lines.to_a
range_end = 0 range_end = 0
lines.each_with_index do |l, idx| lines.each_with_index do |l, idx|
@ -154,11 +160,15 @@ module Email
# Let's try it and see how well it works. # Let's try it and see how well it works.
(l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/)
# Headers on subsequent lines
break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX }
# Headers on the same line
break if REPLYING_HEADER_LABELS.count { |lbl| l.include? lbl } >= 3
range_end = idx range_end = idx
end end
@body = lines[0..range_end].join lines[0..range_end].join.strip
@body.strip!
end end
def wrap_body_in_quote(user_email) def wrap_body_in_quote(user_email)