FIX: Prefer HTML in incoming emails, heavily refactor email receiver
This commit heavily refactors Email::Receiver to both better handle different emails and improve testability. A primary focus of the refactor is reducing the usage of class variables, in favor of actually passing parameters - making it possible for multiple tests to use the same Receiver instance. The EmailLog reported when a topic is created is reflected to put the user's email in the to_address field, instead of the system address. The discourse_email_parser function is renamed to discourse_email_trimmer, and additional stopping conditions are added to make up for EmailReplyParser's inability to deal with html at the start of a line. The force_encoding calls are refactored out to a 'fix_charset' method. parse_body is renamed to select_body, and the scrub_html method is dropped in favor of the new HtmlCleaner class. A new parse_body method is added, which performs the job of the removed lines of code in the 'process' method. EmailUnparsableError is redefined again, to be encoding errors (when the declared encoding is not what was delivered).
This commit is contained in:
parent
cb55ef4702
commit
0a09593f3b
|
@ -1,3 +1,4 @@
|
||||||
|
require 'email/html_cleaner'
|
||||||
#
|
#
|
||||||
# Handles an incoming message
|
# Handles an incoming message
|
||||||
#
|
#
|
||||||
|
@ -26,20 +27,12 @@ module Email
|
||||||
def process
|
def process
|
||||||
raise EmptyEmailError if @raw.blank?
|
raise EmptyEmailError if @raw.blank?
|
||||||
|
|
||||||
@message = Mail.new(@raw)
|
message = Mail.new(@raw)
|
||||||
|
|
||||||
# First remove the known discourse stuff.
|
body = parse_body message
|
||||||
parse_body
|
|
||||||
raise EmptyEmailError if @body.blank?
|
|
||||||
|
|
||||||
# Then run the github EmailReplyParser on it in case we didn't catch it
|
|
||||||
@body = EmailReplyParser.read(@body).visible_text.force_encoding('UTF-8')
|
|
||||||
|
|
||||||
discourse_email_parser
|
|
||||||
raise EmailUnparsableError if @body.blank?
|
|
||||||
|
|
||||||
dest_info = {type: :invalid, obj: nil}
|
dest_info = {type: :invalid, obj: nil}
|
||||||
@message.to.each do |to_address|
|
message.to.each do |to_address|
|
||||||
if dest_info[:type] == :invalid
|
if dest_info[:type] == :invalid
|
||||||
dest_info = check_address to_address
|
dest_info = check_address to_address
|
||||||
end
|
end
|
||||||
|
@ -47,6 +40,10 @@ module Email
|
||||||
|
|
||||||
raise BadDestinationAddress if dest_info[:type] == :invalid
|
raise BadDestinationAddress if dest_info[:type] == :invalid
|
||||||
|
|
||||||
|
# TODO get to a state where we can remove this
|
||||||
|
@message = message
|
||||||
|
@body = body
|
||||||
|
|
||||||
if dest_info[:type] == :category
|
if dest_info[:type] == :category
|
||||||
raise BadDestinationAddress unless SiteSetting.email_in
|
raise BadDestinationAddress unless SiteSetting.email_in
|
||||||
category = dest_info[:obj]
|
category = dest_info[:obj]
|
||||||
|
@ -74,6 +71,8 @@ module Email
|
||||||
|
|
||||||
create_reply
|
create_reply
|
||||||
end
|
end
|
||||||
|
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
|
||||||
|
raise EmailUnparsableError.new(e)
|
||||||
end
|
end
|
||||||
|
|
||||||
def check_address(address)
|
def check_address(address)
|
||||||
|
@ -94,56 +93,63 @@ module Email
|
||||||
{type: :invalid, obj: nil}
|
{type: :invalid, obj: nil}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def parse_body(message)
|
||||||
|
body = select_body message
|
||||||
|
raise EmptyEmailError if body.strip.blank?
|
||||||
|
|
||||||
def parse_body
|
body = discourse_email_trimmer body
|
||||||
|
raise EmptyEmailError if body.strip.blank?
|
||||||
|
|
||||||
|
body = EmailReplyParser.parse_reply body
|
||||||
|
raise EmptyEmailError if body.strip.blank?
|
||||||
|
|
||||||
|
body
|
||||||
|
end
|
||||||
|
|
||||||
|
def select_body(message)
|
||||||
html = nil
|
html = nil
|
||||||
|
# If the message is multipart, return that part (favor html)
|
||||||
# If the message is multipart, find the best type for our purposes
|
if message.multipart?
|
||||||
if @message.multipart?
|
html = fix_charset message.html_part
|
||||||
if p = @message.text_part
|
text = fix_charset message.text_part
|
||||||
@body = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
|
# TODO picking text if available may be better
|
||||||
return @body
|
if text && !html
|
||||||
elsif p = @message.html_part
|
return text
|
||||||
html = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
|
|
||||||
end
|
end
|
||||||
|
elsif message.content_type =~ /text\/html/
|
||||||
|
html = fix_charset message
|
||||||
end
|
end
|
||||||
|
|
||||||
if @message.content_type =~ /text\/html/
|
if html
|
||||||
if defined? @message.charset
|
body = HtmlCleaner.new(html).output_html
|
||||||
html = @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s
|
else
|
||||||
else
|
body = fix_charset message
|
||||||
html = @message.body.to_s
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
if html.present?
|
|
||||||
@body = scrub_html(html)
|
|
||||||
return @body
|
|
||||||
end
|
|
||||||
|
|
||||||
@body = @message.charset ? @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s.strip : @message.body.to_s
|
|
||||||
|
|
||||||
# Certain trigger phrases that means we didn't parse correctly
|
# Certain trigger phrases that means we didn't parse correctly
|
||||||
@body = nil if @body =~ /Content\-Type\:/ ||
|
if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
|
||||||
@body =~ /multipart\/alternative/ ||
|
raise EmptyEmailError
|
||||||
@body =~ /text\/plain/
|
end
|
||||||
|
|
||||||
@body
|
body
|
||||||
end
|
end
|
||||||
|
|
||||||
def scrub_html(html)
|
# Force encoding to UTF-8 on a Mail::Message or Mail::Part
|
||||||
# If we have an HTML message, strip the markup
|
def fix_charset(object)
|
||||||
doc = Nokogiri::HTML(html)
|
return nil if object.nil?
|
||||||
|
|
||||||
# Blackberry is annoying in that it only provides HTML. We can easily extract it though
|
if object.charset
|
||||||
content = doc.at("#BB10_response_div")
|
object.body.decoded.force_encoding(object.charset).encode("UTF-8").to_s
|
||||||
return content.text if content.present?
|
else
|
||||||
|
object.body.to_s
|
||||||
doc.xpath("//text()").text
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def discourse_email_parser
|
REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To']
|
||||||
lines = @body.scrub.lines.to_a
|
REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |lbl| "#{lbl}:" })
|
||||||
|
|
||||||
|
def discourse_email_trimmer(body)
|
||||||
|
lines = body.scrub.lines.to_a
|
||||||
range_end = 0
|
range_end = 0
|
||||||
|
|
||||||
lines.each_with_index do |l, idx|
|
lines.each_with_index do |l, idx|
|
||||||
|
@ -154,11 +160,15 @@ module Email
|
||||||
# Let's try it and see how well it works.
|
# Let's try it and see how well it works.
|
||||||
(l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/)
|
(l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/)
|
||||||
|
|
||||||
|
# Headers on subsequent lines
|
||||||
|
break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX }
|
||||||
|
# Headers on the same line
|
||||||
|
break if REPLYING_HEADER_LABELS.count { |lbl| l.include? lbl } >= 3
|
||||||
|
|
||||||
range_end = idx
|
range_end = idx
|
||||||
end
|
end
|
||||||
|
|
||||||
@body = lines[0..range_end].join
|
lines[0..range_end].join.strip
|
||||||
@body.strip!
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def wrap_body_in_quote(user_email)
|
def wrap_body_in_quote(user_email)
|
||||||
|
|
Loading…
Reference in New Issue