FIX: correctly extract body and/or reply from exchange emails (#30512)

When receiving emails sent with Exchange, we look for some markers to identify the body of the mail and the reply (a.k.a. the previous email).

For some reason, those markers aren't 100% reliable, and sometimes only one of them is present.

Commit 20ba54d536 introduced the bug because the `HTML_EXTRACTERS` regex for Exchange looks for either `messageBodySection` or `messageReplySection`, but we were only using the `reply` section. So if an email had only the `body` section, it would not be extracted correctly.

This commit handles the cases where either one of them is missing and uses the other one as the actual "reply". When both are present, it correctly elides the "reply" section.
This commit is contained in:
Régis Hanol 2024-12-31 15:29:36 +01:00 committed by GitHub
parent 9497a6165f
commit d523c37057
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 97 additions and 7 deletions

View File

@ -564,10 +564,21 @@ module Email
end
def extract_from_exchange(doc)
# Exchange is using the 'messageReplySection' class for forwarded emails
# And 'messageBodySection' for the actual email
elided = doc.css("div[name='messageReplySection']").remove
to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html)
# Exchange is using 'messageReplySection' for forwarded emails and 'messageBodySection' for the actual email
reply = doc.css("div[name='messageReplySection']")
body = doc.css("div[name='messageBodySection']")
if reply.present? && body.present?
elided = doc.css("div[name='messageReplySection']").remove
body = doc.css("div[name='messageBodySection']")
to_markdown(body.to_html, elided.to_html)
elsif reply.present?
to_markdown(reply.to_html, "")
elsif body.present?
to_markdown(body.to_html, "")
else
to_markdown(doc.to_html, "")
end
end
def extract_from_apple_mail(doc)

View File

@ -0,0 +1,14 @@
Return-Path: <discourse@bar.com>
From: Foo Bar <discourse@bar.com>
To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com
Date: Fri, 15 Jan 2017 00:12:43 +0100
Message-ID: <180@foo.bar.mail>
Mime-Version: 1.0
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div>
<div name="messageBodySection">
<p>This is the <b>body</b> of the email.</p>
</div>
</div>

View File

@ -0,0 +1,17 @@
Return-Path: <discourse@bar.com>
From: Foo Bar <discourse@bar.com>
To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com
Date: Fri, 15 Jan 2017 00:12:43 +0100
Message-ID: <180@foo.bar.mail>
Mime-Version: 1.0
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div>
<div name="messageBodySection">
<p>This is the <b>body</b> of the email.</p>
</div>
<div name="messageReplySection">
<p>This is the <i>reply</i>!</p>
</div>
</div>

View File

@ -0,0 +1,14 @@
Return-Path: <discourse@bar.com>
From: Foo Bar <discourse@bar.com>
To: alt+4f97315cc828096c9cb34c6f1a0d6fe8@bar.com
Date: Fri, 15 Jan 2017 00:12:43 +0100
Message-ID: <180@foo.bar.mail>
Mime-Version: 1.0
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div>
<div name="messageReplySection">
<p>This is the <b>body !!</b> of the email.</p>
</div>
</div>

View File

@ -415,9 +415,43 @@ RSpec.describe Email::Receiver do
it "automatically elides gmail quotes" do
SiteSetting.always_show_trimmed_content = true
expect { process(:gmail_html_reply) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq(
"This is a **GMAIL** reply ;)\n\n<details class='elided'>\n<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>\n\nThis is the *elided* part!\n\n</details>",
)
expect(topic.posts.last.raw).to eq <<~MD.strip
This is a **GMAIL** reply ;)
<details class='elided'>
<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>
This is the *elided* part!
</details>
MD
end
it "correctly extracts body from exchange emails" do
SiteSetting.always_show_trimmed_content = true
expect { process(:exchange_html_body) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq("This is the **body** of the email.")
end
it "correctly extracts reply from exchange emails" do
SiteSetting.always_show_trimmed_content = true
expect { process(:exchange_html_reply) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq("This is the **body !!** of the email.")
end
it "correctly extracts body & reply from exchange emails" do
SiteSetting.always_show_trimmed_content = true
expect { process(:exchange_html_body_and_reply) }.to change { topic.posts.count }
expect(topic.posts.last.raw).to eq <<~MD.strip
This is the **body** of the email.
<details class='elided'>
<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>
This is the *reply*!
</details>
MD
end
it "doesn't process email with same message-id more than once" do