From 482c615ef882c1953070125e0a813683f979e5ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Fri, 2 Mar 2018 01:51:15 +0100 Subject: [PATCH] FEATURE: extract signatures from most popular email services/software --- lib/email/receiver.rb | 81 +++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index 21c6b2c4f46..efc21cddd59 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -261,16 +261,9 @@ module Email end markdown, elided_markdown = if html.present? - if html[%{
- elided << fwd.next_element - # also elide any signatures - elided << doc.css("#Signature").remove - # remove the leading
- [fwd.previous_element, fwd].each(&:remove) - [doc.to_html, elided.to_html] + # Outlook properly identifies the signature and any replied/forwarded email + # Use their id to remove them and anything that comes after + elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove + to_markdown(doc.to_html, elided.to_html) + end + + def extract_from_word(html) + doc = Nokogiri::HTML.fragment(html) + # Word (?) keeps the content in the 'WordSection1' class and uses

tags + # When there's something else (,
, etc..) there's high chance it's a signature or forwarded email
+      elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove
+      to_markdown(doc.at(".WordSection1").to_html, elided.to_html)
+    end
+
+    # Removes the reply section from the parsed HTML and passes the remaining
+    # body section plus the removed (elided) HTML to to_markdown.
+    def extract_from_exchange(html)
+      doc = Nokogiri::HTML.fragment(html)
+      # Exchange is using the 'messageReplySection' class for forwarded emails
+      # And 'messageBodySection' for the actual email
+      elided = doc.css("div[name='messageReplySection']").remove
+      to_markdown(doc.css("div[name='messageBodySection']").to_html, elided.to_html)
+    end
+
+    # Removes every sibling that follows the matched 'AppleMailSignature' element
+    # and passes the remaining HTML plus the removed (elided) HTML to to_markdown.
+    def extract_from_apple_mail(html)
+      doc = Nokogiri::HTML.fragment(html)
+      # AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules
+      # Our best guess is to elide whatever comes after that.
+      elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove
+      to_markdown(doc.to_html, elided.to_html)
+    end
+
+    # Removes every element whose class attribute starts with 'moz-', together with
+    # its following siblings, and passes the remaining and removed HTML to to_markdown.
+    def extract_from_mozilla(html)
+      doc = Nokogiri::HTML.fragment(html)
+      # Mozilla (Thunderbird ?) properly identifies signature and forwarded emails
+      # Remove them and anything that comes after
+      elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove
+      to_markdown(doc.to_html, elided.to_html)
     end
 
     def trim_reply_and_extract_elided(text)