From fe32733a57220bbeb5313080b80719f3c4c7fed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Fri, 13 Apr 2018 19:04:27 +0200 Subject: [PATCH] extract signatures from emails sent using Zimbra --- lib/email/receiver.rb | 45 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index 4c093de916b..a390c2061da 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -274,7 +274,8 @@ module Email markdown, elided_markdown = if html.present? # use the first html extracter that matches if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r } - self.send(:"extract_from_#{html_extracter[0]}", html) + doc = Nokogiri::HTML.fragment(html) + self.send(:"extract_from_#{html_extracter[0]}", doc) else markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown markdown = trim_discourse_markers(markdown) @@ -295,70 +296,70 @@ module Email end HTML_EXTRACTERS ||= [ - [:gmail, /class="gmail_/], - [:outlook, /id="(divRplyFwdMsg|Signature)"/], - [:word, /class="WordSection1"/], - [:exchange, /name="message(Body|Reply)Section"/], - [:apple_mail, /id="AppleMailSignature"/], - [:mozilla, /class="moz-/], - [:protonmail, /class="protonmail_/], + [:gmail, / class="gmail_/], + [:outlook, / id="(divRplyFwdMsg|Signature)"/], + [:word, / class="WordSection1"/], + [:exchange, / name="message(Body|Reply)Section"/], + [:apple_mail, / id="AppleMailSignature"/], + [:mozilla, / class="moz-/], + [:protonmail, / class="protonmail_/], + [:zimbra, / data-marker="__/], ] - def extract_from_gmail(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_gmail(doc) # GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote # Just elide them all elided = doc.css("*[class^='gmail_']").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_outlook(html) - doc = 
Nokogiri::HTML.fragment(html) + def extract_from_outlook(doc) # Outlook properly identifies the signature and any replied/forwarded email # Use their id to remove them and anything that comes after elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_word(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_word(doc) # Word (?) keeps the content in the 'WordSection1' class and uses

<p> tags # When there's something else (<a>, <div>, etc..) there's high chance it's a signature or forwarded email elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove to_markdown(doc.at(".WordSection1").to_html, elided.to_html) end - def extract_from_exchange(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_exchange(doc) # Exchange is using the 'messageReplySection' class for forwarded emails # And 'messageBodySection' for the actual email elided = doc.css("div[name='messageReplySection']").remove to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html) end - def extract_from_apple_mail(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_apple_mail(doc) # AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules # Our best guess is to elide whatever comes after that. elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_mozilla(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_mozilla(doc) # Mozilla (Thunderbird ?) properly identifies signature and forwarded emails # Remove them and anything that comes after elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_protonmail(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_protonmail(doc) # Removes anything that has a class starting with "protonmail_" and everything after that elided = doc.css("*[class^='protonmail_'], *[class^='protonmail_'] ~ *").remove to_markdown(doc.to_html, elided.to_html) end + def extract_from_zimbra(doc) + # Removes anything that has a 'data-marker' attribute + elided = doc.css("*[data-marker]").remove + to_markdown(doc.to_html, elided.to_html) + end + def trim_reply_and_extract_elided(text) return [text, ""] if @opts[:skip_trimming] EmailReplyTrimmer.trim(text, true)