extract signatures from emails sent using Zimbra

This commit is contained in:
Régis Hanol 2018-04-13 19:04:27 +02:00
parent a4b8813a02
commit fe32733a57
1 changed file with 23 additions and 22 deletions

View File

@@ -274,7 +274,8 @@ module Email
markdown, elided_markdown = if html.present? markdown, elided_markdown = if html.present?
# use the first html extracter that matches # use the first html extracter that matches
if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r } if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r }
self.send(:"extract_from_#{html_extracter[0]}", html) doc = Nokogiri::HTML.fragment(html)
self.send(:"extract_from_#{html_extracter[0]}", doc)
else else
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
markdown = trim_discourse_markers(markdown) markdown = trim_discourse_markers(markdown)
@@ -302,63 +303,63 @@ module Email
[:apple_mail, / id="AppleMailSignature"/], [:apple_mail, / id="AppleMailSignature"/],
[:mozilla, / class="moz-/], [:mozilla, / class="moz-/],
[:protonmail, / class="protonmail_/], [:protonmail, / class="protonmail_/],
[:zimbra, / data-marker="__/],
] ]
def extract_from_gmail(html) def extract_from_gmail(doc)
doc = Nokogiri::HTML.fragment(html)
# GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote # GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote
# Just elide them all # Just elide them all
elided = doc.css("*[class^='gmail_']").remove elided = doc.css("*[class^='gmail_']").remove
to_markdown(doc.to_html, elided.to_html) to_markdown(doc.to_html, elided.to_html)
end end
def extract_from_outlook(html) def extract_from_outlook(doc)
doc = Nokogiri::HTML.fragment(html)
# Outlook properly identifies the signature and any replied/forwarded email # Outlook properly identifies the signature and any replied/forwarded email
# Use their id to remove them and anything that comes after # Use their id to remove them and anything that comes after
elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove
to_markdown(doc.to_html, elided.to_html) to_markdown(doc.to_html, elided.to_html)
end end
def extract_from_word(html) def extract_from_word(doc)
doc = Nokogiri::HTML.fragment(html)
# Word (?) keeps the content in the 'WordSection1' class and uses <p> tags # Word (?) keeps the content in the 'WordSection1' class and uses <p> tags
# When there's something else (<table>, <div>, etc..) there's high chance it's a signature or forwarded email # When there's something else (<table>, <div>, etc..) there's high chance it's a signature or forwarded email
elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove
to_markdown(doc.at(".WordSection1").to_html, elided.to_html) to_markdown(doc.at(".WordSection1").to_html, elided.to_html)
end end
def extract_from_exchange(html) def extract_from_exchange(doc)
doc = Nokogiri::HTML.fragment(html)
# Exchange is using the 'messageReplySection' class for forwarded emails # Exchange is using the 'messageReplySection' class for forwarded emails
# And 'messageBodySection' for the actual email # And 'messageBodySection' for the actual email
elided = doc.css("div[name='messageReplySection']").remove elided = doc.css("div[name='messageReplySection']").remove
to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html) to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html)
end end
def extract_from_apple_mail(html) def extract_from_apple_mail(doc)
doc = Nokogiri::HTML.fragment(html)
# AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules # AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules
# Our best guess is to elide whatever comes after that. # Our best guess is to elide whatever comes after that.
elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove
to_markdown(doc.to_html, elided.to_html) to_markdown(doc.to_html, elided.to_html)
end end
def extract_from_mozilla(html) def extract_from_mozilla(doc)
doc = Nokogiri::HTML.fragment(html)
# Mozilla (Thunderbird ?) properly identifies signature and forwarded emails # Mozilla (Thunderbird ?) properly identifies signature and forwarded emails
# Remove them and anything that comes after # Remove them and anything that comes after
elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove
to_markdown(doc.to_html, elided.to_html) to_markdown(doc.to_html, elided.to_html)
end end
def extract_from_protonmail(html) def extract_from_protonmail(doc)
doc = Nokogiri::HTML.fragment(html)
# Removes anything that has a class starting with "protonmail_" and everything after that # Removes anything that has a class starting with "protonmail_" and everything after that
elided = doc.css("*[class^='protonmail_'], *[class^='protonmail_'] ~ *").remove elided = doc.css("*[class^='protonmail_'], *[class^='protonmail_'] ~ *").remove
to_markdown(doc.to_html, elided.to_html) to_markdown(doc.to_html, elided.to_html)
end end
def extract_from_zimbra(doc)
# Removes anything that has a 'data-marker' attribute
elided = doc.css("*[data-marker]").remove
to_markdown(doc.to_html, elided.to_html)
end
def trim_reply_and_extract_elided(text) def trim_reply_and_extract_elided(text)
return [text, ""] if @opts[:skip_trimming] return [text, ""] if @opts[:skip_trimming]
EmailReplyTrimmer.trim(text, true) EmailReplyTrimmer.trim(text, true)