extract signatures from emails sent using Zimbra
This commit is contained in:
parent
a4b8813a02
commit
fe32733a57
|
@@ -274,7 +274,8 @@ module Email
|
||||||
markdown, elided_markdown = if html.present?
|
markdown, elided_markdown = if html.present?
|
||||||
# use the HTML extractor whose marker matches earliest in the HTML
|
# use the HTML extractor whose marker matches earliest in the HTML
|
||||||
if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r }
|
if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r }
|
||||||
self.send(:"extract_from_#{html_extracter[0]}", html)
|
doc = Nokogiri::HTML.fragment(html)
|
||||||
|
self.send(:"extract_from_#{html_extracter[0]}", doc)
|
||||||
else
|
else
|
||||||
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
|
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
|
||||||
markdown = trim_discourse_markers(markdown)
|
markdown = trim_discourse_markers(markdown)
|
||||||
|
@@ -295,70 +296,70 @@ module Email
|
||||||
end
|
end
|
||||||
|
|
||||||
HTML_EXTRACTERS ||= [
|
HTML_EXTRACTERS ||= [
|
||||||
[:gmail, /class="gmail_/],
|
[:gmail, / class="gmail_/],
|
||||||
[:outlook, /id="(divRplyFwdMsg|Signature)"/],
|
[:outlook, / id="(divRplyFwdMsg|Signature)"/],
|
||||||
[:word, /class="WordSection1"/],
|
[:word, / class="WordSection1"/],
|
||||||
[:exchange, /name="message(Body|Reply)Section"/],
|
[:exchange, / name="message(Body|Reply)Section"/],
|
||||||
[:apple_mail, /id="AppleMailSignature"/],
|
[:apple_mail, / id="AppleMailSignature"/],
|
||||||
[:mozilla, /class="moz-/],
|
[:mozilla, / class="moz-/],
|
||||||
[:protonmail, /class="protonmail_/],
|
[:protonmail, / class="protonmail_/],
|
||||||
|
[:zimbra, / data-marker="__/],
|
||||||
]
|
]
|
||||||
|
|
||||||
def extract_from_gmail(html)
|
def extract_from_gmail(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote
|
# GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote
|
||||||
# Just elide them all
|
# Just elide them all
|
||||||
elided = doc.css("*[class^='gmail_']").remove
|
elided = doc.css("*[class^='gmail_']").remove
|
||||||
to_markdown(doc.to_html, elided.to_html)
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_outlook(html)
|
def extract_from_outlook(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# Outlook properly identifies the signature and any replied/forwarded email
|
# Outlook properly identifies the signature and any replied/forwarded email
|
||||||
# Use their id to remove them and anything that comes after
|
# Use their id to remove them and anything that comes after
|
||||||
elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove
|
elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove
|
||||||
to_markdown(doc.to_html, elided.to_html)
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_word(html)
|
def extract_from_word(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# Word (?) keeps the content in the 'WordSection1' class and uses <p> tags
|
# Word (?) keeps the content in the 'WordSection1' class and uses <p> tags
|
||||||
# When there's something else (<table>, <div>, etc.) there's a high chance it's a signature or forwarded email
|
# When there's something else (<table>, <div>, etc.) there's a high chance it's a signature or forwarded email
|
||||||
elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove
|
elided = doc.css(".WordSection1 > :not(p):not(ul):first-of-type, .WordSection1 > :not(p):not(ul):first-of-type ~ *").remove
|
||||||
to_markdown(doc.at(".WordSection1").to_html, elided.to_html)
|
to_markdown(doc.at(".WordSection1").to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_exchange(html)
|
def extract_from_exchange(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
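# Exchange marks forwarded emails with a div whose name attribute is 'messageReplySection'
|
# Exchange marks forwarded emails with a div whose name attribute is 'messageReplySection'
|
||||||
# And uses 'messageBodySection' for the actual email (NOTE(review): the to_markdown call below re-selects 'messageReplySection' after removing it, so the kept part is always empty; 'messageBodySection' was probably intended)
|
# And uses 'messageBodySection' for the actual email (NOTE(review): the to_markdown call below re-selects 'messageReplySection' after removing it, so the kept part is always empty; 'messageBodySection' was probably intended)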
|
||||||
elided = doc.css("div[name='messageReplySection']").remove
|
elided = doc.css("div[name='messageReplySection']").remove
|
||||||
to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html)
|
to_markdown(doc.css("div[name='messageReplySection']").to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_apple_mail(html)
|
def extract_from_apple_mail(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules
|
# AppleMail is the worst. It adds 'AppleMailSignature' ids (!) to several div/p with no deterministic rules
|
||||||
# Our best guess is to elide whatever comes after that.
|
# Our best guess is to elide whatever comes after that.
|
||||||
elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove
|
elided = doc.css("#AppleMailSignature:last-of-type ~ *").remove
|
||||||
to_markdown(doc.to_html, elided.to_html)
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_mozilla(html)
|
def extract_from_mozilla(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# Mozilla (Thunderbird ?) properly identifies signature and forwarded emails
|
# Mozilla (Thunderbird ?) properly identifies signature and forwarded emails
|
||||||
# Remove them and anything that comes after
|
# Remove them and anything that comes after
|
||||||
elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove
|
elided = doc.css("*[class^='moz-'], *[class^='moz-'] ~ *").remove
|
||||||
to_markdown(doc.to_html, elided.to_html)
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_from_protonmail(html)
|
def extract_from_protonmail(doc)
|
||||||
doc = Nokogiri::HTML.fragment(html)
|
|
||||||
# Removes anything that has a class starting with "protonmail_" and everything after that
|
# Removes anything that has a class starting with "protonmail_" and everything after that
|
||||||
elided = doc.css("*[class^='protonmail_'], *[class^='protonmail_'] ~ *").remove
|
elided = doc.css("*[class^='protonmail_'], *[class^='protonmail_'] ~ *").remove
|
||||||
to_markdown(doc.to_html, elided.to_html)
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def extract_from_zimbra(doc)
|
||||||
|
# Removes anything that has a 'data-marker' attribute
|
||||||
|
elided = doc.css("*[data-marker]").remove
|
||||||
|
to_markdown(doc.to_html, elided.to_html)
|
||||||
|
end
|
||||||
|
|
||||||
def trim_reply_and_extract_elided(text)
|
def trim_reply_and_extract_elided(text)
|
||||||
return [text, ""] if @opts[:skip_trimming]
|
return [text, ""] if @opts[:skip_trimming]
|
||||||
EmailReplyTrimmer.trim(text, true)
|
EmailReplyTrimmer.trim(text, true)
|
||||||
|
|
Loading…
Reference in New Issue