diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb index 4c093de916b..a390c2061da 100644 --- a/lib/email/receiver.rb +++ b/lib/email/receiver.rb @@ -274,7 +274,8 @@ module Email markdown, elided_markdown = if html.present? # use the first html extracter that matches if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r } - self.send(:"extract_from_#{html_extracter[0]}", html) + doc = Nokogiri::HTML.fragment(html) + self.send(:"extract_from_#{html_extracter[0]}", doc) else markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown markdown = trim_discourse_markers(markdown) @@ -295,70 +296,70 @@ module Email end HTML_EXTRACTERS ||= [ - [:gmail, /class="gmail_/], - [:outlook, /id="(divRplyFwdMsg|Signature)"/], - [:word, /class="WordSection1"/], - [:exchange, /name="message(Body|Reply)Section"/], - [:apple_mail, /id="AppleMailSignature"/], - [:mozilla, /class="moz-/], - [:protonmail, /class="protonmail_/], + [:gmail, / class="gmail_/], + [:outlook, / id="(divRplyFwdMsg|Signature)"/], + [:word, / class="WordSection1"/], + [:exchange, / name="message(Body|Reply)Section"/], + [:apple_mail, / id="AppleMailSignature"/], + [:mozilla, / class="moz-/], + [:protonmail, / class="protonmail_/], + [:zimbra, / data-marker="__/], ] - def extract_from_gmail(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_gmail(doc) # GMail adds a bunch of 'gmail_' prefixed classes like: gmail_signature, gmail_extra, gmail_quote # Just elide them all elided = doc.css("*[class^='gmail_']").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_outlook(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_outlook(doc) # Outlook properly identifies the signature and any replied/forwarded email # Use their id to remove them and anything that comes after elided = doc.css("#Signature, #Signature ~ *, hr, #divRplyFwdMsg, #divRplyFwdMsg ~ *").remove to_markdown(doc.to_html, elided.to_html) end - def extract_from_word(html) - doc = Nokogiri::HTML.fragment(html) + def extract_from_word(doc) # Word (?) keeps the content in the 'WordSection1' class and uses
tags # When there's something else (