From 7ff38d5502606bd2dae0a1a559ed96d0f4dce258 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Tue, 15 Jul 2014 21:44:28 +0000 Subject: [PATCH] SOLR-2245: Improvements to the MailEntityProcessor git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1610859 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/ivy-versions.properties | 3 +- solr/CHANGES.txt | 7 + solr/NOTICE.txt | 4 +- .../dataimport/MailEntityProcessor.java | 663 ++++++++++++------ solr/contrib/dataimporthandler/ivy.xml | 3 +- solr/example/example-DIH/README.txt | 2 +- .../solr/mail/conf/mail-data-config.xml | 12 + solr/licenses/gimap-1.5.1.jar.sha1 | 1 + solr/licenses/javax.mail-1.5.1.jar.sha1 | 1 + solr/licenses/log4j-1.2.16.jar.sha1 | 1 + solr/licenses/mail-1.4.3.jar.sha1 | 1 - 11 files changed, 472 insertions(+), 226 deletions(-) create mode 100644 solr/licenses/gimap-1.5.1.jar.sha1 create mode 100644 solr/licenses/javax.mail-1.5.1.jar.sha1 create mode 100644 solr/licenses/log4j-1.2.16.jar.sha1 delete mode 100644 solr/licenses/mail-1.4.3.jar.sha1 diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 0d57a0058d8..b89990374bd 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -67,7 +67,8 @@ com.sun.jersey.version = 1.9 /jakarta-regexp/jakarta-regexp = 1.4 /javax.activation/activation = 1.1.1 /javax.inject/javax.inject= 1 -/javax.mail/mail = 1.4.3 +/com.sun.mail/javax.mail = 1.5.1 +/com.sun.mail/gimap = 1.5.1 /javax.servlet/javax.servlet-api = 3.0.1 /javax.servlet/servlet-api = 2.4 /jdom/jdom = 1.0 diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 3feb254f14c..46a30e29aef 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -70,6 +70,13 @@ New Features * SOLR-6103: Added DateRangeField for indexing date ranges, especially multi-valued ones. Based on LUCENE-5648. (David Smiley) +* SOLR-2245: Improvements to the MailEntityProcessor: + - Support for server-side date filtering if using GMail; requires new + dependency on the Sun Gmail Java mail extensions + - Support for using the last_index_time from the previous run as the + value for the fetchMailsSince filter. + (Peter Sturge, Timothy Potter) + Other Changes ---------------------- diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt index 8a5061abe2a..4ef39ecabd9 100644 --- a/solr/NOTICE.txt +++ b/solr/NOTICE.txt @@ -65,8 +65,8 @@ Copyright (c) 2004, Sun Microsystems, Inc. Copyright (c) 2006, John Kristian License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) -JavaMail API 1.4.1: https://glassfish.dev.java.net/javaee5/mail/ -License: Common Development and Distribution License (CDDL) v1.0 (https://glassfish.dev.java.net/public/CDDLv1.0.html) +JavaMail API 1.5.1: https://glassfish.dev.java.net/javaee5/mail/ +License: Common Development and Distribution License (CDDL) v1.1 (https://glassfish.java.net/public/CDDL+GPL_1_1.html) JavaBeans Activation Framework (JAF): http://java.sun.com/products/javabeans/jaf/index.jsp License: Common Development and Distribution License (CDDL) v1.0 (https://glassfish.dev.java.net/public/CDDLv1.0.html) diff --git a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java index 756b76409ed..e1c5e37f4ce 100644 --- a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -18,10 +18,9 @@ package org.apache.solr.handler.dataimport; import com.sun.mail.imap.IMAPMessage; +import org.apache.solr.handler.dataimport.config.ConfigNameConstants; import org.apache.tika.Tika; -import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,33 +29,40 @@ import javax.mail.internet.AddressException; import javax.mail.internet.ContentType; import javax.mail.internet.InternetAddress; import javax.mail.internet.MimeMessage; -import javax.mail.search.AndTerm; -import javax.mail.search.ComparisonTerm; -import javax.mail.search.ReceivedDateTerm; -import javax.mail.search.SearchTerm; +import javax.mail.search.*; + import java.io.InputStream; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; +import com.sun.mail.gimap.GmailFolder; +import com.sun.mail.gimap.GmailRawSearchTerm; + /** - * An {@link EntityProcessor} instance which can index emails along with their attachments from POP3 or IMAP sources. Refer to - * http://wiki.apache.org/solr/DataImportHandler for more - * details. This API is experimental and subject to change - * - * + * An EntityProcessor instance which can index emails along with their + * attachments from POP3 or IMAP sources. Refer to http://wiki.apache.org/solr/DataImportHandler for more details. This + * API is experimental and subject to change + * + * @version $Id$ * @since solr 1.4 */ public class MailEntityProcessor extends EntityProcessorBase { - + + private static final SimpleDateFormat sinceDateParser = + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private static final SimpleDateFormat afterFmt = + new SimpleDateFormat("yyyy/MM/dd"); + public static interface CustomFilter { public SearchTerm getCustomSearch(Folder folder); } - - @Override + public void init(Context context) { super.init(context); - // set attributes using XXX getXXXFromContext(attribute, defualtValue); + // set attributes using XXX getXXXFromContext(attribute, defualtValue); // applies variable resolver and return default if value is not found or null // REQUIRED : connection and folder info user = getStringFromContext("user", null); @@ -66,60 +72,116 @@ public class MailEntityProcessor extends EntityProcessorBase { folderNames = getStringFromContext("folders", null); // validate if (host == null || protocol == null || user == null || password == null - || folderNames == null) - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "'user|password|protocol|host|folders' are required attributes"); - - //OPTIONAL : have defaults and are optional + || folderNames == null) throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, + "'user|password|protocol|host|folders' are required attributes"); + + // OPTIONAL : have defaults and are optional recurse = getBoolFromContext("recurse", true); + + exclude.clear(); String excludes = getStringFromContext("exclude", ""); if (excludes != null && !excludes.trim().equals("")) { exclude = Arrays.asList(excludes.split(",")); } + + include.clear(); String includes = getStringFromContext("include", ""); if (includes != null && !includes.trim().equals("")) { include = Arrays.asList(includes.split(",")); } batchSize = getIntFromContext("batchSize", 20); customFilter = getStringFromContext("customFilter", ""); - String s = getStringFromContext("fetchMailsSince", null); - if (s != null) - try { - fetchMailsSince = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT).parse(s); - } catch (ParseException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Invalid value for fetchMailSince: " + s, e); - } + if (filters != null) filters.clear(); + folderIter = null; + msgIter = null; + + String lastIndexTime = null; + String command = + String.valueOf(context.getRequestParameters().get("command")); + if (!DataImporter.FULL_IMPORT_CMD.equals(command)) + throw new IllegalArgumentException(this.getClass().getSimpleName()+ + " only supports "+DataImporter.FULL_IMPORT_CMD); + + // Read the last_index_time out of the dataimport.properties if available + String cname = getStringFromContext("name", "mailimporter"); + String varName = ConfigNameConstants.IMPORTER_NS_SHORT + "." + cname + "." + + DocBuilder.LAST_INDEX_TIME; + Object varValue = context.getVariableResolver().resolve(varName); + if (varValue == null || "".equals(varValue)) { + varName = ConfigNameConstants.IMPORTER_NS_SHORT + "." + + DocBuilder.LAST_INDEX_TIME; + varValue = context.getVariableResolver().resolve(varName); + } + + if (varValue != null && varValue instanceof String) { + lastIndexTime = (String)varValue; + if (lastIndexTime != null && lastIndexTime.length() == 0) + lastIndexTime = null; + } + + if (lastIndexTime == null) + lastIndexTime = getStringFromContext("fetchMailsSince", ""); + LOG.info("Using lastIndexTime "+lastIndexTime+" for mail import"); + + this.fetchMailsSince = null; + if (lastIndexTime != null && lastIndexTime.length() > 0) { + try { + fetchMailsSince = sinceDateParser.parse(lastIndexTime); + LOG.info("Parsed fetchMailsSince=" + lastIndexTime); + } catch (ParseException e) { + throw new DataImportHandlerException(DataImportHandlerException.SEVERE, + "Invalid value for fetchMailSince: " + lastIndexTime, e); + } + } + fetchSize = getIntFromContext("fetchSize", 32 * 1024); cTimeout = getIntFromContext("connectTimeout", 30 * 1000); rTimeout = getIntFromContext("readTimeout", 60 * 1000); - processAttachment = getBoolFromContext( - getStringFromContext("processAttachment",null) == null ? "processAttachement":"processAttachment" - , true); - - tika = new Tika(); + String tmp = context.getEntityAttribute("includeOtherUserFolders"); + includeOtherUserFolders = (tmp != null && Boolean.valueOf(tmp.trim())); + tmp = context.getEntityAttribute("includeSharedFolders"); + includeSharedFolders = (tmp != null && Boolean.valueOf(tmp.trim())); + + setProcessAttachmentConfig(); + includeContent = getBoolFromContext("includeContent", true); + logConfig(); } - + + private void setProcessAttachmentConfig() { + processAttachment = true; + String tbval = context.getEntityAttribute("processAttachments"); + if (tbval == null) { + tbval = context.getEntityAttribute("processAttachement"); + if (tbval != null) processAttachment = Boolean.valueOf(tbval); + } else processAttachment = Boolean.valueOf(tbval); + } + @Override - public Map nextRow() { - Message mail; - Map row = null; + public Map nextRow() { + Message mail = null; + Map row = null; do { // try till there is a valid document or folders get exhausted. // when mail == NULL, it means end of processing - mail = getNextMail(); + mail = getNextMail(); + if (mail != null) row = getDocumentFromMail(mail); - } while (row == null && mail != null); + + if (row != null && row.get("folder") == null) + row.put("folder", mail.getFolder().getFullName()); + + } while (row == null && mail != null); return row; } - + private Message getNextMail() { if (!connected) { - if (!connectToMailBox()) - return null; + if (!connectToMailBox()) return null; connected = true; } if (folderIter == null) { @@ -131,119 +193,127 @@ public class MailEntityProcessor extends EntityProcessorBase { // loop till a valid mail or all folders exhausted. while (msgIter == null || !msgIter.hasNext()) { Folder next = folderIter.hasNext() ? folderIter.next() : null; - if (next == null) { - return null; - } + if (next == null) return null; + msgIter = new MessageIterator(next, batchSize); } return msgIter.next(); } - - private Map getDocumentFromMail(Message mail) { - Map row = new HashMap<>(); + + private Map getDocumentFromMail(Message mail) { + Map row = new HashMap<>(); try { addPartToDocument(mail, row, true); return row; } catch (Exception e) { + LOG.error("Failed to convert message [" + mail.toString() + + "] to document due to: " + e, e); return null; } } - - public void addPartToDocument(Part part, Map row, boolean outerMost) throws Exception { + + public void addPartToDocument(Part part, Map row, boolean outerMost) throws Exception { if (part instanceof Message) { - addEnvelopToDocument(part, row); + addEnvelopeToDocument(part, row); } - - String ct = part.getContentType(); + + String ct = part.getContentType().toLowerCase(); ContentType ctype = new ContentType(ct); if (part.isMimeType("multipart/*")) { - Multipart mp = (Multipart) part.getContent(); - int count = mp.getCount(); - if (part.isMimeType("multipart/alternative")) - count = 1; - for (int i = 0; i < count; i++) - addPartToDocument(mp.getBodyPart(i), row, false); + Object content = part.getContent(); + if (content != null && content instanceof Multipart) { + Multipart mp = (Multipart) part.getContent(); + int count = mp.getCount(); + if (part.isMimeType("multipart/alternative")) count = 1; + for (int i = 0; i < count; i++) + addPartToDocument(mp.getBodyPart(i), row, false); + } else { + LOG.warn("Multipart content is a not an instance of Multipart! Content is: " + + (content != null ? content.getClass().getName() : "null") + + ". Typically, this is due to the Java Activation JAR being loaded by the wrong classloader."); + } } else if (part.isMimeType("message/rfc822")) { addPartToDocument((Part) part.getContent(), row, false); } else { String disp = part.getDisposition(); - if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return; - InputStream is = part.getInputStream(); - String fileName = part.getFileName(); - Metadata md = new Metadata(); - md.set(HttpHeaders.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ROOT)); - md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName); - String content = tika.parseToString(is, md); - if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) { - if (row.get(ATTACHMENT) == null) - row.put(ATTACHMENT, new ArrayList()); - List contents = (List) row.get(ATTACHMENT); - contents.add(content); - row.put(ATTACHMENT, contents); - if (row.get(ATTACHMENT_NAMES) == null) - row.put(ATTACHMENT_NAMES, new ArrayList()); - List names = (List) row.get(ATTACHMENT_NAMES); - names.add(fileName); - row.put(ATTACHMENT_NAMES, names); - } else { - if (row.get(CONTENT) == null) - row.put(CONTENT, new ArrayList()); + if (includeContent + && !(disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) { + InputStream is = part.getInputStream(); + Metadata contentTypeHint = new Metadata(); + contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType() + .toLowerCase(Locale.ENGLISH)); + String content = (new Tika()).parseToString(is, contentTypeHint); + if (row.get(CONTENT) == null) row.put(CONTENT, new ArrayList()); List contents = (List) row.get(CONTENT); - contents.add(content); + contents.add(content.trim()); row.put(CONTENT, contents); } + if (!processAttachment || disp == null + || !disp.equalsIgnoreCase(Part.ATTACHMENT)) return; + InputStream is = part.getInputStream(); + String fileName = part.getFileName(); + Metadata contentTypeHint = new Metadata(); + contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType() + .toLowerCase(Locale.ENGLISH)); + String content = (new Tika()).parseToString(is, contentTypeHint); + if (content == null || content.trim().length() == 0) return; + + if (row.get(ATTACHMENT) == null) row.put(ATTACHMENT, + new ArrayList()); + List contents = (List) row.get(ATTACHMENT); + contents.add(content.trim()); + row.put(ATTACHMENT, contents); + if (row.get(ATTACHMENT_NAMES) == null) row.put(ATTACHMENT_NAMES, + new ArrayList()); + List names = (List) row.get(ATTACHMENT_NAMES); + names.add(fileName); + row.put(ATTACHMENT_NAMES, names); } } - - private void addEnvelopToDocument(Part part, Map row) throws MessagingException { + + private void addEnvelopeToDocument(Part part, Map row) + throws MessagingException { MimeMessage mail = (MimeMessage) part; Address[] adresses; - if ((adresses = mail.getFrom()) != null && adresses.length > 0) - row.put(FROM, adresses[0].toString()); - + if ((adresses = mail.getFrom()) != null && adresses.length > 0) row.put( + FROM, adresses[0].toString()); + List to = new ArrayList<>(); - if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null) - addAddressToList(adresses, to); - if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null) - addAddressToList(adresses, to); - if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null) - addAddressToList(adresses, to); - if (to.size() > 0) - row.put(TO_CC_BCC, to); - + if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null) addAddressToList( + adresses, to); + if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null) addAddressToList( + adresses, to); + if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null) addAddressToList( + adresses, to); + if (to.size() > 0) row.put(TO_CC_BCC, to); + row.put(MESSAGE_ID, mail.getMessageID()); row.put(SUBJECT, mail.getSubject()); - + Date d = mail.getSentDate(); if (d != null) { row.put(SENT_DATE, d); } - + List flags = new ArrayList<>(); for (Flags.Flag flag : mail.getFlags().getSystemFlags()) { - if (flag == Flags.Flag.ANSWERED) - flags.add(FLAG_ANSWERED); - else if (flag == Flags.Flag.DELETED) - flags.add(FLAG_DELETED); - else if (flag == Flags.Flag.DRAFT) - flags.add(FLAG_DRAFT); - else if (flag == Flags.Flag.FLAGGED) - flags.add(FLAG_FLAGGED); - else if (flag == Flags.Flag.RECENT) - flags.add(FLAG_RECENT); - else if (flag == Flags.Flag.SEEN) - flags.add(FLAG_SEEN); + if (flag == Flags.Flag.ANSWERED) flags.add(FLAG_ANSWERED); + else if (flag == Flags.Flag.DELETED) flags.add(FLAG_DELETED); + else if (flag == Flags.Flag.DRAFT) flags.add(FLAG_DRAFT); + else if (flag == Flags.Flag.FLAGGED) flags.add(FLAG_FLAGGED); + else if (flag == Flags.Flag.RECENT) flags.add(FLAG_RECENT); + else if (flag == Flags.Flag.SEEN) flags.add(FLAG_SEEN); } flags.addAll(Arrays.asList(mail.getFlags().getUserFlags())); + if (flags.size() == 0) flags.add(FLAG_NONE); row.put(FLAGS, flags); - + String[] hdrs = mail.getHeader("X-Mailer"); - if (hdrs != null) - row.put(XMAILER, hdrs[0]); + if (hdrs != null) row.put(XMAILER, hdrs[0]); } - - - private void addAddressToList(Address[] adresses, List to) throws AddressException { + + private void addAddressToList(Address[] adresses, List to) + throws AddressException { for (Address address : adresses) { to.add(address.toString()); InternetAddress ia = (InternetAddress) address; @@ -254,25 +324,60 @@ public class MailEntityProcessor extends EntityProcessorBase { } } } - + private boolean connectToMailBox() { + // this is needed to load the activation mail stuff correctly + // otherwise, the JavaMail multipart support doesn't get configured + // correctly, which leads to a class cast exception when processing + // multipart messages: IMAPInputStream cannot be cast to + // javax.mail.Multipart + Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); + try { Properties props = new Properties(); + if (System.getProperty("mail.debug") != null) + props.setProperty("mail.debug", System.getProperty("mail.debug")); + + if (("imap".equals(protocol) || "imaps".equals(protocol)) + && "imap.gmail.com".equals(host)) { + LOG.info("Consider using 'gimaps' protocol instead of '" + protocol + + "' for enabling GMail specific extensions for " + host); + } + props.setProperty("mail.store.protocol", protocol); - props.setProperty("mail.imap.fetchsize", "" + fetchSize); - props.setProperty("mail.imap.timeout", "" + rTimeout); - props.setProperty("mail.imap.connectiontimeout", "" + cTimeout); + + String imapPropPrefix = protocol.startsWith("gimap") ? "gimap" : "imap"; + props.setProperty("mail." + imapPropPrefix + ".fetchsize", "" + fetchSize); + props.setProperty("mail." + imapPropPrefix + ".timeout", "" + rTimeout); + props.setProperty("mail." + imapPropPrefix + ".connectiontimeout", "" + cTimeout); + + int port = -1; + int colonAt = host.indexOf(":"); + if (colonAt != -1) { + port = Integer.parseInt(host.substring(colonAt + 1)); + host = host.substring(0, colonAt); + } + Session session = Session.getDefaultInstance(props, null); mailbox = session.getStore(protocol); - mailbox.connect(host, user, password); - LOG.info("Connected to mailbox"); + if (port != -1) { + mailbox.connect(host, port, user, password); + } else { + mailbox.connect(host, user, password); + } + LOG.info("Connected to " + user + "'s mailbox on " + host); + return true; - } catch (MessagingException e) { + } catch (MessagingException e) { + String errMsg = String.format(Locale.ENGLISH, + "Failed to connect to %s server %s as user %s due to: %s", protocol, + host, user, e.toString()); + LOG.error(errMsg, e); throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Connection failed", e); + errMsg, e); } } - + private void createFilters() { if (fetchMailsSince != null) { filters.add(new MailsSinceLastCheckFilter(fetchMailsSince)); @@ -286,49 +391,76 @@ public class MailEntityProcessor extends EntityProcessorBase { } } catch (Exception e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Custom filter could not be created", e); + "Custom filter could not be created", e); } } } - + private void logConfig() { if (!LOG.isInfoEnabled()) return; - StringBuilder config = new StringBuilder(); - config.append("user : ").append(user).append(System.getProperty("line.separator")); - config.append("pwd : ").append(password).append(System.getProperty("line.separator")); - config.append("protocol : ").append(protocol).append(System.getProperty("line.separator")); - config.append("host : ").append(host).append(System.getProperty("line.separator")); - config.append("folders : ").append(folderNames).append(System.getProperty("line.separator")); - config.append("recurse : ").append(recurse).append(System.getProperty("line.separator")); - config.append("exclude : ").append(exclude.toString()).append(System.getProperty("line.separator")); - config.append("include : ").append(include.toString()).append(System.getProperty("line.separator")); - config.append("batchSize : ").append(batchSize).append(System.getProperty("line.separator")); - config.append("fetchSize : ").append(fetchSize).append(System.getProperty("line.separator")); - config.append("read timeout : ").append(rTimeout).append(System.getProperty("line.separator")); - config.append("conection timeout : ").append(cTimeout).append(System.getProperty("line.separator")); - config.append("custom filter : ").append(customFilter).append(System.getProperty("line.separator")); - config.append("fetch mail since : ").append(fetchMailsSince).append(System.getProperty("line.separator")); + + String lineSep = System.getProperty("line.separator"); + + StringBuffer config = new StringBuffer(); + config.append("user : ").append(user).append(lineSep); + config + .append("pwd : ") + .append( + password != null && password.length() > 0 ? "" : "") + .append(lineSep); + config.append("protocol : ").append(protocol) + .append(lineSep); + config.append("host : ").append(host) + .append(lineSep); + config.append("folders : ").append(folderNames) + .append(lineSep); + config.append("recurse : ").append(recurse) + .append(lineSep); + config.append("exclude : ").append(exclude.toString()) + .append(lineSep); + config.append("include : ").append(include.toString()) + .append(lineSep); + config.append("batchSize : ").append(batchSize) + .append(lineSep); + config.append("fetchSize : ").append(fetchSize) + .append(lineSep); + config.append("read timeout : ").append(rTimeout) + .append(lineSep); + config.append("conection timeout : ").append(cTimeout) + .append(lineSep); + config.append("custom filter : ").append(customFilter) + .append(lineSep); + config.append("fetch mail since : ").append(fetchMailsSince) + .append(lineSep); + config.append("includeContent : ").append(includeContent) + .append(lineSep); + config.append("processAttachments : ").append(processAttachment) + .append(lineSep); + config.append("includeOtherUserFolders : ").append(includeOtherUserFolders) + .append(lineSep); + config.append("includeSharedFolders : ").append(includeSharedFolders) + .append(lineSep); LOG.info(config.toString()); } - + class FolderIterator implements Iterator { private Store mailbox; private List topLevelFolders; private List folders = null; private Folder lastFolder = null; - + public FolderIterator(Store mailBox) { this.mailbox = mailBox; folders = new ArrayList<>(); getTopLevelFolders(mailBox); + if (includeOtherUserFolders) getOtherUserFolders(); + if (includeSharedFolders) getSharedFolders(); } - - @Override + public boolean hasNext() { return !folders.isEmpty(); } - - @Override + public Folder next() { try { boolean hasMessages = false; @@ -358,83 +490,142 @@ public class MailEntityProcessor extends EntityProcessorBase { folders.add(0, children[i]); LOG.info("child name : " + children[i].getFullName()); } - if (children.length == 0) - LOG.info("NO children : "); + if (children.length == 0) LOG.info("NO children : "); } } - } - while (!hasMessages); + } while (!hasMessages); return next; - } catch (MessagingException e) { - //throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - // "Folder open failed", e); + } catch (Exception e) { + LOG.warn("Failed to read folders due to: "+e); + // throw new + // DataImportHandlerException(DataImportHandlerException.SEVERE, + // "Folder open failed", e); } return null; } - - @Override + public void remove() { throw new UnsupportedOperationException("Its read only mode..."); } - + private void getTopLevelFolders(Store mailBox) { - if (folderNames != null) - topLevelFolders = Arrays.asList(folderNames.split(",")); + if (folderNames != null) topLevelFolders = Arrays.asList(folderNames + .split(",")); for (int i = 0; topLevelFolders != null && i < topLevelFolders.size(); i++) { try { folders.add(mailbox.getFolder(topLevelFolders.get(i))); } catch (MessagingException e) { // skip bad ones unless its the last one and still no good folder - if (folders.size() == 0 && i == topLevelFolders.size() - 1) - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Folder retreival failed"); + if (folders.size() == 0 && i == topLevelFolders.size() - 1) throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Folder retreival failed"); } } if (topLevelFolders == null || topLevelFolders.size() == 0) { try { folders.add(mailBox.getDefaultFolder()); } catch (MessagingException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Folder retreival failed"); + throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Folder retreival failed"); } } } - + + private void getOtherUserFolders() { + try { + Folder[] ufldrs = mailbox.getUserNamespaces(null); + if (ufldrs != null) { + LOG.info("Found " + ufldrs.length + " user namespace folders"); + for (Folder ufldr : ufldrs) + folders.add(ufldr); + } + } catch (MessagingException me) { + LOG.warn("Messaging exception retrieving user namespaces: " + + me.getMessage()); + } + } + + private void getSharedFolders() { + try { + Folder[] sfldrs = mailbox.getSharedNamespaces(); + if (sfldrs != null) { + LOG.info("Found " + sfldrs.length + " shared namespace folders"); + for (Folder sfldr : sfldrs) + folders.add(sfldr); + } + } catch (MessagingException me) { + LOG.warn("Messaging exception retrieving shared namespaces: " + + me.getMessage()); + } + } + private boolean excludeFolder(String name) { for (String s : exclude) { - if (name.matches(s)) - return true; + if (name.matches(s)) return true; } for (String s : include) { - if (name.matches(s)) - return false; + if (name.matches(s)) return false; } return include.size() > 0; } } - - class MessageIterator implements Iterator { + + class MessageIterator extends SearchTerm implements Iterator { private Folder folder; - private Message[] messagesInCurBatch; + private Message[] messagesInCurBatch = null; private int current = 0; private int currentBatch = 0; private int batchSize = 0; private int totalInFolder = 0; private boolean doBatching = true; - + public MessageIterator(Folder folder, int batchSize) { + super(); + try { this.folder = folder; this.batchSize = batchSize; SearchTerm st = getSearchTerm(); - if (st != null) { + + LOG.info("SearchTerm=" + st); + + if (st != null || folder instanceof GmailFolder) { doBatching = false; - messagesInCurBatch = folder.search(st); + // Searching can still take a while even though we're only pulling + // envelopes; unless you're using gmail server-side filter, which is + // fast + LOG.info("Searching folder " + folder.getName() + " for messages"); + long searchAtMs = System.currentTimeMillis(); + + // If using GMail, speed up the envelope processing by doing a + // server-side + // search for messages occurring on or after the fetch date (at + // midnight), + // which reduces the number of envelopes we need to pull from the + // server + // to apply the precise DateTerm filter; GMail server-side search has + // date + // granularity only but the local filters are also applied + + if (folder instanceof GmailFolder && fetchMailsSince != null) { + String afterCrit = "after:" + afterFmt.format(fetchMailsSince); + LOG.info("Added server-side gmail filter: " + afterCrit); + Message[] afterMessages = folder.search(new GmailRawSearchTerm( + afterCrit)); + + LOG.info("GMail server-side filter found " + afterMessages.length + + " messages received " + afterCrit + " in folder " + folder.getName()); + + // now pass in the server-side filtered messages to the local filter + messagesInCurBatch = folder.search((st != null ? st : this), afterMessages); + } else { + messagesInCurBatch = folder.search(st); + } totalInFolder = messagesInCurBatch.length; folder.fetch(messagesInCurBatch, fp); current = 0; + long tookMs = (System.currentTimeMillis() - searchAtMs); LOG.info("Total messages : " + totalInFolder); - LOG.info("Search criteria applied. Batching disabled"); + LOG.info("Search criteria applied. Batching disabled. Took " + tookMs + " (ms)"); } else { totalInFolder = folder.getMessageCount(); LOG.info("Total messages : " + totalInFolder); @@ -442,60 +633,55 @@ public class MailEntityProcessor extends EntityProcessorBase { } } catch (MessagingException e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Message retreival failed", e); + "Message retreival failed", e); } } - - private void getNextBatch(int batchSize, Folder folder) throws MessagingException { + + private void getNextBatch(int batchSize, Folder folder) + throws MessagingException { // after each batch invalidate cache if (messagesInCurBatch != null) { for (Message m : messagesInCurBatch) { - if (m instanceof IMAPMessage) - ((IMAPMessage) m).invalidateHeaders(); + if (m instanceof IMAPMessage) ((IMAPMessage) m).invalidateHeaders(); } } int lastMsg = (currentBatch + 1) * batchSize; lastMsg = lastMsg > totalInFolder ? totalInFolder : lastMsg; - messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, lastMsg); + messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, + lastMsg); folder.fetch(messagesInCurBatch, fp); current = 0; currentBatch++; LOG.info("Current Batch : " + currentBatch); LOG.info("Messages in this batch : " + messagesInCurBatch.length); } - - @Override + public boolean hasNext() { boolean hasMore = current < messagesInCurBatch.length; - if (!hasMore && doBatching - && currentBatch * batchSize < totalInFolder) { + if (!hasMore && doBatching && currentBatch * batchSize < totalInFolder) { // try next batch try { getNextBatch(batchSize, folder); hasMore = current < messagesInCurBatch.length; } catch (MessagingException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Message retreival failed", e); + throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Message retreival failed", e); } } return hasMore; } - - @Override + public Message next() { return hasNext() ? messagesInCurBatch[current++] : null; } - - @Override + public void remove() { throw new UnsupportedOperationException("Its read only mode..."); } - + private SearchTerm getSearchTerm() { - if (filters.size() == 0) - return null; - if (filters.size() == 1) - return filters.get(0).getCustomSearch(folder); + if (filters.size() == 0) return null; + if (filters.size() == 1) return filters.get(0).getCustomSearch(folder); SearchTerm last = filters.get(0).getCustomSearch(folder); for (int i = 1; i < filters.size(); i++) { CustomFilter filter = filters.get(i); @@ -506,44 +692,83 @@ public class MailEntityProcessor extends EntityProcessorBase { } return last; } + + public boolean match(Message message) { + return true; + } } - + class MailsSinceLastCheckFilter implements CustomFilter { - + private Date since; - + public MailsSinceLastCheckFilter(Date date) { since = date; } - - @Override - public SearchTerm getCustomSearch(Folder folder) { - return new ReceivedDateTerm(ComparisonTerm.GE, since); + + @SuppressWarnings("serial") + public SearchTerm getCustomSearch(final Folder folder) { + LOG.info("Building mail filter for messages in " + folder.getName() + + " that occur after " + sinceDateParser.format(since)); + return new DateTerm(ComparisonTerm.GE, since) { + private int matched = 0; + private int seen = 0; + + @Override + public boolean match(Message msg) { + boolean isMatch = false; + ++seen; + try { + Date msgDate = msg.getReceivedDate(); + if (msgDate == null) msgDate = msg.getSentDate(); + + if (msgDate != null && msgDate.getTime() >= since.getTime()) { + ++matched; + isMatch = true; + } else { + String msgDateStr = (msgDate != null) ? sinceDateParser.format(msgDate) : "null"; + String sinceDateStr = (since != null) ? sinceDateParser.format(since) : "null"; + LOG.debug("Message " + msg.getSubject() + " was received at [" + msgDateStr + + "], since filter is [" + sinceDateStr + "]"); + } + } catch (MessagingException e) { + LOG.warn("Failed to process message due to: "+e, e); + } + + if (seen % 100 == 0) { + LOG.info("Matched " + matched + " of " + seen + " messages since: " + + sinceDateParser.format(since)); + } + + return isMatch; + } + }; } } - + // user settings stored in member variables private String user; private String password; private String host; private String protocol; - + private String folderNames; private List exclude = new ArrayList<>(); private List include = new ArrayList<>(); private boolean recurse; - + private int batchSize; private int fetchSize; private int cTimeout; private int rTimeout; - + private Date fetchMailsSince; private String customFilter; - + private boolean processAttachment = true; - - private Tika tika; + private boolean includeContent = true; + private boolean includeOtherUserFolders = false; + private boolean includeSharedFolders = false; // holds the current state private Store mailbox; @@ -553,16 +778,13 @@ public class MailEntityProcessor extends EntityProcessorBase { private List filters = new ArrayList<>(); private static FetchProfile fp = new FetchProfile(); private static final Logger LOG = LoggerFactory.getLogger(DataImporter.class); - - // diagnostics - private int rowCount = 0; - + static { fp.add(FetchProfile.Item.ENVELOPE); fp.add(FetchProfile.Item.FLAGS); fp.add("X-Mailer"); } - + // Fields To Index // single valued private static final String MESSAGE_ID = "messageId"; @@ -577,13 +799,14 @@ public class MailEntityProcessor extends EntityProcessorBase { private static final String ATTACHMENT = "attachment"; private static final String ATTACHMENT_NAMES = "attachmentNames"; // flag values + private static final String FLAG_NONE = "none"; private static final String FLAG_ANSWERED = "answered"; private static final String FLAG_DELETED = "deleted"; private static final String FLAG_DRAFT = "draft"; private static final String FLAG_FLAGGED = "flagged"; private static final String FLAG_RECENT = "recent"; private static final String FLAG_SEEN = "seen"; - + private int getIntFromContext(String prop, int ifNull) { int v = ifNull; try { @@ -593,11 +816,11 @@ public class MailEntityProcessor extends EntityProcessorBase { v = Integer.valueOf(val); } } catch (NumberFormatException e) { - //do nothing + // do nothing } return v; } - + private boolean getBoolFromContext(String prop, boolean ifNull) { boolean v = ifNull; String val = context.getEntityAttribute(prop); @@ -607,7 +830,7 @@ public class MailEntityProcessor extends EntityProcessorBase { } return v; } - + private String getStringFromContext(String prop, String ifNull) { String v = ifNull; String val = context.getEntityAttribute(prop); diff --git a/solr/contrib/dataimporthandler/ivy.xml b/solr/contrib/dataimporthandler/ivy.xml index 1e4d9dc751c..bc9ab88393f 100644 --- a/solr/contrib/dataimporthandler/ivy.xml +++ b/solr/contrib/dataimporthandler/ivy.xml @@ -24,7 +24,8 @@ - + + diff --git a/solr/example/example-DIH/README.txt b/solr/example/example-DIH/README.txt index 9c2f3c37e24..98db213d2d0 100644 --- a/solr/example/example-DIH/README.txt +++ b/solr/example/example-DIH/README.txt @@ -35,7 +35,7 @@ To import data from the slashdot feed, connect to To import data from your imap server -1. Edit the example-DIH/solr/mail/conf/data-config.xml and add details about username, password, imap server +1. Edit the example-DIH/solr/mail/conf/mail-data-config.xml and add details about username, password, imap server 2. Connect to http://localhost:8983/solr/mail/dataimport?command=full-import To copy data from db Solr core, connect to diff --git a/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml b/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml index 736aea7cc99..6448f6a9bb5 100644 --- a/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml +++ b/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml @@ -10,3 +10,15 @@ name="mail_entity"/> + + + + + + diff --git a/solr/licenses/gimap-1.5.1.jar.sha1 b/solr/licenses/gimap-1.5.1.jar.sha1 new file mode 100644 index 00000000000..41c9dbff5dc --- /dev/null +++ b/solr/licenses/gimap-1.5.1.jar.sha1 @@ -0,0 +1 @@ +3a4ccd3aa6ce33ec701893c3ee632eeb0e012c89 diff --git a/solr/licenses/javax.mail-1.5.1.jar.sha1 b/solr/licenses/javax.mail-1.5.1.jar.sha1 new file mode 100644 index 00000000000..e7a0a834c9a --- /dev/null +++ b/solr/licenses/javax.mail-1.5.1.jar.sha1 @@ -0,0 +1 @@ +9724dd44f1abbba99c9858aa05fc91d53f59e7a5 diff --git a/solr/licenses/log4j-1.2.16.jar.sha1 b/solr/licenses/log4j-1.2.16.jar.sha1 new file mode 100644 index 00000000000..4b09bd1cc40 --- /dev/null +++ b/solr/licenses/log4j-1.2.16.jar.sha1 @@ -0,0 +1 @@ +7999a63bfccbc7c247a9aea10d83d4272bd492c6 diff --git a/solr/licenses/mail-1.4.3.jar.sha1 b/solr/licenses/mail-1.4.3.jar.sha1 deleted file mode 100644 index 847bc32b329..00000000000 --- a/solr/licenses/mail-1.4.3.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -8154bf8d666e6db154c548dc31a8d512c273f5ee