diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 0d57a0058d8..b89990374bd 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -67,7 +67,8 @@ com.sun.jersey.version = 1.9 /jakarta-regexp/jakarta-regexp = 1.4 /javax.activation/activation = 1.1.1 /javax.inject/javax.inject= 1 -/javax.mail/mail = 1.4.3 +/com.sun.mail/javax.mail = 1.5.1 +/com.sun.mail/gimap = 1.5.1 /javax.servlet/javax.servlet-api = 3.0.1 /javax.servlet/servlet-api = 2.4 /jdom/jdom = 1.0 diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e0dca882c37..0ba8c0bf637 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -90,6 +90,10 @@ Other Changes * SOLR-6215: TrieDateField should directly extend TrieField instead of forwarding to a wrapped TrieField. (Steve Rowe) +* SOLR-2245: Numerous improvements of the MailEntityProcessor, including using + the GMail extensions to do server-side date filtering and using GreenMail in + the unit test to enable automated tests. (Timothy Potter) + ================== 4.10.0 ================= Versions of Major Components diff --git a/solr/contrib/dataimporthandler-extras/ivy.xml b/solr/contrib/dataimporthandler-extras/ivy.xml index 432c6c1e9cd..07093540525 100644 --- a/solr/contrib/dataimporthandler-extras/ivy.xml +++ b/solr/contrib/dataimporthandler-extras/ivy.xml @@ -32,5 +32,6 @@ + diff --git a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java index 756b76409ed..e1c5e37f4ce 100644 --- a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -18,10 +18,9 @@ package org.apache.solr.handler.dataimport; import com.sun.mail.imap.IMAPMessage; +import org.apache.solr.handler.dataimport.config.ConfigNameConstants; import org.apache.tika.Tika; -import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,33 +29,40 @@ import javax.mail.internet.AddressException; import javax.mail.internet.ContentType; import javax.mail.internet.InternetAddress; import javax.mail.internet.MimeMessage; -import javax.mail.search.AndTerm; -import javax.mail.search.ComparisonTerm; -import javax.mail.search.ReceivedDateTerm; -import javax.mail.search.SearchTerm; +import javax.mail.search.*; + import java.io.InputStream; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; +import com.sun.mail.gimap.GmailFolder; +import com.sun.mail.gimap.GmailRawSearchTerm; + /** - * An {@link EntityProcessor} instance which can index emails along with their attachments from POP3 or IMAP sources. Refer to - * http://wiki.apache.org/solr/DataImportHandler for more - * details. This API is experimental and subject to change - * - * + * An EntityProcessor instance which can index emails along with their + * attachments from POP3 or IMAP sources. Refer to http://wiki.apache.org/solr/DataImportHandler for more details. This + * API is experimental and subject to change + * + * @version $Id$ * @since solr 1.4 */ public class MailEntityProcessor extends EntityProcessorBase { - + + private static final SimpleDateFormat sinceDateParser = + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private static final SimpleDateFormat afterFmt = + new SimpleDateFormat("yyyy/MM/dd"); + public static interface CustomFilter { public SearchTerm getCustomSearch(Folder folder); } - - @Override + public void init(Context context) { super.init(context); - // set attributes using XXX getXXXFromContext(attribute, defualtValue); + // set attributes using XXX getXXXFromContext(attribute, defualtValue); // applies variable resolver and return default if value is not found or null // REQUIRED : connection and folder info user = getStringFromContext("user", null); @@ -66,60 +72,116 @@ public class MailEntityProcessor extends EntityProcessorBase { folderNames = getStringFromContext("folders", null); // validate if (host == null || protocol == null || user == null || password == null - || folderNames == null) - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "'user|password|protocol|host|folders' are required attributes"); - - //OPTIONAL : have defaults and are optional + || folderNames == null) throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, + "'user|password|protocol|host|folders' are required attributes"); + + // OPTIONAL : have defaults and are optional recurse = getBoolFromContext("recurse", true); + + exclude.clear(); String excludes = getStringFromContext("exclude", ""); if (excludes != null && !excludes.trim().equals("")) { exclude = Arrays.asList(excludes.split(",")); } + + include.clear(); String includes = getStringFromContext("include", ""); if (includes != null && !includes.trim().equals("")) { include = Arrays.asList(includes.split(",")); } batchSize = getIntFromContext("batchSize", 20); customFilter = getStringFromContext("customFilter", ""); - String s = getStringFromContext("fetchMailsSince", null); - if (s != null) - try { - fetchMailsSince = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT).parse(s); - } catch (ParseException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Invalid value for fetchMailSince: " + s, e); - } + if (filters != null) filters.clear(); + folderIter = null; + msgIter = null; + + String lastIndexTime = null; + String command = + String.valueOf(context.getRequestParameters().get("command")); + if (!DataImporter.FULL_IMPORT_CMD.equals(command)) + throw new IllegalArgumentException(this.getClass().getSimpleName()+ + " only supports "+DataImporter.FULL_IMPORT_CMD); + + // Read the last_index_time out of the dataimport.properties if available + String cname = getStringFromContext("name", "mailimporter"); + String varName = ConfigNameConstants.IMPORTER_NS_SHORT + "." + cname + "." + + DocBuilder.LAST_INDEX_TIME; + Object varValue = context.getVariableResolver().resolve(varName); + if (varValue == null || "".equals(varValue)) { + varName = ConfigNameConstants.IMPORTER_NS_SHORT + "." + + DocBuilder.LAST_INDEX_TIME; + varValue = context.getVariableResolver().resolve(varName); + } + + if (varValue != null && varValue instanceof String) { + lastIndexTime = (String)varValue; + if (lastIndexTime != null && lastIndexTime.length() == 0) + lastIndexTime = null; + } + + if (lastIndexTime == null) + lastIndexTime = getStringFromContext("fetchMailsSince", ""); + LOG.info("Using lastIndexTime "+lastIndexTime+" for mail import"); + + this.fetchMailsSince = null; + if (lastIndexTime != null && lastIndexTime.length() > 0) { + try { + fetchMailsSince = sinceDateParser.parse(lastIndexTime); + LOG.info("Parsed fetchMailsSince=" + lastIndexTime); + } catch (ParseException e) { + throw new DataImportHandlerException(DataImportHandlerException.SEVERE, + "Invalid value for fetchMailSince: " + lastIndexTime, e); + } + } + fetchSize = getIntFromContext("fetchSize", 32 * 1024); cTimeout = getIntFromContext("connectTimeout", 30 * 1000); rTimeout = getIntFromContext("readTimeout", 60 * 1000); - processAttachment = getBoolFromContext( - getStringFromContext("processAttachment",null) == null ? "processAttachement":"processAttachment" - , true); - - tika = new Tika(); + String tmp = context.getEntityAttribute("includeOtherUserFolders"); + includeOtherUserFolders = (tmp != null && Boolean.valueOf(tmp.trim())); + tmp = context.getEntityAttribute("includeSharedFolders"); + includeSharedFolders = (tmp != null && Boolean.valueOf(tmp.trim())); + + setProcessAttachmentConfig(); + includeContent = getBoolFromContext("includeContent", true); + logConfig(); } - + + private void setProcessAttachmentConfig() { + processAttachment = true; + String tbval = context.getEntityAttribute("processAttachments"); + if (tbval == null) { + tbval = context.getEntityAttribute("processAttachement"); + if (tbval != null) processAttachment = Boolean.valueOf(tbval); + } else processAttachment = Boolean.valueOf(tbval); + } + @Override - public Map nextRow() { - Message mail; - Map row = null; + public Map nextRow() { + Message mail = null; + Map row = null; do { // try till there is a valid document or folders get exhausted. // when mail == NULL, it means end of processing - mail = getNextMail(); + mail = getNextMail(); + if (mail != null) row = getDocumentFromMail(mail); - } while (row == null && mail != null); + + if (row != null && row.get("folder") == null) + row.put("folder", mail.getFolder().getFullName()); + + } while (row == null && mail != null); return row; } - + private Message getNextMail() { if (!connected) { - if (!connectToMailBox()) - return null; + if (!connectToMailBox()) return null; connected = true; } if (folderIter == null) { @@ -131,119 +193,127 @@ public class MailEntityProcessor extends EntityProcessorBase { // loop till a valid mail or all folders exhausted. while (msgIter == null || !msgIter.hasNext()) { Folder next = folderIter.hasNext() ? folderIter.next() : null; - if (next == null) { - return null; - } + if (next == null) return null; + msgIter = new MessageIterator(next, batchSize); } return msgIter.next(); } - - private Map getDocumentFromMail(Message mail) { - Map row = new HashMap<>(); + + private Map getDocumentFromMail(Message mail) { + Map row = new HashMap<>(); try { addPartToDocument(mail, row, true); return row; } catch (Exception e) { + LOG.error("Failed to convert message [" + mail.toString() + + "] to document due to: " + e, e); return null; } } - - public void addPartToDocument(Part part, Map row, boolean outerMost) throws Exception { + + public void addPartToDocument(Part part, Map row, boolean outerMost) throws Exception { if (part instanceof Message) { - addEnvelopToDocument(part, row); + addEnvelopeToDocument(part, row); } - - String ct = part.getContentType(); + + String ct = part.getContentType().toLowerCase(); ContentType ctype = new ContentType(ct); if (part.isMimeType("multipart/*")) { - Multipart mp = (Multipart) part.getContent(); - int count = mp.getCount(); - if (part.isMimeType("multipart/alternative")) - count = 1; - for (int i = 0; i < count; i++) - addPartToDocument(mp.getBodyPart(i), row, false); + Object content = part.getContent(); + if (content != null && content instanceof Multipart) { + Multipart mp = (Multipart) part.getContent(); + int count = mp.getCount(); + if (part.isMimeType("multipart/alternative")) count = 1; + for (int i = 0; i < count; i++) + addPartToDocument(mp.getBodyPart(i), row, false); + } else { + LOG.warn("Multipart content is a not an instance of Multipart! Content is: " + + (content != null ? content.getClass().getName() : "null") + + ". Typically, this is due to the Java Activation JAR being loaded by the wrong classloader."); + } } else if (part.isMimeType("message/rfc822")) { addPartToDocument((Part) part.getContent(), row, false); } else { String disp = part.getDisposition(); - if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return; - InputStream is = part.getInputStream(); - String fileName = part.getFileName(); - Metadata md = new Metadata(); - md.set(HttpHeaders.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ROOT)); - md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName); - String content = tika.parseToString(is, md); - if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) { - if (row.get(ATTACHMENT) == null) - row.put(ATTACHMENT, new ArrayList()); - List contents = (List) row.get(ATTACHMENT); - contents.add(content); - row.put(ATTACHMENT, contents); - if (row.get(ATTACHMENT_NAMES) == null) - row.put(ATTACHMENT_NAMES, new ArrayList()); - List names = (List) row.get(ATTACHMENT_NAMES); - names.add(fileName); - row.put(ATTACHMENT_NAMES, names); - } else { - if (row.get(CONTENT) == null) - row.put(CONTENT, new ArrayList()); + if (includeContent + && !(disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) { + InputStream is = part.getInputStream(); + Metadata contentTypeHint = new Metadata(); + contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType() + .toLowerCase(Locale.ENGLISH)); + String content = (new Tika()).parseToString(is, contentTypeHint); + if (row.get(CONTENT) == null) row.put(CONTENT, new ArrayList()); List contents = (List) row.get(CONTENT); - contents.add(content); + contents.add(content.trim()); row.put(CONTENT, contents); } + if (!processAttachment || disp == null + || !disp.equalsIgnoreCase(Part.ATTACHMENT)) return; + InputStream is = part.getInputStream(); + String fileName = part.getFileName(); + Metadata contentTypeHint = new Metadata(); + contentTypeHint.set(Metadata.CONTENT_TYPE, ctype.getBaseType() + .toLowerCase(Locale.ENGLISH)); + String content = (new Tika()).parseToString(is, contentTypeHint); + if (content == null || content.trim().length() == 0) return; + + if (row.get(ATTACHMENT) == null) row.put(ATTACHMENT, + new ArrayList()); + List contents = (List) row.get(ATTACHMENT); + contents.add(content.trim()); + row.put(ATTACHMENT, contents); + if (row.get(ATTACHMENT_NAMES) == null) row.put(ATTACHMENT_NAMES, + new ArrayList()); + List names = (List) row.get(ATTACHMENT_NAMES); + names.add(fileName); + row.put(ATTACHMENT_NAMES, names); } } - - private void addEnvelopToDocument(Part part, Map row) throws MessagingException { + + private void addEnvelopeToDocument(Part part, Map row) + throws MessagingException { MimeMessage mail = (MimeMessage) part; Address[] adresses; - if ((adresses = mail.getFrom()) != null && adresses.length > 0) - row.put(FROM, adresses[0].toString()); - + if ((adresses = mail.getFrom()) != null && adresses.length > 0) row.put( + FROM, adresses[0].toString()); + List to = new ArrayList<>(); - if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null) - addAddressToList(adresses, to); - if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null) - addAddressToList(adresses, to); - if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null) - addAddressToList(adresses, to); - if (to.size() > 0) - row.put(TO_CC_BCC, to); - + if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null) addAddressToList( + adresses, to); + if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null) addAddressToList( + adresses, to); + if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null) addAddressToList( + adresses, to); + if (to.size() > 0) row.put(TO_CC_BCC, to); + row.put(MESSAGE_ID, mail.getMessageID()); row.put(SUBJECT, mail.getSubject()); - + Date d = mail.getSentDate(); if (d != null) { row.put(SENT_DATE, d); } - + List flags = new ArrayList<>(); for (Flags.Flag flag : mail.getFlags().getSystemFlags()) { - if (flag == Flags.Flag.ANSWERED) - flags.add(FLAG_ANSWERED); - else if (flag == Flags.Flag.DELETED) - flags.add(FLAG_DELETED); - else if (flag == Flags.Flag.DRAFT) - flags.add(FLAG_DRAFT); - else if (flag == Flags.Flag.FLAGGED) - flags.add(FLAG_FLAGGED); - else if (flag == Flags.Flag.RECENT) - flags.add(FLAG_RECENT); - else if (flag == Flags.Flag.SEEN) - flags.add(FLAG_SEEN); + if (flag == Flags.Flag.ANSWERED) flags.add(FLAG_ANSWERED); + else if (flag == Flags.Flag.DELETED) flags.add(FLAG_DELETED); + else if (flag == Flags.Flag.DRAFT) flags.add(FLAG_DRAFT); + else if (flag == Flags.Flag.FLAGGED) flags.add(FLAG_FLAGGED); + else if (flag == Flags.Flag.RECENT) flags.add(FLAG_RECENT); + else if (flag == Flags.Flag.SEEN) flags.add(FLAG_SEEN); } flags.addAll(Arrays.asList(mail.getFlags().getUserFlags())); + if (flags.size() == 0) flags.add(FLAG_NONE); row.put(FLAGS, flags); - + String[] hdrs = mail.getHeader("X-Mailer"); - if (hdrs != null) - row.put(XMAILER, hdrs[0]); + if (hdrs != null) row.put(XMAILER, hdrs[0]); } - - - private void addAddressToList(Address[] adresses, List to) throws AddressException { + + private void addAddressToList(Address[] adresses, List to) + throws AddressException { for (Address address : adresses) { to.add(address.toString()); InternetAddress ia = (InternetAddress) address; @@ -254,25 +324,60 @@ public class MailEntityProcessor extends EntityProcessorBase { } } } - + private boolean connectToMailBox() { + // this is needed to load the activation mail stuff correctly + // otherwise, the JavaMail multipart support doesn't get configured + // correctly, which leads to a class cast exception when processing + // multipart messages: IMAPInputStream cannot be cast to + // javax.mail.Multipart + Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); + try { Properties props = new Properties(); + if (System.getProperty("mail.debug") != null) + props.setProperty("mail.debug", System.getProperty("mail.debug")); + + if (("imap".equals(protocol) || "imaps".equals(protocol)) + && "imap.gmail.com".equals(host)) { + LOG.info("Consider using 'gimaps' protocol instead of '" + protocol + + "' for enabling GMail specific extensions for " + host); + } + props.setProperty("mail.store.protocol", protocol); - props.setProperty("mail.imap.fetchsize", "" + fetchSize); - props.setProperty("mail.imap.timeout", "" + rTimeout); - props.setProperty("mail.imap.connectiontimeout", "" + cTimeout); + + String imapPropPrefix = protocol.startsWith("gimap") ? "gimap" : "imap"; + props.setProperty("mail." + imapPropPrefix + ".fetchsize", "" + fetchSize); + props.setProperty("mail." + imapPropPrefix + ".timeout", "" + rTimeout); + props.setProperty("mail." + imapPropPrefix + ".connectiontimeout", "" + cTimeout); + + int port = -1; + int colonAt = host.indexOf(":"); + if (colonAt != -1) { + port = Integer.parseInt(host.substring(colonAt + 1)); + host = host.substring(0, colonAt); + } + Session session = Session.getDefaultInstance(props, null); mailbox = session.getStore(protocol); - mailbox.connect(host, user, password); - LOG.info("Connected to mailbox"); + if (port != -1) { + mailbox.connect(host, port, user, password); + } else { + mailbox.connect(host, user, password); + } + LOG.info("Connected to " + user + "'s mailbox on " + host); + return true; - } catch (MessagingException e) { + } catch (MessagingException e) { + String errMsg = String.format(Locale.ENGLISH, + "Failed to connect to %s server %s as user %s due to: %s", protocol, + host, user, e.toString()); + LOG.error(errMsg, e); throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Connection failed", e); + errMsg, e); } } - + private void createFilters() { if (fetchMailsSince != null) { filters.add(new MailsSinceLastCheckFilter(fetchMailsSince)); @@ -286,49 +391,76 @@ public class MailEntityProcessor extends EntityProcessorBase { } } catch (Exception e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Custom filter could not be created", e); + "Custom filter could not be created", e); } } } - + private void logConfig() { if (!LOG.isInfoEnabled()) return; - StringBuilder config = new StringBuilder(); - config.append("user : ").append(user).append(System.getProperty("line.separator")); - config.append("pwd : ").append(password).append(System.getProperty("line.separator")); - config.append("protocol : ").append(protocol).append(System.getProperty("line.separator")); - config.append("host : ").append(host).append(System.getProperty("line.separator")); - config.append("folders : ").append(folderNames).append(System.getProperty("line.separator")); - config.append("recurse : ").append(recurse).append(System.getProperty("line.separator")); - config.append("exclude : ").append(exclude.toString()).append(System.getProperty("line.separator")); - config.append("include : ").append(include.toString()).append(System.getProperty("line.separator")); - config.append("batchSize : ").append(batchSize).append(System.getProperty("line.separator")); - config.append("fetchSize : ").append(fetchSize).append(System.getProperty("line.separator")); - config.append("read timeout : ").append(rTimeout).append(System.getProperty("line.separator")); - config.append("conection timeout : ").append(cTimeout).append(System.getProperty("line.separator")); - config.append("custom filter : ").append(customFilter).append(System.getProperty("line.separator")); - config.append("fetch mail since : ").append(fetchMailsSince).append(System.getProperty("line.separator")); + + String lineSep = System.getProperty("line.separator"); + + StringBuffer config = new StringBuffer(); + config.append("user : ").append(user).append(lineSep); + config + .append("pwd : ") + .append( + password != null && password.length() > 0 ? "" : "") + .append(lineSep); + config.append("protocol : ").append(protocol) + .append(lineSep); + config.append("host : ").append(host) + .append(lineSep); + config.append("folders : ").append(folderNames) + .append(lineSep); + config.append("recurse : ").append(recurse) + .append(lineSep); + config.append("exclude : ").append(exclude.toString()) + .append(lineSep); + config.append("include : ").append(include.toString()) + .append(lineSep); + config.append("batchSize : ").append(batchSize) + .append(lineSep); + config.append("fetchSize : ").append(fetchSize) + .append(lineSep); + config.append("read timeout : ").append(rTimeout) + .append(lineSep); + config.append("conection timeout : ").append(cTimeout) + .append(lineSep); + config.append("custom filter : ").append(customFilter) + .append(lineSep); + config.append("fetch mail since : ").append(fetchMailsSince) + .append(lineSep); + config.append("includeContent : ").append(includeContent) + .append(lineSep); + config.append("processAttachments : ").append(processAttachment) + .append(lineSep); + config.append("includeOtherUserFolders : ").append(includeOtherUserFolders) + .append(lineSep); + config.append("includeSharedFolders : ").append(includeSharedFolders) + .append(lineSep); LOG.info(config.toString()); } - + class FolderIterator implements Iterator { private Store mailbox; private List topLevelFolders; private List folders = null; private Folder lastFolder = null; - + public FolderIterator(Store mailBox) { this.mailbox = mailBox; folders = new ArrayList<>(); getTopLevelFolders(mailBox); + if (includeOtherUserFolders) getOtherUserFolders(); + if (includeSharedFolders) getSharedFolders(); } - - @Override + public boolean hasNext() { return !folders.isEmpty(); } - - @Override + public Folder next() { try { boolean hasMessages = false; @@ -358,83 +490,142 @@ public class MailEntityProcessor extends EntityProcessorBase { folders.add(0, children[i]); LOG.info("child name : " + children[i].getFullName()); } - if (children.length == 0) - LOG.info("NO children : "); + if (children.length == 0) LOG.info("NO children : "); } } - } - while (!hasMessages); + } while (!hasMessages); return next; - } catch (MessagingException e) { - //throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - // "Folder open failed", e); + } catch (Exception e) { + LOG.warn("Failed to read folders due to: "+e); + // throw new + // DataImportHandlerException(DataImportHandlerException.SEVERE, + // "Folder open failed", e); } return null; } - - @Override + public void remove() { throw new UnsupportedOperationException("Its read only mode..."); } - + private void getTopLevelFolders(Store mailBox) { - if (folderNames != null) - topLevelFolders = Arrays.asList(folderNames.split(",")); + if (folderNames != null) topLevelFolders = Arrays.asList(folderNames + .split(",")); for (int i = 0; topLevelFolders != null && i < topLevelFolders.size(); i++) { try { folders.add(mailbox.getFolder(topLevelFolders.get(i))); } catch (MessagingException e) { // skip bad ones unless its the last one and still no good folder - if (folders.size() == 0 && i == topLevelFolders.size() - 1) - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Folder retreival failed"); + if (folders.size() == 0 && i == topLevelFolders.size() - 1) throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Folder retreival failed"); } } if (topLevelFolders == null || topLevelFolders.size() == 0) { try { folders.add(mailBox.getDefaultFolder()); } catch (MessagingException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Folder retreival failed"); + throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Folder retreival failed"); } } } - + + private void getOtherUserFolders() { + try { + Folder[] ufldrs = mailbox.getUserNamespaces(null); + if (ufldrs != null) { + LOG.info("Found " + ufldrs.length + " user namespace folders"); + for (Folder ufldr : ufldrs) + folders.add(ufldr); + } + } catch (MessagingException me) { + LOG.warn("Messaging exception retrieving user namespaces: " + + me.getMessage()); + } + } + + private void getSharedFolders() { + try { + Folder[] sfldrs = mailbox.getSharedNamespaces(); + if (sfldrs != null) { + LOG.info("Found " + sfldrs.length + " shared namespace folders"); + for (Folder sfldr : sfldrs) + folders.add(sfldr); + } + } catch (MessagingException me) { + LOG.warn("Messaging exception retrieving shared namespaces: " + + me.getMessage()); + } + } + private boolean excludeFolder(String name) { for (String s : exclude) { - if (name.matches(s)) - return true; + if (name.matches(s)) return true; } for (String s : include) { - if (name.matches(s)) - return false; + if (name.matches(s)) return false; } return include.size() > 0; } } - - class MessageIterator implements Iterator { + + class MessageIterator extends SearchTerm implements Iterator { private Folder folder; - private Message[] messagesInCurBatch; + private Message[] messagesInCurBatch = null; private int current = 0; private int currentBatch = 0; private int batchSize = 0; private int totalInFolder = 0; private boolean doBatching = true; - + public MessageIterator(Folder folder, int batchSize) { + super(); + try { this.folder = folder; this.batchSize = batchSize; SearchTerm st = getSearchTerm(); - if (st != null) { + + LOG.info("SearchTerm=" + st); + + if (st != null || folder instanceof GmailFolder) { doBatching = false; - messagesInCurBatch = folder.search(st); + // Searching can still take a while even though we're only pulling + // envelopes; unless you're using gmail server-side filter, which is + // fast + LOG.info("Searching folder " + folder.getName() + " for messages"); + long searchAtMs = System.currentTimeMillis(); + + // If using GMail, speed up the envelope processing by doing a + // server-side + // search for messages occurring on or after the fetch date (at + // midnight), + // which reduces the number of envelopes we need to pull from the + // server + // to apply the precise DateTerm filter; GMail server-side search has + // date + // granularity only but the local filters are also applied + + if (folder instanceof GmailFolder && fetchMailsSince != null) { + String afterCrit = "after:" + afterFmt.format(fetchMailsSince); + LOG.info("Added server-side gmail filter: " + afterCrit); + Message[] afterMessages = folder.search(new GmailRawSearchTerm( + afterCrit)); + + LOG.info("GMail server-side filter found " + afterMessages.length + + " messages received " + afterCrit + " in folder " + folder.getName()); + + // now pass in the server-side filtered messages to the local filter + messagesInCurBatch = folder.search((st != null ? st : this), afterMessages); + } else { + messagesInCurBatch = folder.search(st); + } totalInFolder = messagesInCurBatch.length; folder.fetch(messagesInCurBatch, fp); current = 0; + long tookMs = (System.currentTimeMillis() - searchAtMs); LOG.info("Total messages : " + totalInFolder); - LOG.info("Search criteria applied. Batching disabled"); + LOG.info("Search criteria applied. Batching disabled. Took " + tookMs + " (ms)"); } else { totalInFolder = folder.getMessageCount(); LOG.info("Total messages : " + totalInFolder); @@ -442,60 +633,55 @@ public class MailEntityProcessor extends EntityProcessorBase { } } catch (MessagingException e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Message retreival failed", e); + "Message retreival failed", e); } } - - private void getNextBatch(int batchSize, Folder folder) throws MessagingException { + + private void getNextBatch(int batchSize, Folder folder) + throws MessagingException { // after each batch invalidate cache if (messagesInCurBatch != null) { for (Message m : messagesInCurBatch) { - if (m instanceof IMAPMessage) - ((IMAPMessage) m).invalidateHeaders(); + if (m instanceof IMAPMessage) ((IMAPMessage) m).invalidateHeaders(); } } int lastMsg = (currentBatch + 1) * batchSize; lastMsg = lastMsg > totalInFolder ? totalInFolder : lastMsg; - messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, lastMsg); + messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, + lastMsg); folder.fetch(messagesInCurBatch, fp); current = 0; currentBatch++; LOG.info("Current Batch : " + currentBatch); LOG.info("Messages in this batch : " + messagesInCurBatch.length); } - - @Override + public boolean hasNext() { boolean hasMore = current < messagesInCurBatch.length; - if (!hasMore && doBatching - && currentBatch * batchSize < totalInFolder) { + if (!hasMore && doBatching && currentBatch * batchSize < totalInFolder) { // try next batch try { getNextBatch(batchSize, folder); hasMore = current < messagesInCurBatch.length; } catch (MessagingException e) { - throw new DataImportHandlerException(DataImportHandlerException.SEVERE, - "Message retreival failed", e); + throw new DataImportHandlerException( + DataImportHandlerException.SEVERE, "Message retreival failed", e); } } return hasMore; } - - @Override + public Message next() { return hasNext() ? messagesInCurBatch[current++] : null; } - - @Override + public void remove() { throw new UnsupportedOperationException("Its read only mode..."); } - + private SearchTerm getSearchTerm() { - if (filters.size() == 0) - return null; - if (filters.size() == 1) - return filters.get(0).getCustomSearch(folder); + if (filters.size() == 0) return null; + if (filters.size() == 1) return filters.get(0).getCustomSearch(folder); SearchTerm last = filters.get(0).getCustomSearch(folder); for (int i = 1; i < filters.size(); i++) { CustomFilter filter = filters.get(i); @@ -506,44 +692,83 @@ public class MailEntityProcessor extends EntityProcessorBase { } return last; } + + public boolean match(Message message) { + return true; + } } - + class MailsSinceLastCheckFilter implements CustomFilter { - + private Date since; - + public MailsSinceLastCheckFilter(Date date) { since = date; } - - @Override - public SearchTerm getCustomSearch(Folder folder) { - return new ReceivedDateTerm(ComparisonTerm.GE, since); + + @SuppressWarnings("serial") + public SearchTerm getCustomSearch(final Folder folder) { + LOG.info("Building mail filter for messages in " + folder.getName() + + " that occur after " + sinceDateParser.format(since)); + return new DateTerm(ComparisonTerm.GE, since) { + private int matched = 0; + private int seen = 0; + + @Override + public boolean match(Message msg) { + boolean isMatch = false; + ++seen; + try { + Date msgDate = msg.getReceivedDate(); + if (msgDate == null) msgDate = msg.getSentDate(); + + if (msgDate != null && msgDate.getTime() >= since.getTime()) { + ++matched; + isMatch = true; + } else { + String msgDateStr = (msgDate != null) ? sinceDateParser.format(msgDate) : "null"; + String sinceDateStr = (since != null) ? sinceDateParser.format(since) : "null"; + LOG.debug("Message " + msg.getSubject() + " was received at [" + msgDateStr + + "], since filter is [" + sinceDateStr + "]"); + } + } catch (MessagingException e) { + LOG.warn("Failed to process message due to: "+e, e); + } + + if (seen % 100 == 0) { + LOG.info("Matched " + matched + " of " + seen + " messages since: " + + sinceDateParser.format(since)); + } + + return isMatch; + } + }; } } - + // user settings stored in member variables private String user; private String password; private String host; private String protocol; - + private String folderNames; private List exclude = new ArrayList<>(); private List include = new ArrayList<>(); private boolean recurse; - + private int batchSize; private int fetchSize; private int cTimeout; private int rTimeout; - + private Date fetchMailsSince; private String customFilter; - + private boolean processAttachment = true; - - private Tika tika; + private boolean includeContent = true; + private boolean includeOtherUserFolders = false; + private boolean includeSharedFolders = false; // holds the current state private Store mailbox; @@ -553,16 +778,13 @@ public class MailEntityProcessor extends EntityProcessorBase { private List filters = new ArrayList<>(); private static FetchProfile fp = new FetchProfile(); private static final Logger LOG = LoggerFactory.getLogger(DataImporter.class); - - // diagnostics - private int rowCount = 0; - + static { fp.add(FetchProfile.Item.ENVELOPE); fp.add(FetchProfile.Item.FLAGS); fp.add("X-Mailer"); } - + // Fields To Index // single valued private static final String MESSAGE_ID = "messageId"; @@ -577,13 +799,14 @@ public class MailEntityProcessor extends EntityProcessorBase { private static final String ATTACHMENT = "attachment"; private static final String ATTACHMENT_NAMES = "attachmentNames"; // flag values + private static final String FLAG_NONE = "none"; private static final String FLAG_ANSWERED = "answered"; private static final String FLAG_DELETED = "deleted"; private static final String FLAG_DRAFT = "draft"; private static final String FLAG_FLAGGED = "flagged"; private static final String FLAG_RECENT = "recent"; private static final String FLAG_SEEN = "seen"; - + private int getIntFromContext(String prop, int ifNull) { int v = ifNull; try { @@ -593,11 +816,11 @@ public class MailEntityProcessor extends EntityProcessorBase { v = Integer.valueOf(val); } } catch (NumberFormatException e) { - //do nothing + // do nothing } return v; } - + private boolean getBoolFromContext(String prop, boolean ifNull) { boolean v = ifNull; String val = context.getEntityAttribute(prop); @@ -607,7 +830,7 @@ public class MailEntityProcessor extends EntityProcessorBase { } return v; } - + private String getStringFromContext(String prop, String ifNull) { String v = ifNull; String val = context.getEntityAttribute(prop); diff --git a/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestMailEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestMailEntityProcessor.java index e595c1ee6ec..b353e18795d 100644 --- a/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestMailEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestMailEntityProcessor.java @@ -18,151 +18,252 @@ package org.apache.solr.handler.dataimport; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.handler.dataimport.config.Entity; +import org.junit.After; +import org.junit.Before; import org.junit.Ignore; import org.junit.Test; +import com.icegreen.greenmail.imap.ImapHostManager; +import com.icegreen.greenmail.store.MailFolder; +import com.icegreen.greenmail.user.GreenMailUser; +import com.icegreen.greenmail.util.GreenMail; +import com.icegreen.greenmail.util.GreenMailUtil; +import com.icegreen.greenmail.util.ServerSetup; +import com.icegreen.greenmail.imap.ImapConstants; + +import java.io.IOException; +import java.net.ServerSocket; import java.text.ParseException; import java.util.ArrayList; +import java.util.Calendar; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; -// Test mailbox is like this: foldername(mailcount) -// top1(2) -> child11(6) -// -> child12(0) -// top2(2) -> child21(1) -// -> grandchild211(2) -// -> grandchild212(1) -// -> child22(2) +import javax.mail.Flags; +import javax.mail.Session; +import javax.mail.internet.MimeMessage; /** - * Test for MailEntityProcessor. The tests are marked as ignored because we'd need a mail server (real or mocked) for - * these to work. - * - * TODO: Find a way to make the tests actually test code - * + * Test for MailEntityProcessor; uses GreenMail embedded Java mail server. * * @see org.apache.solr.handler.dataimport.MailEntityProcessor * @since solr 1.4 */ public class TestMailEntityProcessor extends AbstractDataImportHandlerTestCase { - + // Credentials - private static final String user = "user"; - private static final String password = "password"; - private static final String host = "host"; - private static final String protocol = "imaps"; + private static final String email = "test@localhost.com"; + private static final String user = "test"; + private static final String password = "secret"; + private static final String protocol = "imap"; - private static Map paramMap = new HashMap<>(); + // embedded test mail server + private ServerSetup serverSetup; + private GreenMail greenMail; + private GreenMailUser mailUser; + private String hostAndPort; + private String sep = ImapConstants.HIERARCHY_DELIMITER; + + private Calendar cal = Calendar.getInstance(); + + /** + * Setup an embedded GreenMail server for testing. + */ + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + + int port = findAvailablePort(9103,9193); + serverSetup = new ServerSetup(port, null, protocol); + greenMail = new GreenMail(serverSetup); + greenMail.start(); + mailUser = greenMail.setUser(email, user, password); + hostAndPort = "localhost:"+port; + + // Test mailbox is like this: foldername(mailcount) + // top1(2) -> child11(6) + // -> child12(0) + // top2(2) -> child21(1) + // -> grandchild211(2) + // -> grandchild212(1) + // -> child22(2) + ImapHostManager imapMgr = greenMail.getManagers().getImapHostManager(); + setupFolder(imapMgr, "top1", 2); + setupFolder(imapMgr, "top1"+sep+"child11", 6); + setupFolder(imapMgr, "top2", 2); + setupFolder(imapMgr, "top2"+sep+"child21", 1); + setupFolder(imapMgr, "top2"+sep+"child21"+sep+"grandchild211", 2); + setupFolder(imapMgr, "top2"+sep+"child21"+sep+"grandchild212", 1); + setupFolder(imapMgr, "top2"+sep+"child22", 2); + setupFolder(imapMgr, "top3", 2); + } + + private int findAvailablePort(int min, int max) { + for (int port = min; port < max; port++) { + try { + new ServerSocket(port).close(); + return port; + } catch (IOException e) { + // Port is in use + } + } + throw new IllegalStateException("Could not find available port in range " + min + " to " + max); + } + + @Override + @After + public void tearDown() throws Exception { + greenMail.stop(); + super.tearDown(); + } + + /** + * Creates 1 or more messages in the specified folder. + */ + protected void setupFolder(ImapHostManager imapMgr, String folderName, int numMessages) throws Exception { + setupFolder(imapMgr, folderName, numMessages, 0); + } + + protected void setupFolder(ImapHostManager imapMgr, String folderName, int numMessages, int startAt) throws Exception { + cal.setTimeInMillis(System.currentTimeMillis()); + Date now = cal.getTime(); + MailFolder folder = imapMgr.getFolder(mailUser, folderName, false); + if (folder == null) + folder = imapMgr.createMailbox(mailUser, folderName); + + Session session = GreenMailUtil.getSession(serverSetup); + for (int m=0; m < numMessages; m++) { + int idx = m + startAt; + MimeMessage msg = new MimeMessage(session); + msg.setSubject("test"+idx); + msg.setFrom("from@localhost.com"); + msg.setText("test"+idx); + msg.setSentDate(now); + folder.appendMessage(msg, new Flags(Flags.Flag.RECENT), now); + } + folder.getMessages(); + } + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") - public void testConnection() { + public void testConnection() throws Exception { // also tests recurse = false and default settings - paramMap.put("folders", "top2"); + Map paramMap = new HashMap<>(); + paramMap.put("folders", "top1"); paramMap.put("recurse", "false"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); + DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top1 did not return 2 messages", swi.docs.size(), 2); + assertEquals("top1 did not return 2 messages", 2, swi.docs.size()); } - + + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") public void testRecursion() { + Map paramMap = new HashMap<>(); paramMap.put("folders", "top2"); paramMap.put("recurse", "true"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top2 and its children did not return 8 messages", swi.docs.size(), 8); + assertEquals("top2 and its children did not return 8 messages", 8, swi.docs.size()); } + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") public void testExclude() { + Map paramMap = new HashMap<>(); paramMap.put("folders", "top2"); paramMap.put("recurse", "true"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); paramMap.put("exclude", ".*grandchild.*"); DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top2 and its direct children did not return 5 messages", swi.docs.size(), 5); + assertEquals("top2 and its direct children did not return 5 messages", 5, swi.docs.size()); } + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") public void testInclude() { + Map paramMap = new HashMap<>(); paramMap.put("folders", "top2"); paramMap.put("recurse", "true"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); paramMap.put("include", ".*grandchild.*"); DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top2 and its direct children did not return 3 messages", swi.docs.size(), 3); + assertEquals("top2 and its direct children did not return 3 messages", 3, swi.docs.size()); } + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") public void testIncludeAndExclude() { + Map paramMap = new HashMap<>(); paramMap.put("folders", "top1,top2"); paramMap.put("recurse", "true"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); paramMap.put("exclude", ".*top1.*"); paramMap.put("include", ".*grandchild.*"); DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top2 and its direct children did not return 3 messages", swi.docs.size(), 3); + assertEquals("top2 and its direct children did not return 3 messages", 3, swi.docs.size()); } + @SuppressWarnings("unchecked") @Test - @Ignore("Needs a Mock Mail Server to work") public void testFetchTimeSince() throws ParseException { - paramMap.put("folders", "top1/child11"); + Map paramMap = new HashMap<>(); + paramMap.put("folders", "top1"+sep+"child11"); paramMap.put("recurse", "true"); - paramMap.put("processAttachement", "false"); + paramMap.put("processAttachments", "false"); paramMap.put("fetchMailsSince", "2008-12-26 00:00:00"); DataImporter di = new DataImporter(); di.loadAndInit(getConfigFromMap(paramMap)); - Entity ent = di.getConfig().getEntities().get(0); RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null); SolrWriterImpl swi = new SolrWriterImpl(); di.runCmd(rp, swi); - assertEquals("top2 and its direct children did not return 3 messages", swi.docs.size(), 3); + assertEquals("top1"+sep+"child11 and its direct children did not return 6 messages", 6, swi.docs.size()); } + // configures the data importer to use the MailEntityProcessor we're testing in this class private String getConfigFromMap(Map params) { String conf = - "" + - "" + - "" + - "" + - ""; + "" + + "" + + "" + + ""+ + ""+ + ""+ + ""+ + ""+ + ""+ + "" + + "" + + ""; params.put("user", user); params.put("password", password); - params.put("host", host); + params.put("host", hostAndPort); params.put("protocol", protocol); StringBuilder attribs = new StringBuilder(""); for (String key : params.keySet()) @@ -171,6 +272,7 @@ public class TestMailEntityProcessor extends AbstractDataImportHandlerTestCase { return conf.replace("someconfig", attribs.toString()); } + // collects documents written by the DataImporter (from the MailEntityProcessor) static class SolrWriterImpl extends SolrWriter { List docs = new ArrayList<>(); Boolean deleteAllCalled; @@ -181,7 +283,12 @@ public class TestMailEntityProcessor extends AbstractDataImportHandlerTestCase { } @Override - public boolean upload(SolrInputDocument doc) { + public void close() { + // no-op method to avoid NPE in super impl + } + + @Override + public boolean upload(SolrInputDocument doc) { return docs.add(doc); } diff --git a/solr/contrib/dataimporthandler/ivy.xml b/solr/contrib/dataimporthandler/ivy.xml index 1e4d9dc751c..bc9ab88393f 100644 --- a/solr/contrib/dataimporthandler/ivy.xml +++ b/solr/contrib/dataimporthandler/ivy.xml @@ -24,7 +24,8 @@ - + + diff --git a/solr/example/example-DIH/README.txt b/solr/example/example-DIH/README.txt index 9c2f3c37e24..98db213d2d0 100644 --- a/solr/example/example-DIH/README.txt +++ b/solr/example/example-DIH/README.txt @@ -35,7 +35,7 @@ To import data from the slashdot feed, connect to To import data from your imap server -1. Edit the example-DIH/solr/mail/conf/data-config.xml and add details about username, password, imap server +1. Edit the example-DIH/solr/mail/conf/mail-data-config.xml and add details about username, password, imap server 2. Connect to http://localhost:8983/solr/mail/dataimport?command=full-import To copy data from db Solr core, connect to diff --git a/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml b/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml new file mode 100644 index 00000000000..736aea7cc99 --- /dev/null +++ b/solr/example/example-DIH/solr/mail/conf/mail-data-config.xml @@ -0,0 +1,12 @@ + + + + + +