Add embedded (attachment) support to the Outlook text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-08 16:44:08 +00:00
parent 07551a0925
commit f37c8f303a
4 changed files with 74 additions and 11 deletions

View File

@ -34,7 +34,8 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>

View File

@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.extractor;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
@ -139,9 +142,14 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
if(entry.getName().equals("__substg1.0_1000001E") ||
if(
entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0037001E")) {
entry.getName().equals("__substg1.0_0047001F") ||
entry.getName().equals("__substg1.0_0037001E") ||
entry.getName().equals("__substg1.0_0037001F")
) {
return new OutlookTextExtactor(poifsDir, fs);
}
}
@ -157,8 +165,12 @@ public class ExtractorFactory {
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
// Find all the embeded directories
// All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embeded directories
POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@ -189,20 +201,44 @@ public class ExtractorFactory {
} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs
// TODO
} else if(ext instanceof OutlookTextExtactor) {
// Stored in the Attachment blocks
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
if(attachment.attachData != null) {
byte[] data = attachment.attachData.getValue();
nonPOIFS.add( new ByteArrayInputStream(data) );
}
}
}
// Create the extractors
if(dirs == null || dirs.size() == 0) {
if(
(dirs == null || dirs.size() == 0) &&
(nonPOIFS == null || nonPOIFS.size() == 0)
){
return new POITextExtractor[0];
}
POITextExtractor[] te = new POITextExtractor[dirs.size()];
for(int i=0; i<te.length; i++) {
te[i] = createExtractor(
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for(int i=0; i<dirs.size(); i++) {
e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem()
);
) );
}
return te;
for(int i=0; i<nonPOIFS.size(); i++) {
try {
e.add( createExtractor(nonPOIFS.get(i)) );
} catch(IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
} catch(XmlException xe) {
throw new IOException(xe.getMessage());
} catch(OpenXML4JException oe) {
throw new IOException(oe.getMessage());
}
}
return e.toArray(new POITextExtractor[e.size()]);
}
/**

View File

@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
private File pptx;
private File msg;
private File msgEmb;
private File vsd;
protected void setUp() throws Exception {
@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
msgEmb = olTests.getFile("attachment_test_msg.msg");
}
public void testFile() throws Exception {
@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0;
assertEquals(1, embeds.length);
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
else if(embeds[i] instanceof ExcelExtractor) numXls++;
else if(embeds[i] instanceof WordExtractor) numWord++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(1, numWord);
// TODO - PowerPoint
// TODO - Visio
// TODO - Outlook
}
}

View File

@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
this(new MAPIMessage(inp));
}
/**
 * Exposes the underlying MAPI message.
 *
 * @return this extractor's {@code document}, cast to {@link MAPIMessage}
 */
public MAPIMessage getMAPIMessage() {
   final MAPIMessage message = (MAPIMessage)document;
   return message;
}
/**
* Outputs something a little like a RFC822 email
*/