mirror of https://github.com/apache/poi.git
Wire up the new HSMFTextExtactor to the ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c01272208e
commit
f7ccc5d5f5
|
@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
|
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||||
|
@ -138,6 +139,11 @@ public class ExtractorFactory {
|
||||||
if(entry.getName().equals("VisioDocument")) {
|
if(entry.getName().equals("VisioDocument")) {
|
||||||
return new VisioTextExtractor(poifsDir, fs);
|
return new VisioTextExtractor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
|
if(entry.getName().equals("__substg1.0_1000001E") ||
|
||||||
|
entry.getName().equals("__substg1.0_0047001E") ||
|
||||||
|
entry.getName().equals("__substg1.0_0037001E")) {
|
||||||
|
return new HSMFTextExtactor(poifsDir, fs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
|
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
@ -57,6 +58,7 @@ public class TestExtractorFactory extends TestCase {
|
||||||
private File ppt;
|
private File ppt;
|
||||||
private File pptx;
|
private File pptx;
|
||||||
|
|
||||||
|
private File msg;
|
||||||
private File vsd;
|
private File vsd;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
|
@ -81,6 +83,9 @@ public class TestExtractorFactory extends TestCase {
|
||||||
|
|
||||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||||
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
||||||
|
|
||||||
|
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||||
|
msg = olTests.getFile("quick.msg");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFile() throws Exception {
|
public void testFile() throws Exception {
|
||||||
|
@ -161,6 +166,15 @@ public class TestExtractorFactory extends TestCase {
|
||||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Outlook msg
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(msg)
|
||||||
|
instanceof HSMFTextExtactor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(msg).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Text
|
// Text
|
||||||
try {
|
try {
|
||||||
ExtractorFactory.createExtractor(txt);
|
ExtractorFactory.createExtractor(txt);
|
||||||
|
@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
|
||||||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Outlook msg
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(msg))
|
||||||
|
instanceof HSMFTextExtactor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Text
|
// Text
|
||||||
try {
|
try {
|
||||||
ExtractorFactory.createExtractor(new FileInputStream(txt));
|
ExtractorFactory.createExtractor(new FileInputStream(txt));
|
||||||
|
@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Outlook msg
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
|
||||||
|
instanceof HSMFTextExtactor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Text
|
// Text
|
||||||
try {
|
try {
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
||||||
|
@ -375,5 +407,6 @@ public class TestExtractorFactory extends TestCase {
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
// TODO - Visio
|
// TODO - Visio
|
||||||
|
// TODO - Outlook
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
|
||||||
import org.apache.poi.hsmf.datatypes.StringChunk;
|
import org.apache.poi.hsmf.datatypes.StringChunk;
|
||||||
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
||||||
import org.apache.poi.hsmf.parsers.POIFSChunkParser;
|
import org.apache.poi.hsmf.parsers.POIFSChunkParser;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
|
||||||
this(new POIFSFileSystem(in));
|
this(new POIFSFileSystem(in));
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Constructor for reading MSG Files from an input stream.
|
* Constructor for reading MSG Files from a POIFS filesystem
|
||||||
* @param fs
|
* @param fs
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public MAPIMessage(POIFSFileSystem fs) throws IOException {
|
public MAPIMessage(POIFSFileSystem fs) throws IOException {
|
||||||
super(fs);
|
this(fs.getRoot(), fs);
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Constructor for reading MSG Files from a certain
|
||||||
|
* point within a POIFS filesystem
|
||||||
|
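* @param poifsDir the directory within the filesystem to read the message chunks from
|
||||||
|
* @param fs the POIFS filesystem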
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||||
|
super(poifsDir, fs);
|
||||||
|
|
||||||
// Grab all the chunks
|
// Grab all the chunks
|
||||||
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
|
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
|
||||||
|
|
||||||
// Grab interesting bits
|
// Grab interesting bits
|
||||||
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
|
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
|
||||||
|
|
|
@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hsmf.MAPIMessage;
|
import org.apache.poi.hsmf.MAPIMessage;
|
||||||
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
public class HSMFTextExtactor extends POIOLE2TextExtractor {
|
public class HSMFTextExtactor extends POIOLE2TextExtractor {
|
||||||
public HSMFTextExtactor(MAPIMessage msg) {
|
public HSMFTextExtactor(MAPIMessage msg) {
|
||||||
super(msg);
|
super(msg);
|
||||||
}
|
}
|
||||||
|
public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||||
|
this(new MAPIMessage(poifsDir, fs));
|
||||||
|
}
|
||||||
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
|
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
|
||||||
this(new MAPIMessage(fs));
|
this(new MAPIMessage(fs));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue