diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 52912848e5..57b9aa914f 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -191,10 +191,11 @@ public class ExtractorFactory { throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); } - public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { - return createExtractor(fs.getRoot(), fs); + public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + // Only ever an OLE2 one from the root of the FS + return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs); } - public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { + public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { // Look for certain entries in the stream, to figure it // out from for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) { @@ -234,6 +235,12 @@ public class ExtractorFactory { ) { return new OutlookTextExtactor(poifsDir, fs); } + if(entry.getName().equals("Package")) { + OPCPackage pkg = OPCPackage.open( + poifsDir.createDocumentInputStream(entry.getName()) + ); + return createExtractor(pkg); + } } throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); } @@ -246,7 +253,7 @@ public class ExtractorFactory { * empty array. Otherwise, you'll get one open * {@link POITextExtractor} for each embeded file. */ - public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { + public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { // All the embded directories we spotted ArrayList dirs = new ArrayList(); // For anything else not directly held in as a POIFS directory diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 4def3d3268..57574c6ab3 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase { private File docx; private File dotx; private File docEmb; + private File docEmbOOXML; private File ppt; private File pptx; @@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase { docx = wpTests.getFile("SampleDoc.docx"); dotx = wpTests.getFile("test.dotx"); docEmb = wpTests.getFile("word_with_embeded.doc"); + docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc"); POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); ppt = slTests.getFile("SampleShow.ppt"); @@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase { embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); assertEquals(6, embeds.length); - int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0; + int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; for(int i=0; i 20); @@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase { assertEquals(1, numWord); assertEquals(0, numMsg); + // Word which contains an OOXML file + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(docEmbOOXML); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; + assertEquals(3, embeds.length); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + else if(embeds[i] instanceof XWPFWordExtractor) numWordX++; + } + assertEquals(1, numPpt); + assertEquals(1, numXls); + assertEquals(0, numWord); + assertEquals(1, numWordX); + assertEquals(0, numMsg); + // Outlook ext = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb); diff --git a/test-data/document/word_with_embeded_ooxml.doc b/test-data/document/word_with_embeded_ooxml.doc new file mode 100644 index 0000000000..f25cacf91f Binary files /dev/null and b/test-data/document/word_with_embeded_ooxml.doc differ