diff --git a/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java index 548697c3c0..d6c7a1810d 100644 --- a/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java @@ -32,7 +32,9 @@ import org.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.POITextExtractor; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.XSLFSlideShow; @@ -51,20 +53,21 @@ public class ExtractorFactory { public static final String CORE_DOCUMENT_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; - public POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - FileInputStream finp = new FileInputStream(f); + public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + InputStream inp = new PushbackInputStream( + new FileInputStream(f), 8); - if(POIFSFileSystem.hasPOIFSHeader(finp)) { - return createExtractor(new POIFSFileSystem(finp)); + if(POIFSFileSystem.hasPOIFSHeader(inp)) { + return createExtractor(new POIFSFileSystem(inp)); } - if(POIXMLDocument.hasOOXMLHeader(finp)) { - finp.close(); + if(POIXMLDocument.hasOOXMLHeader(inp)) { + inp.close(); return createExtractor(Package.open(f.toString())); } throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); } - public POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, 
XmlException { + public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { // Figure out the kind of stream // If clearly doesn't do mark/reset, wrap up if(! inp.markSupported()) { @@ -80,7 +83,7 @@ public class ExtractorFactory { throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); } - public POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException { + public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException { PackageRelationshipCollection core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); if(core.size() != 1) { @@ -100,14 +103,21 @@ public class ExtractorFactory { throw new IllegalArgumentException("No supported documents found in the OOXML package"); } - public POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { + public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { // Look for certain entries in the stream, to figure it // out from for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) { Entry entry = (Entry)entries.next(); if(entry.getName().equals("Workbook")) { return new ExcelExtractor(fs); } + if(entry.getName().equals("WordDocument")) { + return new WordExtractor(fs); + } + if(entry.getName().equals("PowerPoint Document")) { + return new PowerPointExtractor(fs); + } + // TODO - visio } throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); } diff --git a/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java new file mode 100644 index 0000000000..40f9462c58 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -0,0 +1,140 @@ +/* 
==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.extractor; + +import java.io.File; + +import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; + +import junit.framework.TestCase; + +/** + * Checks that ExtractorFactory.createExtractor(File) hands back the extractor class matching each sample document, and rejects a plain text file. + */ +public class TestExtractorFactory extends TestCase { + private String excel_dir; + private String word_dir; + private String powerpoint_dir; + + private File txt; + + private File xls; + private File xlsx; + + private File doc; + private File docx; + + private File ppt; + private File pptx; + + protected void setUp() throws Exception { // Resolves the sample files against directories named by the *.testdata.path system properties + super.setUp(); + + excel_dir = System.getProperty("HSSF.testdata.path"); + word_dir = System.getProperty("HWPF.testdata.path"); + powerpoint_dir = System.getProperty("HSLF.testdata.path"); + + txt = new File(excel_dir, "SampleSS.txt"); + + 
xls = new File(excel_dir, "SampleSS.xls"); + xlsx = new File(excel_dir, "SampleSS.xlsx"); + + doc = new File(word_dir, "SampleDoc.doc"); + docx = new File(word_dir, "SampleDoc.docx"); + + ppt = new File(powerpoint_dir, "SampleShow.ppt"); + pptx = new File(powerpoint_dir, "SampleShow.pptx"); + } + + public void testFile() throws Exception { // For every sample: the extractor type must match, and the extracted text must exceed a minimum length + // Excel + assertTrue( + ExtractorFactory.createExtractor(xls) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xls).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(xlsx) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xlsx).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(doc) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(docx) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(docx).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(ppt) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(ppt).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(pptx) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(pptx).getText().length() > 120 + ); + + // Visio + // TODO + + // Text + try { + ExtractorFactory.createExtractor(txt); + fail(); + } catch(IllegalArgumentException e) { + // Good - the plain text sample is expected to be rejected + } + } + public void testInputStream() throws Exception { // TODO: placeholder - nothing is asserted yet + + } + public void testPOIFS() throws Exception { // TODO: placeholder - nothing is asserted yet + + } + public void testPackage() throws Exception { // TODO: placeholder - nothing is asserted yet + + } +}