diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 6514ad5d18..542b4d30da 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -31,6 +31,8 @@ import java.io.IOException; * @see org.apache.poi.hwpf.extractor.WordExtractor */ public abstract class POITextExtractor implements Closeable { + private Closeable fsToClose = null; + /** * Retrieves all the text from the document. * How cells, paragraphs etc are separated in the text @@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable { * metadata / properties, such as author and title. */ public abstract POITextExtractor getMetadataTextExtractor(); + + /** + * Used to ensure file handle cleanup. + */ + public void setFilesystem(Closeable fs) { + fsToClose = fs; + } /** * Allows to free resources of the Extractor as soon as @@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable { * The Extractor cannot be used after close has been called. */ public void close() throws IOException { - // nothing to do in abstract class, derived classes may perform actions. + if(fsToClose != null) { + fsToClose.close(); + } } } diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 906c026375..d7c6e3d5d0 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -128,20 +128,25 @@ public class ExtractorFactory { return threadPreferEventExtractors.get(); } - public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - InputStream inp = null; + NPOIFSFileSystem fs = null; try { - try { - NPOIFSFileSystem fs = new NPOIFSFileSystem(f); - return createExtractor(fs); - } catch (OfficeXmlFileException e) { - return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); - } catch (NotOLE2FileException ne) { - throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); + fs = new NPOIFSFileSystem(f); + POIOLE2TextExtractor extractor = createExtractor(fs); + extractor.setFilesystem(fs); + return extractor; + } catch (OfficeXmlFileException e) { + // ensure file-handle release + if(fs != null) { + fs.close(); } - } finally { - if(inp != null) inp.close(); + return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); + } catch (NotOLE2FileException ne) { + // ensure file-handle release + if(fs != null) { + fs.close(); + } + throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); } } @@ -161,65 +166,95 @@ public class ExtractorFactory { throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); } + /** + * Tries to determine the actual type of file and produces a matching text-extractor for it. + * + * @param pkg An {@link OPCPackage}. + * @return A {@link POIXMLTextExtractor} for the given file. + * @throws IOException If an error occurs while reading the file + * @throws OpenXML4JException If an error parsing the OpenXML file format is found. + * @throws XmlException If an XML parsing error occurs. + * @throws IllegalArgumentException If no matching file type could be found. + */ public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { - // Check for the normal Office core document - PackageRelationshipCollection core = - pkg.getRelationshipsByType(CORE_DOCUMENT_REL); - - // If nothing was found, try some of the other OOXML-based core types - if (core.size() == 0) { - // Could it be an OOXML-Strict one? - core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); - } - if (core.size() == 0) { - // Could it be a visio one? - core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); - if (core.size() == 1) - return new XDGFVisioExtractor(pkg); - } - - // Should just be a single core document, complain if not - if (core.size() != 1) { - throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); - } - - // Grab the core document part, and try to identify from that - PackagePart corePart = pkg.getPart(core.getRelationship(0)); - - // Is it XSSF? - for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - if(getPreferEventExtractor()) { - return new XSSFEventBasedExcelExtractor(pkg); - } - - return new XSSFExcelExtractor(pkg); - } - } - - // Is it XWPF? - for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - return new XWPFWordExtractor(pkg); - } - } - - // Is it XSLF? - for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - return new XSLFPowerPointExtractor(pkg); - } - } - - // special handling for SlideShow-Theme-files, - if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { - return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); - } - - // ensure that we close the package again if there is an error opening it, however - // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! - pkg.revert(); - throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); + try { + // Check for the normal Office core document + PackageRelationshipCollection core = + pkg.getRelationshipsByType(CORE_DOCUMENT_REL); + + // If nothing was found, try some of the other OOXML-based core types + if (core.size() == 0) { + // Could it be an OOXML-Strict one? + core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); + } + if (core.size() == 0) { + // Could it be a visio one? + core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); + if (core.size() == 1) + return new XDGFVisioExtractor(pkg); + } + + // Should just be a single core document, complain if not + if (core.size() != 1) { + throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); + } + + // Grab the core document part, and try to identify from that + PackagePart corePart = pkg.getPart(core.getRelationship(0)); + + // Is it XSSF? + for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { + if(corePart.getContentType().equals(rel.getContentType())) { + if(getPreferEventExtractor()) { + return new XSSFEventBasedExcelExtractor(pkg); + } + + return new XSSFExcelExtractor(pkg); + } + } + + // Is it XWPF? + for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { + if(corePart.getContentType().equals(rel.getContentType())) { + return new XWPFWordExtractor(pkg); + } + } + + // Is it XSLF? + for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { + if(corePart.getContentType().equals(rel.getContentType())) { + return new XSLFPowerPointExtractor(pkg); + } + } + + // special handling for SlideShow-Theme-files, + if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { + return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + } + + throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); + } catch (IOException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + throw e; + } catch (OpenXML4JException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + throw e; + } catch (XmlException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + throw e; + } catch (RuntimeException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + + throw e; + } } public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index fba530757a..acf0d46614 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -193,29 +193,35 @@ public class TestExtractorFactory { // Word + extractor = ExtractorFactory.createExtractor(doc); assertTrue( - ExtractorFactory.createExtractor(doc) + extractor instanceof WordExtractor ); assertTrue( - ExtractorFactory.createExtractor(doc).getText().length() > 120 + extractor.getText().length() > 120 ); + extractor.close(); + extractor = ExtractorFactory.createExtractor(doc6); assertTrue( - ExtractorFactory.createExtractor(doc6) + extractor instanceof Word6Extractor ); assertTrue( - ExtractorFactory.createExtractor(doc6).getText().length() > 20 + extractor.getText().length() > 20 ); + extractor.close(); + extractor = ExtractorFactory.createExtractor(doc95); assertTrue( - ExtractorFactory.createExtractor(doc95) + extractor instanceof Word6Extractor ); assertTrue( - ExtractorFactory.createExtractor(doc95).getText().length() > 120 + extractor.getText().length() > 120 ); + extractor.close(); extractor = ExtractorFactory.createExtractor(docx); assertTrue( @@ -241,62 +247,71 @@ public class TestExtractorFactory { ); extractor.close(); - // PowerPoint + // PowerPoint (PPT) + extractor = ExtractorFactory.createExtractor(ppt); assertTrue( - ExtractorFactory.createExtractor(ppt) + extractor instanceof PowerPointExtractor ); assertTrue( - ExtractorFactory.createExtractor(ppt).getText().length() > 120 + extractor.getText().length() > 120 ); + extractor.close(); + // PowerPoint (PPTX) extractor = ExtractorFactory.createExtractor(pptx); assertTrue( extractor instanceof XSLFPowerPointExtractor ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(pptx); assertTrue( extractor.getText().length() > 120 ); extractor.close(); // Visio - binary + extractor = ExtractorFactory.createExtractor(vsd); assertTrue( - ExtractorFactory.createExtractor(vsd) + extractor instanceof VisioTextExtractor ); assertTrue( - ExtractorFactory.createExtractor(vsd).getText().length() > 50 + extractor.getText().length() > 50 ); + extractor.close(); + // Visio - vsdx + extractor = ExtractorFactory.createExtractor(vsdx); assertTrue( - ExtractorFactory.createExtractor(vsdx) + extractor instanceof XDGFVisioExtractor ); assertTrue( - ExtractorFactory.createExtractor(vsdx).getText().length() > 20 + extractor.getText().length() > 20 ); + extractor.close(); // Publisher + extractor = ExtractorFactory.createExtractor(pub); assertTrue( - ExtractorFactory.createExtractor(pub) + extractor instanceof PublisherTextExtractor ); assertTrue( - ExtractorFactory.createExtractor(pub).getText().length() > 50 + extractor.getText().length() > 50 ); + extractor.close(); // Outlook msg + extractor = ExtractorFactory.createExtractor(msg); assertTrue( - ExtractorFactory.createExtractor(msg) + extractor instanceof OutlookTextExtactor ); assertTrue( - ExtractorFactory.createExtractor(msg).getText().length() > 50 + extractor.getText().length() > 50 ); + extractor.close(); // Text try { @@ -557,13 +572,15 @@ public class TestExtractorFactory { extractor.close(); // Visio + extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())); assertTrue( - ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())) + extractor instanceof XDGFVisioExtractor ); assertTrue( extractor.getText().length() > 20 ); + extractor.close(); // Text try { @@ -670,6 +687,7 @@ public class TestExtractorFactory { ExtractorFactory.createExtractor(xls); embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); assertEquals(0, embeds.length); + ext.close(); // Excel ext = (POIOLE2TextExtractor) @@ -690,6 +708,7 @@ public class TestExtractorFactory { assertEquals(2, numXls); assertEquals(2, numWord); assertEquals(0, numMsg); + ext.close(); // Word ext = (POIOLE2TextExtractor) @@ -709,6 +728,7 @@ public class TestExtractorFactory { assertEquals(2, numXls); assertEquals(1, numWord); assertEquals(0, numMsg); + ext.close(); // Word which contains an OOXML file ext = (POIOLE2TextExtractor) @@ -730,6 +750,7 @@ public class TestExtractorFactory { assertEquals(0, numWord); assertEquals(1, numWordX); assertEquals(0, numMsg); + ext.close(); // Outlook ext = (OutlookTextExtactor) @@ -749,6 +770,7 @@ public class TestExtractorFactory { assertEquals(0, numXls); assertEquals(1, numWord); assertEquals(0, numMsg); + ext.close(); // Outlook with another outlook file in it ext = (OutlookTextExtactor) @@ -768,7 +790,7 @@ public class TestExtractorFactory { assertEquals(0, numXls); assertEquals(0, numWord); assertEquals(1, numMsg); - + ext.close(); // TODO - PowerPoint // TODO - Publisher