Improve cleanup of file handles that the ExtractorFactory left open, mostly when opening a file failed, but also when an NPOIFSFileSystem was used for initialization.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1721064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2015-12-20 20:39:01 +00:00
parent e50841ea16
commit a74cded68d
3 changed files with 160 additions and 92 deletions

View File

@ -31,6 +31,8 @@ import java.io.IOException;
* @see org.apache.poi.hwpf.extractor.WordExtractor * @see org.apache.poi.hwpf.extractor.WordExtractor
*/ */
public abstract class POITextExtractor implements Closeable { public abstract class POITextExtractor implements Closeable {
private Closeable fsToClose = null;
/** /**
* Retrieves all the text from the document. * Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text * How cells, paragraphs etc are separated in the text
@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable {
* metadata / properties, such as author and title. * metadata / properties, such as author and title.
*/ */
public abstract POITextExtractor getMetadataTextExtractor(); public abstract POITextExtractor getMetadataTextExtractor();
/**
* Used to ensure file handle cleanup.
*/
public void setFilesystem(Closeable fs) {
fsToClose = fs;
}
/** /**
* Allows to free resources of the Extractor as soon as * Allows to free resources of the Extractor as soon as
@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable {
* The Extractor cannot be used after close has been called. * The Extractor cannot be used after close has been called.
*/ */
public void close() throws IOException { public void close() throws IOException {
// nothing to do in abstract class, derived classes may perform actions. if(fsToClose != null) {
fsToClose.close();
}
} }
} }

View File

@ -128,20 +128,25 @@ public class ExtractorFactory {
return threadPreferEventExtractors.get(); return threadPreferEventExtractors.get();
} }
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
InputStream inp = null; NPOIFSFileSystem fs = null;
try { try {
try { fs = new NPOIFSFileSystem(f);
NPOIFSFileSystem fs = new NPOIFSFileSystem(f); POIOLE2TextExtractor extractor = createExtractor(fs);
return createExtractor(fs); extractor.setFilesystem(fs);
} catch (OfficeXmlFileException e) { return extractor;
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); } catch (OfficeXmlFileException e) {
} catch (NotOLE2FileException ne) { // ensure file-handle release
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); if(fs != null) {
fs.close();
} }
} finally { return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
if(inp != null) inp.close(); } catch (NotOLE2FileException ne) {
// ensure file-handle release
if(fs != null) {
fs.close();
}
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
} }
} }
@ -161,65 +166,95 @@ public class ExtractorFactory {
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
} }
/**
* Tries to determine the actual type of file and produces a matching text-extractor for it.
*
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
* @throws IOException If an error occurs while reading the file
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
// Check for the normal Office core document try {
PackageRelationshipCollection core = // Check for the normal Office core document
pkg.getRelationshipsByType(CORE_DOCUMENT_REL); PackageRelationshipCollection core =
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
// If nothing was found, try some of the other OOXML-based core types
if (core.size() == 0) { // If nothing was found, try some of the other OOXML-based core types
// Could it be an OOXML-Strict one? if (core.size() == 0) {
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); // Could it be an OOXML-Strict one?
} core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
if (core.size() == 0) { }
// Could it be a visio one? if (core.size() == 0) {
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); // Could it be a visio one?
if (core.size() == 1) core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
return new XDGFVisioExtractor(pkg); if (core.size() == 1)
} return new XDGFVisioExtractor(pkg);
}
// Should just be a single core document, complain if not
if (core.size() != 1) { // Should just be a single core document, complain if not
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); if (core.size() != 1) {
} throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
// Grab the core document part, and try to identify from that
PackagePart corePart = pkg.getPart(core.getRelationship(0)); // Grab the core document part, and try to identify from that
PackagePart corePart = pkg.getPart(core.getRelationship(0));
// Is it XSSF?
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { // Is it XSSF?
if(corePart.getContentType().equals(rel.getContentType())) { for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if(getPreferEventExtractor()) { if(corePart.getContentType().equals(rel.getContentType())) {
return new XSSFEventBasedExcelExtractor(pkg); if(getPreferEventExtractor()) {
} return new XSSFEventBasedExcelExtractor(pkg);
}
return new XSSFExcelExtractor(pkg);
} return new XSSFExcelExtractor(pkg);
} }
}
// Is it XWPF?
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { // Is it XWPF?
if(corePart.getContentType().equals(rel.getContentType())) { for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
return new XWPFWordExtractor(pkg); if(corePart.getContentType().equals(rel.getContentType())) {
} return new XWPFWordExtractor(pkg);
} }
}
// Is it XSLF?
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { // Is it XSLF?
if(corePart.getContentType().equals(rel.getContentType())) { for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
return new XSLFPowerPointExtractor(pkg); if(corePart.getContentType().equals(rel.getContentType())) {
} return new XSLFPowerPointExtractor(pkg);
} }
}
// special handling for SlideShow-Theme-files,
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { // special handling for SlideShow-Theme-files,
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
} return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
}
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
pkg.revert(); } catch (IOException e) {
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); // ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (OpenXML4JException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (XmlException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (RuntimeException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
}
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {

View File

@ -193,29 +193,35 @@ public class TestExtractorFactory {
// Word // Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc) extractor
instanceof WordExtractor instanceof WordExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc).getText().length() > 120 extractor.getText().length() > 120
); );
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc6) extractor
instanceof Word6Extractor instanceof Word6Extractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc6).getText().length() > 20 extractor.getText().length() > 20
); );
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc95) extractor
instanceof Word6Extractor instanceof Word6Extractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(doc95).getText().length() > 120 extractor.getText().length() > 120
); );
extractor.close();
extractor = ExtractorFactory.createExtractor(docx); extractor = ExtractorFactory.createExtractor(docx);
assertTrue( assertTrue(
@ -241,62 +247,71 @@ public class TestExtractorFactory {
); );
extractor.close(); extractor.close();
// PowerPoint // PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(ppt) extractor
instanceof PowerPointExtractor instanceof PowerPointExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(ppt).getText().length() > 120 extractor.getText().length() > 120
); );
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx); extractor = ExtractorFactory.createExtractor(pptx);
assertTrue( assertTrue(
extractor extractor
instanceof XSLFPowerPointExtractor instanceof XSLFPowerPointExtractor
); );
extractor.close();
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue( assertTrue(
extractor.getText().length() > 120 extractor.getText().length() > 120
); );
extractor.close(); extractor.close();
// Visio - binary // Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsd) extractor
instanceof VisioTextExtractor instanceof VisioTextExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsd).getText().length() > 50 extractor.getText().length() > 50
); );
extractor.close();
// Visio - vsdx // Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsdx) extractor
instanceof XDGFVisioExtractor instanceof XDGFVisioExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsdx).getText().length() > 20 extractor.getText().length() > 20
); );
extractor.close();
// Publisher // Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(pub) extractor
instanceof PublisherTextExtractor instanceof PublisherTextExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(pub).getText().length() > 50 extractor.getText().length() > 50
); );
extractor.close();
// Outlook msg // Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue( assertTrue(
ExtractorFactory.createExtractor(msg) extractor
instanceof OutlookTextExtactor instanceof OutlookTextExtactor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(msg).getText().length() > 50 extractor.getText().length() > 50
); );
extractor.close();
// Text // Text
try { try {
@ -557,13 +572,15 @@ public class TestExtractorFactory {
extractor.close(); extractor.close();
// Visio // Visio
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue( assertTrue(
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())) extractor
instanceof XDGFVisioExtractor instanceof XDGFVisioExtractor
); );
assertTrue( assertTrue(
extractor.getText().length() > 20 extractor.getText().length() > 20
); );
extractor.close();
// Text // Text
try { try {
@ -670,6 +687,7 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(xls); ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length); assertEquals(0, embeds.length);
ext.close();
// Excel // Excel
ext = (POIOLE2TextExtractor) ext = (POIOLE2TextExtractor)
@ -690,6 +708,7 @@ public class TestExtractorFactory {
assertEquals(2, numXls); assertEquals(2, numXls);
assertEquals(2, numWord); assertEquals(2, numWord);
assertEquals(0, numMsg); assertEquals(0, numMsg);
ext.close();
// Word // Word
ext = (POIOLE2TextExtractor) ext = (POIOLE2TextExtractor)
@ -709,6 +728,7 @@ public class TestExtractorFactory {
assertEquals(2, numXls); assertEquals(2, numXls);
assertEquals(1, numWord); assertEquals(1, numWord);
assertEquals(0, numMsg); assertEquals(0, numMsg);
ext.close();
// Word which contains an OOXML file // Word which contains an OOXML file
ext = (POIOLE2TextExtractor) ext = (POIOLE2TextExtractor)
@ -730,6 +750,7 @@ public class TestExtractorFactory {
assertEquals(0, numWord); assertEquals(0, numWord);
assertEquals(1, numWordX); assertEquals(1, numWordX);
assertEquals(0, numMsg); assertEquals(0, numMsg);
ext.close();
// Outlook // Outlook
ext = (OutlookTextExtactor) ext = (OutlookTextExtactor)
@ -749,6 +770,7 @@ public class TestExtractorFactory {
assertEquals(0, numXls); assertEquals(0, numXls);
assertEquals(1, numWord); assertEquals(1, numWord);
assertEquals(0, numMsg); assertEquals(0, numMsg);
ext.close();
// Outlook with another outlook file in it // Outlook with another outlook file in it
ext = (OutlookTextExtactor) ext = (OutlookTextExtactor)
@ -768,7 +790,7 @@ public class TestExtractorFactory {
assertEquals(0, numXls); assertEquals(0, numXls);
assertEquals(0, numWord); assertEquals(0, numWord);
assertEquals(1, numMsg); assertEquals(1, numMsg);
ext.close();
// TODO - PowerPoint // TODO - PowerPoint
// TODO - Publisher // TODO - Publisher