Handle some cases better where file handles were left open by the ExtractorFactory, mostly when opening files failed, but also when using the NPOIFSFileSystem for initialization.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1721064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2015-12-20 20:39:01 +00:00
parent e50841ea16
commit a74cded68d
3 changed files with 160 additions and 92 deletions

View File

@ -31,6 +31,8 @@ import java.io.IOException;
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
public abstract class POITextExtractor implements Closeable {
private Closeable fsToClose = null;
/**
* Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text
@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable {
* metadata / properties, such as author and title.
*/
public abstract POITextExtractor getMetadataTextExtractor();
/**
 * Registers a resource (typically the filesystem the document was
 * opened from) whose file handle should be released together with
 * this extractor.
 *
 * @param fs the resource to remember for cleanup; may be {@code null}
 *           to clear a previously registered one
 */
public void setFilesystem(Closeable fs) {
    this.fsToClose = fs;
}
/**
* Allows to free resources of the Extractor as soon as
@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable {
* The Extractor cannot be used after close has been called.
*/
public void close() throws IOException {
    // Release the resource registered via setFilesystem(), if any.
    // The reference is detached *before* closing so that calling close()
    // again is a no-op, as the Closeable contract recommends, even when
    // the underlying close() throws.
    Closeable fs = fsToClose;
    fsToClose = null;
    if (fs != null) {
        fs.close();
    }
}
}

View File

@ -128,20 +128,25 @@ public class ExtractorFactory {
return threadPreferEventExtractors.get();
}
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
InputStream inp = null;
NPOIFSFileSystem fs = null;
try {
try {
NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
return createExtractor(fs);
} catch (OfficeXmlFileException e) {
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
fs = new NPOIFSFileSystem(f);
POIOLE2TextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs);
return extractor;
} catch (OfficeXmlFileException e) {
// ensure file-handle release
if(fs != null) {
fs.close();
}
} finally {
if(inp != null) inp.close();
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
// ensure file-handle release
if(fs != null) {
fs.close();
}
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
}
}
@ -161,65 +166,95 @@ public class ExtractorFactory {
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
}
/**
 * Tries to determine the actual type of file and produces a matching text-extractor for it.
 *
 * <p>If no extractor can be created, for whatever reason, the package is
 * reverted before the exception is propagated. {@code revert()} is used
 * rather than {@code close()} so that the underlying file is not
 * re-written, which is very likely not wanted for a TextExtractor.</p>
 *
 * @param pkg An {@link OPCPackage}.
 * @return A {@link POIXMLTextExtractor} for the given file.
 * @throws IOException If an error occurs while reading the file
 * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
 * @throws XmlException If an XML parsing error occurs.
 * @throws IllegalArgumentException If no matching file type could be found.
 */
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
    boolean created = false;
    try {
        POIXMLTextExtractor extractor = identifyExtractor(pkg);
        created = true;
        return extractor;
    } finally {
        // single cleanup point for every failure path (checked exceptions,
        // runtime exceptions and errors alike)
        if (!created) {
            pkg.revert();
        }
    }
}

/**
 * Inspects the core document relationship of the package and instantiates
 * the extractor for the matching content type. Performs no cleanup of
 * {@code pkg}; that is left to the caller.
 */
private static POIXMLTextExtractor identifyExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
    // Check for the normal Office core document
    PackageRelationshipCollection core =
            pkg.getRelationshipsByType(CORE_DOCUMENT_REL);

    // If nothing was found, try some of the other OOXML-based core types
    if (core.size() == 0) {
        // Could it be an OOXML-Strict one?
        core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
    }
    if (core.size() == 0) {
        // Could it be a visio one?
        core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
        if (core.size() == 1) {
            return new XDGFVisioExtractor(pkg);
        }
    }

    // Should just be a single core document, complain if not
    if (core.size() != 1) {
        throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
    }

    // Grab the core document part, and try to identify from that
    PackagePart corePart = pkg.getPart(core.getRelationship(0));
    String contentType = corePart.getContentType();

    // Is it XSSF?
    for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
        if (contentType.equals(rel.getContentType())) {
            if (getPreferEventExtractor()) {
                return new XSSFEventBasedExcelExtractor(pkg);
            }
            return new XSSFExcelExtractor(pkg);
        }
    }

    // Is it XWPF?
    for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
        if (contentType.equals(rel.getContentType())) {
            return new XWPFWordExtractor(pkg);
        }
    }

    // Is it XSLF?
    for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
        if (contentType.equals(rel.getContentType())) {
            return new XSLFPowerPointExtractor(pkg);
        }
    }

    // special handling for SlideShow-Theme-files,
    if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
        return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
    }

    throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {

View File

@ -193,29 +193,35 @@ public class TestExtractorFactory {
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(
ExtractorFactory.createExtractor(doc)
extractor
instanceof WordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(doc).getText().length() > 120
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(
ExtractorFactory.createExtractor(doc6)
extractor
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(doc6).getText().length() > 20
extractor.getText().length() > 20
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(
ExtractorFactory.createExtractor(doc95)
extractor
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(doc95).getText().length() > 120
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
@ -241,62 +247,71 @@ public class TestExtractorFactory {
);
extractor.close();
// PowerPoint
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(
ExtractorFactory.createExtractor(ppt)
extractor
instanceof PowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(ppt).getText().length() > 120
extractor.getText().length() > 120
);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(
ExtractorFactory.createExtractor(vsd)
extractor
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(vsd).getText().length() > 50
extractor.getText().length() > 50
);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(
ExtractorFactory.createExtractor(vsdx)
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
ExtractorFactory.createExtractor(vsdx).getText().length() > 20
extractor.getText().length() > 20
);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(
ExtractorFactory.createExtractor(pub)
extractor
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(pub).getText().length() > 50
extractor.getText().length() > 50
);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(
ExtractorFactory.createExtractor(msg)
extractor
instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(msg).getText().length() > 50
extractor.getText().length() > 50
);
extractor.close();
// Text
try {
@ -557,13 +572,15 @@ public class TestExtractorFactory {
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
// Text
try {
@ -670,6 +687,7 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
// Excel
ext = (POIOLE2TextExtractor)
@ -690,6 +708,7 @@ public class TestExtractorFactory {
assertEquals(2, numXls);
assertEquals(2, numWord);
assertEquals(0, numMsg);
ext.close();
// Word
ext = (POIOLE2TextExtractor)
@ -709,6 +728,7 @@ public class TestExtractorFactory {
assertEquals(2, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Word which contains an OOXML file
ext = (POIOLE2TextExtractor)
@ -730,6 +750,7 @@ public class TestExtractorFactory {
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
ext.close();
// Outlook
ext = (OutlookTextExtactor)
@ -749,6 +770,7 @@ public class TestExtractorFactory {
assertEquals(0, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Outlook with another outlook file in it
ext = (OutlookTextExtactor)
@ -768,7 +790,7 @@ public class TestExtractorFactory {
assertEquals(0, numXls);
assertEquals(0, numWord);
assertEquals(1, numMsg);
ext.close();
// TODO - PowerPoint
// TODO - Publisher