mirror of https://github.com/apache/poi.git
Handle some cases better where file handles were left open by the ExtractorFactory, mostly when opening files failed, but also when using the NPOIFSFileSystem for initialization.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1721064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e50841ea16
commit
a74cded68d
|
@ -31,6 +31,8 @@ import java.io.IOException;
|
|||
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
||||
*/
|
||||
public abstract class POITextExtractor implements Closeable {
|
||||
private Closeable fsToClose = null;
|
||||
|
||||
/**
|
||||
* Retrieves all the text from the document.
|
||||
* How cells, paragraphs etc are separated in the text
|
||||
|
@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable {
|
|||
* metadata / properties, such as author and title.
|
||||
*/
|
||||
public abstract POITextExtractor getMetadataTextExtractor();
|
||||
|
||||
/**
|
||||
* Used to ensure file handle cleanup.
|
||||
*/
|
||||
public void setFilesystem(Closeable fs) {
|
||||
fsToClose = fs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows to free resources of the Extractor as soon as
|
||||
|
@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable {
|
|||
* The Extractor cannot be used after close has been called.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
// nothing to do in abstract class, derived classes may perform actions.
|
||||
if(fsToClose != null) {
|
||||
fsToClose.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -128,20 +128,25 @@ public class ExtractorFactory {
|
|||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
|
||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
InputStream inp = null;
|
||||
NPOIFSFileSystem fs = null;
|
||||
try {
|
||||
try {
|
||||
NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
|
||||
return createExtractor(fs);
|
||||
} catch (OfficeXmlFileException e) {
|
||||
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||
} catch (NotOLE2FileException ne) {
|
||||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
||||
fs = new NPOIFSFileSystem(f);
|
||||
POIOLE2TextExtractor extractor = createExtractor(fs);
|
||||
extractor.setFilesystem(fs);
|
||||
return extractor;
|
||||
} catch (OfficeXmlFileException e) {
|
||||
// ensure file-handle release
|
||||
if(fs != null) {
|
||||
fs.close();
|
||||
}
|
||||
} finally {
|
||||
if(inp != null) inp.close();
|
||||
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||
} catch (NotOLE2FileException ne) {
|
||||
// ensure file-handle release
|
||||
if(fs != null) {
|
||||
fs.close();
|
||||
}
|
||||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,65 +166,95 @@ public class ExtractorFactory {
|
|||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to determine the actual type of file and produces a matching text-extractor for it.
|
||||
*
|
||||
* @param pkg An {@link OPCPackage}.
|
||||
* @return A {@link POIXMLTextExtractor} for the given file.
|
||||
* @throws IOException If an error occurs while reading the file
|
||||
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
|
||||
* @throws XmlException If an XML parsing error occurs.
|
||||
* @throws IllegalArgumentException If no matching file type could be found.
|
||||
*/
|
||||
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||
// Check for the normal Office core document
|
||||
PackageRelationshipCollection core =
|
||||
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||
|
||||
// If nothing was found, try some of the other OOXML-based core types
|
||||
if (core.size() == 0) {
|
||||
// Could it be an OOXML-Strict one?
|
||||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
||||
}
|
||||
if (core.size() == 0) {
|
||||
// Could it be a visio one?
|
||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (core.size() == 1)
|
||||
return new XDGFVisioExtractor(pkg);
|
||||
}
|
||||
|
||||
// Should just be a single core document, complain if not
|
||||
if (core.size() != 1) {
|
||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||
}
|
||||
|
||||
// Grab the core document part, and try to identify from that
|
||||
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||
|
||||
// Is it XSSF?
|
||||
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
if(getPreferEventExtractor()) {
|
||||
return new XSSFEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XWPF?
|
||||
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XWPFWordExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XSLF?
|
||||
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XSLFPowerPointExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// special handling for SlideShow-Theme-files,
|
||||
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
|
||||
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
||||
}
|
||||
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||
try {
|
||||
// Check for the normal Office core document
|
||||
PackageRelationshipCollection core =
|
||||
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||
|
||||
// If nothing was found, try some of the other OOXML-based core types
|
||||
if (core.size() == 0) {
|
||||
// Could it be an OOXML-Strict one?
|
||||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
||||
}
|
||||
if (core.size() == 0) {
|
||||
// Could it be a visio one?
|
||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (core.size() == 1)
|
||||
return new XDGFVisioExtractor(pkg);
|
||||
}
|
||||
|
||||
// Should just be a single core document, complain if not
|
||||
if (core.size() != 1) {
|
||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||
}
|
||||
|
||||
// Grab the core document part, and try to identify from that
|
||||
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||
|
||||
// Is it XSSF?
|
||||
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
if(getPreferEventExtractor()) {
|
||||
return new XSSFEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XWPF?
|
||||
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XWPFWordExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XSLF?
|
||||
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XSLFPowerPointExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// special handling for SlideShow-Theme-files,
|
||||
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
|
||||
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||
} catch (IOException e) {
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
throw e;
|
||||
} catch (OpenXML4JException e) {
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
throw e;
|
||||
} catch (XmlException e) {
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
throw e;
|
||||
} catch (RuntimeException e) {
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
|
|
|
@ -193,29 +193,35 @@ public class TestExtractorFactory {
|
|||
|
||||
|
||||
// Word
|
||||
extractor = ExtractorFactory.createExtractor(doc);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc)
|
||||
extractor
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(doc6);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc6)
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc6).getText().length() > 20
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(doc95);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc95)
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc95).getText().length() > 120
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(docx);
|
||||
assertTrue(
|
||||
|
@ -241,62 +247,71 @@ public class TestExtractorFactory {
|
|||
);
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint
|
||||
// PowerPoint (PPT)
|
||||
extractor = ExtractorFactory.createExtractor(ppt);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt)
|
||||
extractor
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt).getText().length() > 120
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint (PPTX)
|
||||
extractor = ExtractorFactory.createExtractor(pptx);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XSLFPowerPointExtractor
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(pptx);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio - binary
|
||||
extractor = ExtractorFactory.createExtractor(vsd);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd)
|
||||
extractor
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio - vsdx
|
||||
extractor = ExtractorFactory.createExtractor(vsdx);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsdx)
|
||||
extractor
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsdx).getText().length() > 20
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Publisher
|
||||
extractor = ExtractorFactory.createExtractor(pub);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pub)
|
||||
extractor
|
||||
instanceof PublisherTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pub).getText().length() > 50
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Outlook msg
|
||||
extractor = ExtractorFactory.createExtractor(msg);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(msg)
|
||||
extractor
|
||||
instanceof OutlookTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(msg).getText().length() > 50
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Text
|
||||
try {
|
||||
|
@ -557,13 +572,15 @@ public class TestExtractorFactory {
|
|||
extractor.close();
|
||||
|
||||
// Visio
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
|
||||
extractor
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Text
|
||||
try {
|
||||
|
@ -670,6 +687,7 @@ public class TestExtractorFactory {
|
|||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
ext.close();
|
||||
|
||||
// Excel
|
||||
ext = (POIOLE2TextExtractor)
|
||||
|
@ -690,6 +708,7 @@ public class TestExtractorFactory {
|
|||
assertEquals(2, numXls);
|
||||
assertEquals(2, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Word
|
||||
ext = (POIOLE2TextExtractor)
|
||||
|
@ -709,6 +728,7 @@ public class TestExtractorFactory {
|
|||
assertEquals(2, numXls);
|
||||
assertEquals(1, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Word which contains an OOXML file
|
||||
ext = (POIOLE2TextExtractor)
|
||||
|
@ -730,6 +750,7 @@ public class TestExtractorFactory {
|
|||
assertEquals(0, numWord);
|
||||
assertEquals(1, numWordX);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Outlook
|
||||
ext = (OutlookTextExtactor)
|
||||
|
@ -749,6 +770,7 @@ public class TestExtractorFactory {
|
|||
assertEquals(0, numXls);
|
||||
assertEquals(1, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Outlook with another outlook file in it
|
||||
ext = (OutlookTextExtactor)
|
||||
|
@ -768,7 +790,7 @@ public class TestExtractorFactory {
|
|||
assertEquals(0, numXls);
|
||||
assertEquals(0, numWord);
|
||||
assertEquals(1, numMsg);
|
||||
|
||||
ext.close();
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Publisher
|
||||
|
|
Loading…
Reference in New Issue