mirror of https://github.com/apache/poi.git
Handle some cases better where file handles were left open by the ExtractorFactory, mostly when opening files failed, but also when using the NPOIFSFileSystem for initialization.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1721064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e50841ea16
commit
a74cded68d
|
@ -31,6 +31,8 @@ import java.io.IOException;
|
||||||
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
||||||
*/
|
*/
|
||||||
public abstract class POITextExtractor implements Closeable {
|
public abstract class POITextExtractor implements Closeable {
|
||||||
|
private Closeable fsToClose = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves all the text from the document.
|
* Retrieves all the text from the document.
|
||||||
* How cells, paragraphs etc are separated in the text
|
* How cells, paragraphs etc are separated in the text
|
||||||
|
@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable {
|
||||||
* metadata / properties, such as author and title.
|
* metadata / properties, such as author and title.
|
||||||
*/
|
*/
|
||||||
public abstract POITextExtractor getMetadataTextExtractor();
|
public abstract POITextExtractor getMetadataTextExtractor();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used to ensure file handle cleanup.
|
||||||
|
*/
|
||||||
|
public void setFilesystem(Closeable fs) {
|
||||||
|
fsToClose = fs;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows to free resources of the Extractor as soon as
|
* Allows to free resources of the Extractor as soon as
|
||||||
|
@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable {
|
||||||
* The Extractor cannot be used after close has been called.
|
* The Extractor cannot be used after close has been called.
|
||||||
*/
|
*/
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
// nothing to do in abstract class, derived classes may perform actions.
|
if(fsToClose != null) {
|
||||||
|
fsToClose.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -128,20 +128,25 @@ public class ExtractorFactory {
|
||||||
return threadPreferEventExtractors.get();
|
return threadPreferEventExtractors.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
InputStream inp = null;
|
NPOIFSFileSystem fs = null;
|
||||||
try {
|
try {
|
||||||
try {
|
fs = new NPOIFSFileSystem(f);
|
||||||
NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
|
POIOLE2TextExtractor extractor = createExtractor(fs);
|
||||||
return createExtractor(fs);
|
extractor.setFilesystem(fs);
|
||||||
} catch (OfficeXmlFileException e) {
|
return extractor;
|
||||||
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
} catch (OfficeXmlFileException e) {
|
||||||
} catch (NotOLE2FileException ne) {
|
// ensure file-handle release
|
||||||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
if(fs != null) {
|
||||||
|
fs.close();
|
||||||
}
|
}
|
||||||
} finally {
|
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||||
if(inp != null) inp.close();
|
} catch (NotOLE2FileException ne) {
|
||||||
|
// ensure file-handle release
|
||||||
|
if(fs != null) {
|
||||||
|
fs.close();
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,65 +166,95 @@ public class ExtractorFactory {
|
||||||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tries to determine the actual type of file and produces a matching text-extractor for it.
|
||||||
|
*
|
||||||
|
* @param pkg An {@link OPCPackage}.
|
||||||
|
* @return A {@link POIXMLTextExtractor} for the given file.
|
||||||
|
* @throws IOException If an error occurs while reading the file
|
||||||
|
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
|
||||||
|
* @throws XmlException If an XML parsing error occurs.
|
||||||
|
* @throws IllegalArgumentException If no matching file type could be found.
|
||||||
|
*/
|
||||||
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||||
// Check for the normal Office core document
|
try {
|
||||||
PackageRelationshipCollection core =
|
// Check for the normal Office core document
|
||||||
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
PackageRelationshipCollection core =
|
||||||
|
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||||
// If nothing was found, try some of the other OOXML-based core types
|
|
||||||
if (core.size() == 0) {
|
// If nothing was found, try some of the other OOXML-based core types
|
||||||
// Could it be an OOXML-Strict one?
|
if (core.size() == 0) {
|
||||||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
// Could it be an OOXML-Strict one?
|
||||||
}
|
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
||||||
if (core.size() == 0) {
|
}
|
||||||
// Could it be a visio one?
|
if (core.size() == 0) {
|
||||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
// Could it be a visio one?
|
||||||
if (core.size() == 1)
|
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||||
return new XDGFVisioExtractor(pkg);
|
if (core.size() == 1)
|
||||||
}
|
return new XDGFVisioExtractor(pkg);
|
||||||
|
}
|
||||||
// Should just be a single core document, complain if not
|
|
||||||
if (core.size() != 1) {
|
// Should just be a single core document, complain if not
|
||||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
if (core.size() != 1) {
|
||||||
}
|
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||||
|
}
|
||||||
// Grab the core document part, and try to identify from that
|
|
||||||
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
// Grab the core document part, and try to identify from that
|
||||||
|
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||||
// Is it XSSF?
|
|
||||||
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
// Is it XSSF?
|
||||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||||
if(getPreferEventExtractor()) {
|
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||||
return new XSSFEventBasedExcelExtractor(pkg);
|
if(getPreferEventExtractor()) {
|
||||||
}
|
return new XSSFEventBasedExcelExtractor(pkg);
|
||||||
|
}
|
||||||
return new XSSFExcelExtractor(pkg);
|
|
||||||
}
|
return new XSSFExcelExtractor(pkg);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// Is it XWPF?
|
|
||||||
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
// Is it XWPF?
|
||||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||||
return new XWPFWordExtractor(pkg);
|
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||||
}
|
return new XWPFWordExtractor(pkg);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// Is it XSLF?
|
|
||||||
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
// Is it XSLF?
|
||||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||||
return new XSLFPowerPointExtractor(pkg);
|
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||||
}
|
return new XSLFPowerPointExtractor(pkg);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// special handling for SlideShow-Theme-files,
|
|
||||||
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
|
// special handling for SlideShow-Theme-files,
|
||||||
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
|
||||||
}
|
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
||||||
|
}
|
||||||
// ensure that we close the package again if there is an error opening it, however
|
|
||||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||||
pkg.revert();
|
} catch (IOException e) {
|
||||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
// ensure that we close the package again if there is an error opening it, however
|
||||||
|
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||||
|
pkg.revert();
|
||||||
|
throw e;
|
||||||
|
} catch (OpenXML4JException e) {
|
||||||
|
// ensure that we close the package again if there is an error opening it, however
|
||||||
|
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||||
|
pkg.revert();
|
||||||
|
throw e;
|
||||||
|
} catch (XmlException e) {
|
||||||
|
// ensure that we close the package again if there is an error opening it, however
|
||||||
|
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||||
|
pkg.revert();
|
||||||
|
throw e;
|
||||||
|
} catch (RuntimeException e) {
|
||||||
|
// ensure that we close the package again if there is an error opening it, however
|
||||||
|
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||||
|
pkg.revert();
|
||||||
|
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
|
|
|
@ -193,29 +193,35 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
|
|
||||||
// Word
|
// Word
|
||||||
|
extractor = ExtractorFactory.createExtractor(doc);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc)
|
extractor
|
||||||
instanceof WordExtractor
|
instanceof WordExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
extractor.getText().length() > 120
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
|
extractor = ExtractorFactory.createExtractor(doc6);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc6)
|
extractor
|
||||||
instanceof Word6Extractor
|
instanceof Word6Extractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc6).getText().length() > 20
|
extractor.getText().length() > 20
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
|
extractor = ExtractorFactory.createExtractor(doc95);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc95)
|
extractor
|
||||||
instanceof Word6Extractor
|
instanceof Word6Extractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(doc95).getText().length() > 120
|
extractor.getText().length() > 120
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(docx);
|
extractor = ExtractorFactory.createExtractor(docx);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
@ -241,62 +247,71 @@ public class TestExtractorFactory {
|
||||||
);
|
);
|
||||||
extractor.close();
|
extractor.close();
|
||||||
|
|
||||||
// PowerPoint
|
// PowerPoint (PPT)
|
||||||
|
extractor = ExtractorFactory.createExtractor(ppt);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(ppt)
|
extractor
|
||||||
instanceof PowerPointExtractor
|
instanceof PowerPointExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(ppt).getText().length() > 120
|
extractor.getText().length() > 120
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
|
// PowerPoint (PPTX)
|
||||||
extractor = ExtractorFactory.createExtractor(pptx);
|
extractor = ExtractorFactory.createExtractor(pptx);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
extractor
|
extractor
|
||||||
instanceof XSLFPowerPointExtractor
|
instanceof XSLFPowerPointExtractor
|
||||||
);
|
);
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(pptx);
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
extractor.getText().length() > 120
|
extractor.getText().length() > 120
|
||||||
);
|
);
|
||||||
extractor.close();
|
extractor.close();
|
||||||
|
|
||||||
// Visio - binary
|
// Visio - binary
|
||||||
|
extractor = ExtractorFactory.createExtractor(vsd);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(vsd)
|
extractor
|
||||||
instanceof VisioTextExtractor
|
instanceof VisioTextExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
extractor.getText().length() > 50
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
// Visio - vsdx
|
// Visio - vsdx
|
||||||
|
extractor = ExtractorFactory.createExtractor(vsdx);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(vsdx)
|
extractor
|
||||||
instanceof XDGFVisioExtractor
|
instanceof XDGFVisioExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(vsdx).getText().length() > 20
|
extractor.getText().length() > 20
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
// Publisher
|
// Publisher
|
||||||
|
extractor = ExtractorFactory.createExtractor(pub);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(pub)
|
extractor
|
||||||
instanceof PublisherTextExtractor
|
instanceof PublisherTextExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(pub).getText().length() > 50
|
extractor.getText().length() > 50
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
// Outlook msg
|
// Outlook msg
|
||||||
|
extractor = ExtractorFactory.createExtractor(msg);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(msg)
|
extractor
|
||||||
instanceof OutlookTextExtactor
|
instanceof OutlookTextExtactor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(msg).getText().length() > 50
|
extractor.getText().length() > 50
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
// Text
|
// Text
|
||||||
try {
|
try {
|
||||||
|
@ -557,13 +572,15 @@ public class TestExtractorFactory {
|
||||||
extractor.close();
|
extractor.close();
|
||||||
|
|
||||||
// Visio
|
// Visio
|
||||||
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
|
extractor
|
||||||
instanceof XDGFVisioExtractor
|
instanceof XDGFVisioExtractor
|
||||||
);
|
);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
extractor.getText().length() > 20
|
extractor.getText().length() > 20
|
||||||
);
|
);
|
||||||
|
extractor.close();
|
||||||
|
|
||||||
// Text
|
// Text
|
||||||
try {
|
try {
|
||||||
|
@ -670,6 +687,7 @@ public class TestExtractorFactory {
|
||||||
ExtractorFactory.createExtractor(xls);
|
ExtractorFactory.createExtractor(xls);
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
assertEquals(0, embeds.length);
|
assertEquals(0, embeds.length);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// Excel
|
// Excel
|
||||||
ext = (POIOLE2TextExtractor)
|
ext = (POIOLE2TextExtractor)
|
||||||
|
@ -690,6 +708,7 @@ public class TestExtractorFactory {
|
||||||
assertEquals(2, numXls);
|
assertEquals(2, numXls);
|
||||||
assertEquals(2, numWord);
|
assertEquals(2, numWord);
|
||||||
assertEquals(0, numMsg);
|
assertEquals(0, numMsg);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// Word
|
// Word
|
||||||
ext = (POIOLE2TextExtractor)
|
ext = (POIOLE2TextExtractor)
|
||||||
|
@ -709,6 +728,7 @@ public class TestExtractorFactory {
|
||||||
assertEquals(2, numXls);
|
assertEquals(2, numXls);
|
||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
assertEquals(0, numMsg);
|
assertEquals(0, numMsg);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// Word which contains an OOXML file
|
// Word which contains an OOXML file
|
||||||
ext = (POIOLE2TextExtractor)
|
ext = (POIOLE2TextExtractor)
|
||||||
|
@ -730,6 +750,7 @@ public class TestExtractorFactory {
|
||||||
assertEquals(0, numWord);
|
assertEquals(0, numWord);
|
||||||
assertEquals(1, numWordX);
|
assertEquals(1, numWordX);
|
||||||
assertEquals(0, numMsg);
|
assertEquals(0, numMsg);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// Outlook
|
// Outlook
|
||||||
ext = (OutlookTextExtactor)
|
ext = (OutlookTextExtactor)
|
||||||
|
@ -749,6 +770,7 @@ public class TestExtractorFactory {
|
||||||
assertEquals(0, numXls);
|
assertEquals(0, numXls);
|
||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
assertEquals(0, numMsg);
|
assertEquals(0, numMsg);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// Outlook with another outlook file in it
|
// Outlook with another outlook file in it
|
||||||
ext = (OutlookTextExtactor)
|
ext = (OutlookTextExtactor)
|
||||||
|
@ -768,7 +790,7 @@ public class TestExtractorFactory {
|
||||||
assertEquals(0, numXls);
|
assertEquals(0, numXls);
|
||||||
assertEquals(0, numWord);
|
assertEquals(0, numWord);
|
||||||
assertEquals(1, numMsg);
|
assertEquals(1, numMsg);
|
||||||
|
ext.close();
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
// TODO - Publisher
|
// TODO - Publisher
|
||||||
|
|
Loading…
Reference in New Issue