mirror of https://github.com/apache/poi.git
Use the last file extension when a filename contains multiple extensions; close the opened stream; extract String constants
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1776827 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f748e5087c
commit
2ea45dd2c9
|
@ -17,6 +17,8 @@
|
||||||
|
|
||||||
package org.apache.poi.ss.extractor;
|
package org.apache.poi.ss.extractor;
|
||||||
|
|
||||||
|
import static org.apache.poi.util.StringUtil.endsWithIgnoreCase;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
@ -50,6 +52,18 @@ import org.apache.poi.xssf.usermodel.XSSFObjectData;
|
||||||
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
|
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
|
||||||
|
|
||||||
|
// contentType
|
||||||
|
private static final String CONTENT_TYPE_BYTES = "binary/octet-stream";
|
||||||
|
private static final String CONTENT_TYPE_PDF = "application/pdf";
|
||||||
|
private static final String CONTENT_TYPE_DOC = "application/msword";
|
||||||
|
private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
|
||||||
|
|
||||||
|
// default file extension
|
||||||
|
private static final String PDF_EXT = ".pdf";
|
||||||
|
private static final String DOC_EXT = ".doc";
|
||||||
|
private static final String XLS_EXT = ".xls";
|
||||||
|
private static final String OLE_EXT = ".ole";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the list of known extractors, if you provide custom extractors, override this method
|
* @return the list of known extractors, if you provide custom extractors, override this method
|
||||||
*/
|
*/
|
||||||
|
@ -98,11 +112,10 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
if (od.hasDirectoryEntry()) {
|
if (od.hasDirectoryEntry()) {
|
||||||
data = extractOne((DirectoryNode)od.getDirectory());
|
data = extractOne((DirectoryNode)od.getDirectory());
|
||||||
} else {
|
} else {
|
||||||
String contentType = "binary/octet-stream";
|
|
||||||
if (od instanceof XSSFObjectData) {
|
if (od instanceof XSSFObjectData) {
|
||||||
contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
|
String contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
|
||||||
}
|
}
|
||||||
data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
|
data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
|
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
|
||||||
|
@ -119,7 +132,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
|
|
||||||
data.setShape(shape);
|
data.setShape(shape);
|
||||||
String filename = data.getFilename();
|
String filename = data.getFilename();
|
||||||
String extension = (filename == null || filename.indexOf('.') == -1) ? ".bin" : filename.substring(filename.indexOf('.'));
|
String extension = (filename == null || filename.lastIndexOf('.') == -1) ? ".bin" : filename.substring(filename.lastIndexOf('.'));
|
||||||
|
|
||||||
// try to find an alternative name
|
// try to find an alternative name
|
||||||
if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
|
if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
|
||||||
|
@ -157,7 +170,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
dest.writeFilesystem(bos);
|
dest.writeFilesystem(bos);
|
||||||
dest.close();
|
dest.close();
|
||||||
|
|
||||||
return new EmbeddedData(dn.getName(), bos.toByteArray(), "binary/octet-stream");
|
return new EmbeddedData(dn.getName(), bos.toByteArray(), CONTENT_TYPE_BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected EmbeddedData extract(Picture source) throws IOException {
|
protected EmbeddedData extract(Picture source) throws IOException {
|
||||||
|
@ -176,7 +189,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
try {
|
try {
|
||||||
// TODO: inspect the CompObj record for more details, i.e. the content type
|
// TODO: inspect the CompObj record for more details, i.e. the content type
|
||||||
Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
|
Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
|
||||||
return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), "binary/octet-stream");
|
return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), CONTENT_TYPE_BYTES);
|
||||||
} catch (Ole10NativeException e) {
|
} catch (Ole10NativeException e) {
|
||||||
throw new IOException(e);
|
throw new IOException(e);
|
||||||
}
|
}
|
||||||
|
@ -198,7 +211,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
InputStream is = dn.createDocumentInputStream("CONTENTS");
|
InputStream is = dn.createDocumentInputStream("CONTENTS");
|
||||||
IOUtils.copy(is, bos);
|
IOUtils.copy(is, bos);
|
||||||
is.close();
|
is.close();
|
||||||
return new EmbeddedData(dn.getName()+".pdf", bos.toByteArray(), "application/pdf");
|
return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -238,10 +251,10 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
byte[] pdfBytes = new byte[pictureBytesLen];
|
byte[] pdfBytes = new byte[pictureBytesLen];
|
||||||
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
|
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
|
||||||
String filename = source.getShapeName().trim();
|
String filename = source.getShapeName().trim();
|
||||||
if (!filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
|
if (!endsWithIgnoreCase(filename, PDF_EXT)) {
|
||||||
filename += ".pdf";
|
filename += PDF_EXT;
|
||||||
}
|
}
|
||||||
return new EmbeddedData(filename, pdfBytes, "application/pdf");
|
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -259,8 +272,8 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
EmbeddedData ed = super.extract(dn);
|
||||||
ed.setFilename(dn.getName()+".doc");
|
ed.setFilename(dn.getName() + DOC_EXT);
|
||||||
ed.setContentType("application/msword");
|
ed.setContentType(CONTENT_TYPE_DOC);
|
||||||
return ed;
|
return ed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -277,8 +290,8 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
EmbeddedData ed = super.extract(dn);
|
||||||
ed.setFilename(dn.getName()+".xls");
|
ed.setFilename(dn.getName() + XLS_EXT);
|
||||||
ed.setContentType("application/vnd.ms-excel");
|
ed.setContentType(CONTENT_TYPE_XLS);
|
||||||
return ed;
|
return ed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -291,7 +304,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
EmbeddedData ed = super.extract(dn);
|
||||||
ed.setFilename(dn.getName()+".ole");
|
ed.setFilename(dn.getName() + OLE_EXT);
|
||||||
// TODO: read the content type from CompObj stream
|
// TODO: read the content type from CompObj stream
|
||||||
return ed;
|
return ed;
|
||||||
}
|
}
|
||||||
|
@ -306,11 +319,14 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
copyNodes(srcDir, destDir);
|
copyNodes(srcDir, destDir);
|
||||||
} else {
|
} else {
|
||||||
InputStream is = src.createDocumentInputStream(e);
|
InputStream is = src.createDocumentInputStream(e);
|
||||||
|
try {
|
||||||
dest.createDocument(e.getName(), is);
|
dest.createDocument(e.getName(), is);
|
||||||
|
} finally {
|
||||||
is.close();
|
is.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue