EmbeddedExtractor (for *SSF) - added OOXML support

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1777394 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2017-01-05 01:10:45 +00:00
parent 753ad37e13
commit 368052f71e
2 changed files with 111 additions and 39 deletions

View File

@ -34,12 +34,38 @@ public class ClassID
// Generic / legacy names
public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}");
public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
// Excel V3
public static final ClassID EXCEL_V3 = new ClassID("{00030000-0000-0000-C000-000000000046}");
public static final ClassID EXCEL_V3_CHART = new ClassID("{00030001-0000-0000-C000-000000000046}");
public static final ClassID EXCEL_V3_MACRO = new ClassID("{00030002-0000-0000-C000-000000000046}");
// Excel V5
public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}");
public static final ClassID EXCEL95_CHART = new ClassID("{00020811-0000-0000-C000-000000000046}");
// Excel V8
public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
public static final ClassID EXCEL97_CHART = new ClassID("{00020821-0000-0000-C000-000000000046}");
// Excel V11
public static final ClassID EXCEL2003 = new ClassID("{00020812-0000-0000-C000-000000000046}");
// Excel V12
public static final ClassID EXCEL2007 = new ClassID("{00020830-0000-0000-C000-000000000046}");
public static final ClassID EXCEL2007_MACRO= new ClassID("{00020832-0000-0000-C000-000000000046}");
public static final ClassID EXCEL2007_XLSB = new ClassID("{00020833-0000-0000-C000-000000000046}");
// Excel V14
public static final ClassID EXCEL2010 = new ClassID("{00024500-0000-0000-C000-000000000046}");
public static final ClassID EXCEL2010_CHART= new ClassID("{00024505-0014-0000-C000-000000000046}");
public static final ClassID EXCEL2010_ODS = new ClassID("{EABCECDB-CC1C-4A6F-B4E3-7F888A5ADFC8}");
// Word
public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}");
public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}");
public static final ClassID WORD2007 = new ClassID("{F4754C9B-64F5-4B40-8AF4-679732AC0607}");
public static final ClassID WORD2007_MACRO = new ClassID("{18A06B6B-2F3F-4E2B-A611-52BE631B2D22}");
// PowerPoint - note that POWERPOINT97 is built from the same GUID string as PPT_SHOW
public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
public static final ClassID POWERPOINT2007 = new ClassID("{CF4F55F4-8F87-4D47-80BB-5808164BB3F8}");
public static final ClassID POWERPOINT2007_MACRO = new ClassID("{DC020317-E6E2-4A62-B9FA-B3EFE16626F4}");
// Other embeddable objects
public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
/** <p>The number of bytes occupied by this object in the byte

View File

@ -27,10 +27,10 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
@ -43,12 +43,18 @@ import org.apache.poi.ss.usermodel.Shape;
import org.apache.poi.ss.usermodel.ShapeContainer;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xssf.usermodel.XSSFObjectData;
/**
* This extractor class tries to identify various embedded documents within Excel files
* and provide them via a common interface, i.e. the EmbeddedData instances
*/
@Beta
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
@ -58,19 +64,13 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
private static final String CONTENT_TYPE_DOC = "application/msword";
private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
// default file extension
private static final String PDF_EXT = ".pdf";
private static final String DOC_EXT = ".doc";
private static final String XLS_EXT = ".xls";
private static final String OLE_EXT = ".ole";
/**
* Builds a new array of the built-in extractor instances on every call and
* returns an iterator over it, in the order the instances are listed below.
* If you provide custom extractors, override this method.
*
* @return an iterator over the list of known extractors
*/
@Override
public Iterator<EmbeddedExtractor> iterator() {
EmbeddedExtractor[] ee = {
new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
};
return Arrays.asList(ee).iterator();
}
@ -112,10 +112,11 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
if (od.hasDirectoryEntry()) {
data = extractOne((DirectoryNode)od.getDirectory());
} else {
String contentType = CONTENT_TYPE_BYTES;
if (od instanceof XSSFObjectData) {
String contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
}
data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES);
data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
}
} catch (Exception e) {
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
@ -211,7 +212,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
InputStream is = dn.createDocumentInputStream("CONTENTS");
IOUtils.copy(is, bos);
is.close();
return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF);
return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
}
@Override
@ -251,8 +252,8 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
byte[] pdfBytes = new byte[pictureBytesLen];
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
String filename = source.getShapeName().trim();
if (!endsWithIgnoreCase(filename, PDF_EXT)) {
filename += PDF_EXT;
if (!endsWithIgnoreCase(filename, ".pdf")) {
filename += ".pdf";
}
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
}
@ -260,9 +261,66 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
}
static class WordExtractor extends EmbeddedExtractor {
/**
 * Extractor for embeddings whose directory node contains a "package" stream.
 * The storage class id of the node selects the file extension and content type
 * reported for the extracted bytes; unrecognized class ids fall back to
 * ".zip" / "application/zip".
 */
static class OOXMLExtractor extends EmbeddedExtractor {
    /**
     * @param dn the directory node of the embedded object
     * @return {@code true} if the node has an entry named "package"
     */
    @Override
    public boolean canExtract(DirectoryNode dn) {
        return dn.hasEntry("package");
    }

    /**
     * Reads the complete "package" stream of the given node and wraps it,
     * together with a file name and content type derived from the storage
     * class id, into an {@link EmbeddedData} instance.
     *
     * @param dn the directory node of the embedded object
     * @return the embedded data; the file name is the node name plus the derived extension
     * @throws IOException if the "package" stream can't be opened or read
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        ClassID clsId = dn.getStorageClsid();

        String contentType, ext;
        if (ClassID.WORD2007.equals(clsId)) {
            ext = ".docx";
            contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        } else if (ClassID.WORD2007_MACRO.equals(clsId)) {
            ext = ".docm";
            contentType = "application/vnd.ms-word.document.macroEnabled.12";
        } else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) {
            ext = ".xlsx";
            contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
        } else if (ClassID.EXCEL2007_MACRO.equals(clsId)) {
            ext = ".xlsm";
            contentType = "application/vnd.ms-excel.sheet.macroEnabled.12";
        } else if (ClassID.EXCEL2007_XLSB.equals(clsId)) {
            ext = ".xlsb";
            contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12";
        } else if (ClassID.POWERPOINT2007.equals(clsId)) {
            ext = ".pptx";
            contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
        } else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) {
            // NOTE(review): this maps the macro class id to the slideshow variant (.ppsm);
            // confirm whether the presentation variant (.pptm) is the intended target
            ext = ".ppsm";
            contentType = "application/vnd.ms-powerpoint.slideshow.macroEnabled.12";
        } else {
            // unknown class id - hand the bytes out as a generic zip archive
            ext = ".zip";
            contentType = "application/zip";
        }

        DocumentInputStream dis = dn.createDocumentInputStream("package");
        byte[] data;
        try {
            data = IOUtils.toByteArray(dis);
        } finally {
            // release the stream even if reading fails half-way
            dis.close();
        }

        return new EmbeddedData(dn.getName()+ext, data, contentType);
    }
}
static class BiffExtractor extends EmbeddedExtractor {
/**
 * @param dn the directory node of the embedded object
 * @return {@code true} if the node passes the Excel check or, failing that, the Word check
 */
@Override
public boolean canExtract(DirectoryNode dn) {
    // the Excel check runs first; the Word check is only consulted when it fails
    if (canExtractExcel(dn)) {
        return true;
    }
    return canExtractWord(dn);
}
/**
 * @param dn the directory node of the embedded object
 * @return {@code true} if the storage class id equals {@code EXCEL95} or {@code EXCEL97},
 *  or if the node has an entry named "Workbook"
 */
protected boolean canExtractExcel(DirectoryNode dn) {
    final ClassID storageId = dn.getStorageClsid();
    if (ClassID.EXCEL95.equals(storageId) || ClassID.EXCEL97.equals(storageId)) {
        return true;
    }
    // no matching class id - fall back to looking for the stream by name /*...*/
    return dn.hasEntry("Workbook");
}
protected boolean canExtractWord(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (ClassID.WORD95.equals(clsId)
|| ClassID.WORD97.equals(clsId)
@ -272,26 +330,14 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
ed.setFilename(dn.getName() + DOC_EXT);
ed.setContentType(CONTENT_TYPE_DOC);
return ed;
}
}
static class ExcelExtractor extends EmbeddedExtractor {
@Override
public boolean canExtract(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (ClassID.EXCEL95.equals(clsId)
|| ClassID.EXCEL97.equals(clsId)
|| dn.hasEntry("Workbook") /*...*/);
}
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
ed.setFilename(dn.getName() + XLS_EXT);
if (canExtractExcel(dn)) {
ed.setFilename(dn.getName() + ".xls");
ed.setContentType(CONTENT_TYPE_XLS);
} else if (canExtractWord(dn)) {
ed.setFilename(dn.getName() + ".doc");
ed.setContentType(CONTENT_TYPE_DOC);
}
return ed;
}
}
@ -304,7 +350,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
ed.setFilename(dn.getName() + OLE_EXT);
ed.setFilename(dn.getName() + ".ole");
// TODO: read the content type from CombObj stream
return ed;
}