mirror of https://github.com/apache/poi.git
EmbeddedExtractor (for *SSF) - added OOXML support
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1777394 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
753ad37e13
commit
368052f71e
|
@ -30,17 +30,43 @@ import org.apache.poi.util.StringUtil;
|
||||||
*/
|
*/
|
||||||
public class ClassID
|
public class ClassID
|
||||||
{
|
{
|
||||||
public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}");
|
public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}");
|
||||||
public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
|
public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
|
||||||
public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}");
|
public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}");
|
||||||
public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
|
public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
|
||||||
public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
|
|
||||||
public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}");
|
// Excel V3
|
||||||
public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}");
|
public static final ClassID EXCEL_V3 = new ClassID("{00030000-0000-0000-C000-000000000046}");
|
||||||
public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}");
|
public static final ClassID EXCEL_V3_CHART = new ClassID("{00030001-0000-0000-C000-000000000046}");
|
||||||
public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
|
public static final ClassID EXCEL_V3_MACRO = new ClassID("{00030002-0000-0000-C000-000000000046}");
|
||||||
public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
|
// Excel V5
|
||||||
public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
|
public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL95_CHART = new ClassID("{00020811-0000-0000-C000-000000000046}");
|
||||||
|
// Excel V8
|
||||||
|
public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL97_CHART = new ClassID("{00020821-0000-0000-C000-000000000046}");
|
||||||
|
// Excel V11
|
||||||
|
public static final ClassID EXCEL2003 = new ClassID("{00020812-0000-0000-C000-000000000046}");
|
||||||
|
// Excel V12
|
||||||
|
public static final ClassID EXCEL2007 = new ClassID("{00020830-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL2007_MACRO= new ClassID("{00020832-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL2007_XLSB = new ClassID("{00020833-0000-0000-C000-000000000046}");
|
||||||
|
// Excel V14
|
||||||
|
public static final ClassID EXCEL2010 = new ClassID("{00024500-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL2010_CHART= new ClassID("{00024505-0014-0000-C000-000000000046}");
|
||||||
|
public static final ClassID EXCEL2010_ODS = new ClassID("{EABCECDB-CC1C-4A6F-B4E3-7F888A5ADFC8}");
|
||||||
|
|
||||||
|
public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}");
|
||||||
|
public static final ClassID WORD2007 = new ClassID("{F4754C9B-64F5-4B40-8AF4-679732AC0607}");
|
||||||
|
public static final ClassID WORD2007_MACRO = new ClassID("{18A06B6B-2F3F-4E2B-A611-52BE631B2D22}");
|
||||||
|
|
||||||
|
public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
|
||||||
|
public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
|
||||||
|
public static final ClassID POWERPOINT2007 = new ClassID("{CF4F55F4-8F87-4D47-80BB-5808164BB3F8}");
|
||||||
|
public static final ClassID POWERPOINT2007_MACRO = new ClassID("{DC020317-E6E2-4A62-B9FA-B3EFE16626F4}");
|
||||||
|
|
||||||
|
public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
|
||||||
|
|
||||||
/** <p>The number of bytes occupied by this object in the byte
|
/** <p>The number of bytes occupied by this object in the byte
|
||||||
* stream.</p> */
|
* stream.</p> */
|
||||||
|
|
|
@ -27,10 +27,10 @@ import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
import org.apache.poi.hpsf.ClassID;
|
import org.apache.poi.hpsf.ClassID;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||||
import org.apache.poi.poifs.filesystem.Entry;
|
import org.apache.poi.poifs.filesystem.Entry;
|
||||||
import org.apache.poi.poifs.filesystem.Ole10Native;
|
import org.apache.poi.poifs.filesystem.Ole10Native;
|
||||||
import org.apache.poi.poifs.filesystem.Ole10NativeException;
|
import org.apache.poi.poifs.filesystem.Ole10NativeException;
|
||||||
|
@ -43,12 +43,18 @@ import org.apache.poi.ss.usermodel.Shape;
|
||||||
import org.apache.poi.ss.usermodel.ShapeContainer;
|
import org.apache.poi.ss.usermodel.ShapeContainer;
|
||||||
import org.apache.poi.ss.usermodel.Sheet;
|
import org.apache.poi.ss.usermodel.Sheet;
|
||||||
import org.apache.poi.ss.usermodel.Workbook;
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
|
import org.apache.poi.util.Beta;
|
||||||
import org.apache.poi.util.IOUtils;
|
import org.apache.poi.util.IOUtils;
|
||||||
import org.apache.poi.util.LocaleUtil;
|
import org.apache.poi.util.LocaleUtil;
|
||||||
import org.apache.poi.util.POILogFactory;
|
import org.apache.poi.util.POILogFactory;
|
||||||
import org.apache.poi.util.POILogger;
|
import org.apache.poi.util.POILogger;
|
||||||
import org.apache.poi.xssf.usermodel.XSSFObjectData;
|
import org.apache.poi.xssf.usermodel.XSSFObjectData;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This extractor class tries to identify various embedded documents within Excel files
|
||||||
|
* and provide them via a common interface, i.e. the EmbeddedData instances
|
||||||
|
*/
|
||||||
|
@Beta
|
||||||
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
|
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
|
||||||
|
|
||||||
|
@ -58,19 +64,13 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
private static final String CONTENT_TYPE_DOC = "application/msword";
|
private static final String CONTENT_TYPE_DOC = "application/msword";
|
||||||
private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
|
private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
|
||||||
|
|
||||||
// default file extension
|
|
||||||
private static final String PDF_EXT = ".pdf";
|
|
||||||
private static final String DOC_EXT = ".doc";
|
|
||||||
private static final String XLS_EXT = ".xls";
|
|
||||||
private static final String OLE_EXT = ".ole";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the list of known extractors; override this method to provide custom extractors
|
* @return the list of known extractors; override this method to provide custom extractors
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Iterator<EmbeddedExtractor> iterator() {
|
public Iterator<EmbeddedExtractor> iterator() {
|
||||||
EmbeddedExtractor[] ee = {
|
EmbeddedExtractor[] ee = {
|
||||||
new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
|
new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
|
||||||
};
|
};
|
||||||
return Arrays.asList(ee).iterator();
|
return Arrays.asList(ee).iterator();
|
||||||
}
|
}
|
||||||
|
@ -112,10 +112,11 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
if (od.hasDirectoryEntry()) {
|
if (od.hasDirectoryEntry()) {
|
||||||
data = extractOne((DirectoryNode)od.getDirectory());
|
data = extractOne((DirectoryNode)od.getDirectory());
|
||||||
} else {
|
} else {
|
||||||
|
String contentType = CONTENT_TYPE_BYTES;
|
||||||
if (od instanceof XSSFObjectData) {
|
if (od instanceof XSSFObjectData) {
|
||||||
String contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
|
contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
|
||||||
}
|
}
|
||||||
data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES);
|
data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
|
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
|
||||||
|
@ -211,7 +212,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
InputStream is = dn.createDocumentInputStream("CONTENTS");
|
InputStream is = dn.createDocumentInputStream("CONTENTS");
|
||||||
IOUtils.copy(is, bos);
|
IOUtils.copy(is, bos);
|
||||||
is.close();
|
is.close();
|
||||||
return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF);
|
return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -251,8 +252,8 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
byte[] pdfBytes = new byte[pictureBytesLen];
|
byte[] pdfBytes = new byte[pictureBytesLen];
|
||||||
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
|
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
|
||||||
String filename = source.getShapeName().trim();
|
String filename = source.getShapeName().trim();
|
||||||
if (!endsWithIgnoreCase(filename, PDF_EXT)) {
|
if (!endsWithIgnoreCase(filename, ".pdf")) {
|
||||||
filename += PDF_EXT;
|
filename += ".pdf";
|
||||||
}
|
}
|
||||||
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
|
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
|
||||||
}
|
}
|
||||||
|
@ -260,38 +261,83 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static class WordExtractor extends EmbeddedExtractor {
|
static class OOXMLExtractor extends EmbeddedExtractor {
|
||||||
@Override
|
@Override
|
||||||
public boolean canExtract(DirectoryNode dn) {
|
public boolean canExtract(DirectoryNode dn) {
|
||||||
ClassID clsId = dn.getStorageClsid();
|
return dn.hasEntry("package");
|
||||||
return (ClassID.WORD95.equals(clsId)
|
|
||||||
|| ClassID.WORD97.equals(clsId)
|
|
||||||
|| dn.hasEntry("WordDocument"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
|
||||||
ed.setFilename(dn.getName() + DOC_EXT);
|
ClassID clsId = dn.getStorageClsid();
|
||||||
ed.setContentType(CONTENT_TYPE_DOC);
|
|
||||||
return ed;
|
String contentType, ext;
|
||||||
|
if (ClassID.WORD2007.equals(clsId)) {
|
||||||
|
ext = ".docx";
|
||||||
|
contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
|
||||||
|
} else if (ClassID.WORD2007_MACRO.equals(clsId)) {
|
||||||
|
ext = ".docm";
|
||||||
|
contentType = "application/vnd.ms-word.document.macroEnabled.12";
|
||||||
|
} else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) {
|
||||||
|
ext = ".xlsx";
|
||||||
|
contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
|
||||||
|
} else if (ClassID.EXCEL2007_MACRO.equals(clsId)) {
|
||||||
|
ext = ".xlsm";
|
||||||
|
contentType = "application/vnd.ms-excel.sheet.macroEnabled.12";
|
||||||
|
} else if (ClassID.EXCEL2007_XLSB.equals(clsId)) {
|
||||||
|
ext = ".xlsb";
|
||||||
|
contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12";
|
||||||
|
} else if (ClassID.POWERPOINT2007.equals(clsId)) {
|
||||||
|
ext = ".pptx";
|
||||||
|
contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
|
||||||
|
} else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) {
|
||||||
|
ext = ".ppsm";
|
||||||
|
contentType = "application/vnd.ms-powerpoint.slideshow.macroEnabled.12";
|
||||||
|
} else {
|
||||||
|
ext = ".zip";
|
||||||
|
contentType = "application/zip";
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentInputStream dis = dn.createDocumentInputStream("package");
|
||||||
|
byte data[] = IOUtils.toByteArray(dis);
|
||||||
|
dis.close();
|
||||||
|
|
||||||
|
return new EmbeddedData(dn.getName()+ext, data, contentType);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static class ExcelExtractor extends EmbeddedExtractor {
|
static class BiffExtractor extends EmbeddedExtractor {
|
||||||
@Override
|
@Override
|
||||||
public boolean canExtract(DirectoryNode dn) {
|
public boolean canExtract(DirectoryNode dn) {
|
||||||
|
return canExtractExcel(dn) || canExtractWord(dn);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean canExtractExcel(DirectoryNode dn) {
|
||||||
ClassID clsId = dn.getStorageClsid();
|
ClassID clsId = dn.getStorageClsid();
|
||||||
return (ClassID.EXCEL95.equals(clsId)
|
return (ClassID.EXCEL95.equals(clsId)
|
||||||
|| ClassID.EXCEL97.equals(clsId)
|
|| ClassID.EXCEL97.equals(clsId)
|
||||||
|| dn.hasEntry("Workbook") /*...*/);
|
|| dn.hasEntry("Workbook") /*...*/);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean canExtractWord(DirectoryNode dn) {
|
||||||
|
ClassID clsId = dn.getStorageClsid();
|
||||||
|
return (ClassID.WORD95.equals(clsId)
|
||||||
|
|| ClassID.WORD97.equals(clsId)
|
||||||
|
|| dn.hasEntry("WordDocument"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
EmbeddedData ed = super.extract(dn);
|
||||||
ed.setFilename(dn.getName() + XLS_EXT);
|
if (canExtractExcel(dn)) {
|
||||||
ed.setContentType(CONTENT_TYPE_XLS);
|
ed.setFilename(dn.getName() + ".xls");
|
||||||
|
ed.setContentType(CONTENT_TYPE_XLS);
|
||||||
|
} else if (canExtractWord(dn)) {
|
||||||
|
ed.setFilename(dn.getName() + ".doc");
|
||||||
|
ed.setContentType(CONTENT_TYPE_DOC);
|
||||||
|
}
|
||||||
|
|
||||||
return ed;
|
return ed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -304,7 +350,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
|
||||||
@Override
|
@Override
|
||||||
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
public EmbeddedData extract(DirectoryNode dn) throws IOException {
|
||||||
EmbeddedData ed = super.extract(dn);
|
EmbeddedData ed = super.extract(dn);
|
||||||
ed.setFilename(dn.getName() + OLE_EXT);
|
ed.setFilename(dn.getName() + ".ole");
|
||||||
// TODO: read the content type from CompObj stream
|
// TODO: read the content type from CompObj stream
|
||||||
return ed;
|
return ed;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue