From 368052f71e158e8dd74e07be34aad0523f89e9a1 Mon Sep 17 00:00:00 2001 From: Andreas Beeker Date: Thu, 5 Jan 2017 01:10:45 +0000 Subject: [PATCH] EmbeddedExtractor (for *SSF) - added OOXML support git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1777394 13f79535-47bb-0310-9956-ffa450edef68 --- src/java/org/apache/poi/hpsf/ClassID.java | 48 +++++++-- .../poi/ss/extractor/EmbeddedExtractor.java | 102 +++++++++++++----- 2 files changed, 111 insertions(+), 39 deletions(-) diff --git a/src/java/org/apache/poi/hpsf/ClassID.java b/src/java/org/apache/poi/hpsf/ClassID.java index 6fca6dd188..0dacf80139 100644 --- a/src/java/org/apache/poi/hpsf/ClassID.java +++ b/src/java/org/apache/poi/hpsf/ClassID.java @@ -30,17 +30,43 @@ import org.apache.poi.util.StringUtil; */ public class ClassID { - public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}"); - public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); - public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}"); - public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}"); - public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}"); - public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}"); - public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}"); - public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}"); - public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); - public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}"); - public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}"); + public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}"); + public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); + public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}"); + public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}"); + + // Excel V3 + public static final ClassID EXCEL_V3 = new ClassID("{00030000-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL_V3_CHART = new ClassID("{00030001-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL_V3_MACRO = new ClassID("{00030002-0000-0000-C000-000000000046}"); + // Excel V5 + public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL95_CHART = new ClassID("{00020811-0000-0000-C000-000000000046}"); + // Excel V8 + public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL97_CHART = new ClassID("{00020821-0000-0000-C000-000000000046}"); + // Excel V11 + public static final ClassID EXCEL2003 = new ClassID("{00020812-0000-0000-C000-000000000046}"); + // Excel V12 + public static final ClassID EXCEL2007 = new ClassID("{00020830-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL2007_MACRO= new ClassID("{00020832-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL2007_XLSB = new ClassID("{00020833-0000-0000-C000-000000000046}"); + // Excel V14 + public static final ClassID EXCEL2010 = new ClassID("{00024500-0000-0000-C000-000000000046}"); + public static final ClassID EXCEL2010_CHART= new ClassID("{00024505-0014-0000-C000-000000000046}"); + public static final ClassID EXCEL2010_ODS = new ClassID("{EABCECDB-CC1C-4A6F-B4E3-7F888A5ADFC8}"); + + public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}"); + public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}"); + public static final ClassID WORD2007 = new ClassID("{F4754C9B-64F5-4B40-8AF4-679732AC0607}"); + public static final ClassID WORD2007_MACRO = new ClassID("{18A06B6B-2F3F-4E2B-A611-52BE631B2D22}"); + + public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}"); + public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}"); + public static final ClassID POWERPOINT2007 = new ClassID("{CF4F55F4-8F87-4D47-80BB-5808164BB3F8}"); + public static final ClassID POWERPOINT2007_MACRO = new ClassID("{DC020317-E6E2-4A62-B9FA-B3EFE16626F4}"); + + public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}"); /**

The number of bytes occupied by this object in the byte * stream.

*/ diff --git a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java index b45985f8cc..2e74f6c31c 100644 --- a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java +++ b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java @@ -27,10 +27,10 @@ import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.List; -import java.util.Locale; import org.apache.poi.hpsf.ClassID; import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; @@ -43,12 +43,18 @@ import org.apache.poi.ss.usermodel.Shape; import org.apache.poi.ss.usermodel.ShapeContainer; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.util.Beta; import org.apache.poi.util.IOUtils; import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; import org.apache.poi.xssf.usermodel.XSSFObjectData; +/** + * This extractor class tries to identify various embedded documents within Excel files + * and provide them via a common interface, i.e. the EmbeddedData instances + */ +@Beta public class EmbeddedExtractor implements Iterable { private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class); @@ -58,19 +64,13 @@ public class EmbeddedExtractor implements Iterable { private static final String CONTENT_TYPE_DOC = "application/msword"; private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel"; - // default file extension - private static final String PDF_EXT = ".pdf"; - private static final String DOC_EXT = ".doc"; - private static final String XLS_EXT = ".xls"; - private static final String OLE_EXT = ".ole"; - /** * @return the list of known extractors, if you provide custom extractors, override this method */ @Override public Iterator iterator() { EmbeddedExtractor[] ee = { - new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor() + new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor() }; return Arrays.asList(ee).iterator(); } @@ -112,10 +112,11 @@ public class EmbeddedExtractor implements Iterable { if (od.hasDirectoryEntry()) { data = extractOne((DirectoryNode)od.getDirectory()); } else { + String contentType = CONTENT_TYPE_BYTES; if (od instanceof XSSFObjectData) { - String contentType = ((XSSFObjectData)od).getObjectPart().getContentType(); + contentType = ((XSSFObjectData)od).getObjectPart().getContentType(); } - data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES); + data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType); } } catch (Exception e) { LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e); @@ -211,7 +212,7 @@ public class EmbeddedExtractor implements Iterable { InputStream is = dn.createDocumentInputStream("CONTENTS"); IOUtils.copy(is, bos); is.close(); - return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF); + return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF); } @Override @@ -251,8 +252,8 @@ public class EmbeddedExtractor implements Iterable { byte[] pdfBytes = new byte[pictureBytesLen]; System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen); String filename = source.getShapeName().trim(); - if (!endsWithIgnoreCase(filename, PDF_EXT)) { - filename += PDF_EXT; + if (!endsWithIgnoreCase(filename, ".pdf")) { + filename += ".pdf"; } return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF); } @@ -260,38 +261,83 @@ public class EmbeddedExtractor implements Iterable { } - static class WordExtractor extends EmbeddedExtractor { + static class OOXMLExtractor extends EmbeddedExtractor { @Override public boolean canExtract(DirectoryNode dn) { - ClassID clsId = dn.getStorageClsid(); - return (ClassID.WORD95.equals(clsId) - || ClassID.WORD97.equals(clsId) - || dn.hasEntry("WordDocument")); + return dn.hasEntry("package"); } @Override public EmbeddedData extract(DirectoryNode dn) throws IOException { - EmbeddedData ed = super.extract(dn); - ed.setFilename(dn.getName() + DOC_EXT); - ed.setContentType(CONTENT_TYPE_DOC); - return ed; + + ClassID clsId = dn.getStorageClsid(); + + String contentType, ext; + if (ClassID.WORD2007.equals(clsId)) { + ext = ".docx"; + contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; + } else if (ClassID.WORD2007_MACRO.equals(clsId)) { + ext = ".docm"; + contentType = "application/vnd.ms-word.document.macroEnabled.12"; + } else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) { + ext = ".xlsx"; + contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; + } else if (ClassID.EXCEL2007_MACRO.equals(clsId)) { + ext = ".xlsm"; + contentType = "application/vnd.ms-excel.sheet.macroEnabled.12"; + } else if (ClassID.EXCEL2007_XLSB.equals(clsId)) { + ext = ".xlsb"; + contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"; + } else if (ClassID.POWERPOINT2007.equals(clsId)) { + ext = ".pptx"; + contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; + } else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) { + ext = ".ppsm"; + contentType = "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"; + } else { + ext = ".zip"; + contentType = "application/zip"; + } + + DocumentInputStream dis = dn.createDocumentInputStream("package"); + byte data[] = IOUtils.toByteArray(dis); + dis.close(); + + return new EmbeddedData(dn.getName()+ext, data, contentType); } } - static class ExcelExtractor extends EmbeddedExtractor { + static class BiffExtractor extends EmbeddedExtractor { @Override public boolean canExtract(DirectoryNode dn) { + return canExtractExcel(dn) || canExtractWord(dn); + } + + protected boolean canExtractExcel(DirectoryNode dn) { ClassID clsId = dn.getStorageClsid(); return (ClassID.EXCEL95.equals(clsId) - || ClassID.EXCEL97.equals(clsId) - || dn.hasEntry("Workbook") /*...*/); + || ClassID.EXCEL97.equals(clsId) + || dn.hasEntry("Workbook") /*...*/); + } + + protected boolean canExtractWord(DirectoryNode dn) { + ClassID clsId = dn.getStorageClsid(); + return (ClassID.WORD95.equals(clsId) + || ClassID.WORD97.equals(clsId) + || dn.hasEntry("WordDocument")); } @Override public EmbeddedData extract(DirectoryNode dn) throws IOException { EmbeddedData ed = super.extract(dn); - ed.setFilename(dn.getName() + XLS_EXT); - ed.setContentType(CONTENT_TYPE_XLS); + if (canExtractExcel(dn)) { + ed.setFilename(dn.getName() + ".xls"); + ed.setContentType(CONTENT_TYPE_XLS); + } else if (canExtractWord(dn)) { + ed.setFilename(dn.getName() + ".doc"); + ed.setContentType(CONTENT_TYPE_DOC); + } + return ed; } } @@ -304,7 +350,7 @@ public class EmbeddedExtractor implements Iterable { @Override public EmbeddedData extract(DirectoryNode dn) throws IOException { EmbeddedData ed = super.extract(dn); - ed.setFilename(dn.getName() + OLE_EXT); + ed.setFilename(dn.getName() + ".ole"); // TODO: read the content type from CombObj stream return ed; }