From 9a2dcc327bf2ab775cd0372c76d5a7d9bd72776e Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 14 Jul 2005 11:31:26 +0000 Subject: [PATCH] Contribution from Yegor Kozlov (Bug 35630): Dumps out the raw contents of a PPT file in XML format git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353751 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/hslf/dev/PPTXMLDump.java | 254 ++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 src/scratchpad/src/org/apache/poi/hslf/dev/PPTXMLDump.java diff --git a/src/scratchpad/src/org/apache/poi/hslf/dev/PPTXMLDump.java b/src/scratchpad/src/org/apache/poi/hslf/dev/PPTXMLDump.java new file mode 100644 index 0000000000..b2cd4fda6e --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hslf/dev/PPTXMLDump.java @@ -0,0 +1,254 @@ +/* ==================================================================== + Copyright 2002-2004 Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hslf.dev; + +import org.apache.poi.util.LittleEndian; +import org.apache.poi.hslf.record.RecordTypes; +import org.apache.poi.poifs.filesystem.*; +import java.io.*; + +/** + * Utility class which dumps raw contents of a ppt file into XML format + * + * @author Yegor Kozlov + */ + +public class PPTXMLDump { + public static final int HEADER_SIZE = 8; //size of the record header + public static final int PICT_HEADER_SIZE = 25; //size of the picture header + public final static String PPDOC_ENTRY = "PowerPoint Document"; + public final static String PICTURES_ENTRY = "Pictures"; + public static String CR = System.getProperty("line.separator"); + + protected Writer out; + protected byte[] docstream; + protected byte[] pictstream; + protected boolean hexHeader = true; + + public PPTXMLDump(File ppt) throws IOException { + FileInputStream fis = new FileInputStream(ppt); + POIFSFileSystem fs = new POIFSFileSystem(fis); + fis.close(); + + //read the document entry from OLE file system + DocumentEntry entry = (DocumentEntry)fs.getRoot().getEntry(PPDOC_ENTRY); + docstream = new byte[entry.getSize()]; + DocumentInputStream is = fs.createDocumentInputStream(PPDOC_ENTRY); + is.read(docstream); + + try { + entry = (DocumentEntry)fs.getRoot().getEntry(PICTURES_ENTRY); + pictstream = new byte[entry.getSize()]; + is = fs.createDocumentInputStream(PICTURES_ENTRY); + is.read(pictstream); + } catch(FileNotFoundException e){ + //silently catch errors if the presentation does not contain pictures + } + } + + /** + * Dump the structure of the supplied PPT file into XML + * @param out Writer to write out + * @throws java.io.IOException + */ + public void dump(Writer out) throws IOException { + this.out = out; + + int padding = 0; + write(out, "" + CR, padding); + padding++; + if (pictstream != null){ + write(out, "" + CR, padding); + dumpPictures(pictstream, padding); + write(out, "" + CR, padding); + } + //dump the structure of the powerpoint document + write(out, "" + CR, padding); + padding++; + dump(docstream, 0, docstream.length, padding); + padding--; + write(out, "" + CR, padding); + padding--; + write(out, "", padding); + } + + /** + * Dump a part of the document stream into XML + * @param data PPT binary data + * @param offset offset from the beginning of the document + * @param length of the document + * @param padding used for formatting results + * @throws java.io.IOException + */ + public void dump(byte[] data, int offset, int length, int padding) throws IOException { + int pos = offset; + while (pos <= (offset + length - HEADER_SIZE)){ + if (pos < 0) break; + + //read record header + int info = LittleEndian.getUShort(data, pos); + pos += LittleEndian.SHORT_SIZE; + int type = LittleEndian.getUShort(data, pos); + pos += LittleEndian.SHORT_SIZE; + int size = (int)LittleEndian.getUInt(data, pos); + pos += LittleEndian.INT_SIZE; + + //get name of the record by type + String recname = RecordTypes.recordName(type); + write(out, "<"+recname + " info=\""+info+"\" type=\""+type+"\" size=\""+size+"\" offset=\""+(pos-8)+"\"", padding); + if (hexHeader){ + out.write(" header=\""); + dump(out, data, pos-8, 8, 0, false); + out.write("\""); + } + out.write(">" + CR); + padding++; + //this check works both for Escher and PowerPoint records + boolean isContainer = (info & 0x000F) == 0x000F; + if (isContainer) { + //continue to dump child records + dump(data, pos, size, padding); + } else { + //dump first 100 bytes of the atom data + dump(out, data, pos, Math.min(size, 100), padding, true); + } + padding--; + write(out, "" + CR, padding); + + pos += size; + } + } + + /** + * Dumps the Pictures OLE stream into XML. + * + * @param data from the Pictures OLE data stream + * @param padding + * @throws java.io.IOException + */ + public void dumpPictures(byte[] data, int padding) throws IOException { + int pos = 0; + while (pos < data.length) { + byte[] header = new byte[PICT_HEADER_SIZE]; + + System.arraycopy(data, pos, header, 0, header.length); + int size = LittleEndian.getInt(header, 4) - 17; + byte[] pictdata = new byte[size]; + System.arraycopy(data, pos + PICT_HEADER_SIZE, pictdata, 0, pictdata.length); + pos += PICT_HEADER_SIZE + size; + + padding++; + write(out, "" + CR, padding); + padding++; + write(out, "
" + CR, padding); + dump(out, header, 0, header.length, padding, true); + write(out, "
" + CR, padding); + write(out, "" + CR, padding); + dump(out, pictdata, 0, Math.min(pictdata.length, 100), padding, true); + write(out, "" + CR, padding); + padding--; + write(out, "
" + CR, padding); + padding--; + + } + } + + public static void main(String[] args) throws Exception { + if (args.length == 0){ + System.out.println( + "Usage: PPTXMLDump (options) pptfile\n" + + "Where options include:\n" + + " -f write output to .xml file in the current directory" + ); + return; + } + boolean outFile = false; + for (int i = 0; i < args.length; i++){ + + if (args[i].startsWith("-")) { + if ("-f".equals(args[i])){ + //write ouput to a file + outFile = true; + } + } else { + File ppt = new File(args[i]); + PPTXMLDump dump = new PPTXMLDump(ppt); + System.out.println("Dumping " + args[i]); + + if (outFile){ + FileWriter out = new FileWriter(ppt.getName() + ".xml"); + dump.dump(out); + out.close(); + } else { + StringWriter out = new StringWriter(); + dump.dump(out); + System.out.println(out.toString()); + } + } + + } + } + + + /** + * write a string to out with the specified padding + */ + private static void write(Writer out, String str, int padding) throws IOException { + for (int i = 0; i < padding; i++) out.write(" "); + out.write(str); + } + + private String getPictureType(byte[] header){ + String type; + int meta = LittleEndian.getUShort(header, 0); + + switch(meta){ + case 0x46A0: type = "jpeg"; break; + case 0x2160: type = "wmf"; break; + case 0x6E00: type = "png"; break; + default: type = "unknown"; break; + } + return type; + } + + /** + * dump binary data to out with the specified padding + */ + private static void dump(Writer out, byte[] data, int offset, int length, int padding, boolean nl) throws IOException { + int linesize = 25; + for (int i = 0; i < padding; i++) out.write(" "); + int i; + for (i = offset; i < (offset + length); i++) { + int c = data[i]; + out.write((char) hexval[(c & 0xF0) >> 4]); + out.write((char) hexval[(c & 0x0F) >> 0]); + out.write(' '); + if((i+1-offset) % linesize == 0 && i != (offset + length-1)) { + out.write(CR); + for (int j = 0; j < padding; j++) out.write(" "); + } + } + if(nl && length > 0)out.write(CR); + } + + private static final byte hexval[] = + {(byte) '0', (byte) '1', (byte) '2', (byte) '3', + (byte) '4', (byte) '5', (byte) '6', (byte) '7', + (byte) '8', (byte) '9', (byte) 'A', (byte) 'B', + (byte) 'C', (byte) 'D', (byte) 'E', (byte) 'F'}; + +}