Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@647186 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-11 14:43:05 +00:00
parent 3351310f3e
commit 1c998e90b3
10 changed files with 336 additions and 32 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>

View File

@ -161,17 +161,37 @@ public class HSSFWorkbook extends POIDocument
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem
* @exception IOException if the stream cannot be read
*/
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
super(fs);
this(fs.getRoot(), fs, preserveNodes);
}
/**
* given a POI POIFSFileSystem object, and a specific directory
* within it, read in its Workbook and populate the high and
* low level models. If you're reading in a workbook...start here.
*
* @param directory the POI filesystem directory to process from
* @param fs the POI filesystem that contains the Workbook stream.
* @param preserveNodes whether to preseve other nodes, such as
* macros. This takes more memory, so only say yes if you
* need to. If set, will store all of the POIFSFileSystem
* in memory
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem
* @exception IOException if the stream cannot be read
*/
public HSSFWorkbook(DirectoryNode directory, POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
super(directory, fs);
this.preserveNodes = preserveNodes;
// If we're not preserving nodes, don't track the
// POIFS any more
if(! preserveNodes) {
this.filesystem = null;
this.directory = null;
}
sheets = new ArrayList(INITIAL_CAPACITY);
@ -182,13 +202,13 @@ public class HSSFWorkbook extends POIDocument
// put theirs in one called "WORKBOOK"
String workbookName = "Workbook";
try {
fs.getRoot().getEntry(workbookName);
directory.getEntry(workbookName);
// Is the default name
} catch(FileNotFoundException fe) {
// Try the upper case form
try {
workbookName = "WORKBOOK";
fs.getRoot().getEntry(workbookName);
directory.getEntry(workbookName);
} catch(FileNotFoundException wfe) {
// Doesn't contain it in either form
throw new IllegalArgumentException("The supplied POIFSFileSystem contained neither a 'Workbook' entry, nor a 'WORKBOOK' entry. Is it really an excel file?");
@ -198,7 +218,7 @@ public class HSSFWorkbook extends POIDocument
// Grab the data from the workbook stream, however
// it happens to be spelt.
InputStream stream = fs.createDocumentInputStream(workbookName);
InputStream stream = directory.createDocumentInputStream(workbookName);
EventRecordFactory factory = new EventRecordFactory();

View File

@ -0,0 +1,81 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.poifs.dev;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * A lister of the entries in POIFS files.
 *
 * Prints the directory tree of each file to standard output, one
 * entry per line, indenting each nested level by one more step.
 *
 * Much simpler than {@link POIFSViewer}
 */
public class POIFSLister {
    /**
     * Display the entries of multiple POIFS files
     *
     * @param args the names of the files to be displayed
     * @throws IOException if one of the files cannot be opened or read
     */
    public static void main(final String args[]) throws IOException {
        if (args.length == 0)
        {
            System.err.println("Must specify at least one file to view");
            System.exit(1);
        }
        for (int j = 0; j < args.length; j++)
        {
            viewFile(args[ j ]);
        }
    }

    /**
     * Display the entries of a single POIFS file, starting from
     * its root directory.
     *
     * @param filename the name of the file to be displayed
     * @throws IOException if the file cannot be opened or read
     */
    public static void viewFile(final String filename) throws IOException
    {
        FileInputStream in = new FileInputStream(filename);
        try {
            POIFSFileSystem fs = new POIFSFileSystem(in);
            displayDirectory(fs.getRoot(), "");
        } finally {
            // Always release the file handle, even if parsing
            // or listing fails part way through
            in.close();
        }
    }

    /**
     * Print the name of the given directory, followed by all of
     * its entries. Child directories are listed recursively.
     *
     * @param dir the directory to list
     * @param indent the prefix to print before the directory name;
     *  children are printed with a longer prefix
     */
    public static void displayDirectory(DirectoryNode dir, String indent) {
        System.out.println(indent + dir.getName() + " -");
        String newIndent = indent + " ";

        for(Iterator it = dir.getEntries(); it.hasNext(); ) {
            Object entry = it.next();
            if(entry instanceof DirectoryNode) {
                displayDirectory((DirectoryNode)entry, newIndent);
            } else {
                DocumentNode doc = (DocumentNode)entry;
                String name = doc.getName();
                // Some entry names start with a low-valued control
                // character. Show those with the character stripped,
                // plus an alternate form giving its code. The code is
                // always below 10, so its decimal and hex digits match.
                // An empty name is printed as-is.
                if(name.length() > 0 && name.charAt(0) < 10) {
                    String altname = "(0x0" + (int)name.charAt(0) + ")" + name.substring(1);
                    name = name.substring(1) + " <" + altname + ">";
                }
                System.out.println(newIndent + name);
            }
        }
    }
}

View File

@ -45,6 +45,7 @@ import org.apache.poi.hslf.record.Record;
import org.apache.poi.hslf.record.UserEditAtom;
import org.apache.poi.hslf.usermodel.ObjectData;
import org.apache.poi.hslf.usermodel.PictureData;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -124,7 +125,21 @@ public class HSLFSlideShow extends POIDocument
*/
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{
super(filesystem);
this(filesystem.getRoot(), filesystem);
}
/**
* Constructs a Powerpoint document from a specific point in a
* POIFS Filesystem. Parses the document and places all the
* important stuff into data structures.
*
* @param dir the POIFS directory to read from
* @param filesystem the POIFS FileSystem to read from
* @throws IOException if there is a problem while parsing the document.
*/
public HSLFSlideShow(DirectoryNode dir, POIFSFileSystem filesystem) throws IOException
{
super(dir, filesystem);
// First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents
@ -186,11 +201,11 @@ public class HSLFSlideShow extends POIDocument
{
// Get the main document stream
DocumentEntry docProps =
(DocumentEntry)filesystem.getRoot().getEntry("PowerPoint Document");
(DocumentEntry)directory.getEntry("PowerPoint Document");
// Grab the document stream
_docstream = new byte[docProps.getSize()];
filesystem.createDocumentInputStream("PowerPoint Document").read(_docstream);
directory.createDocumentInputStream("PowerPoint Document").read(_docstream);
}
/**
@ -272,7 +287,7 @@ public class HSLFSlideShow extends POIDocument
*/
private void readCurrentUserStream() {
try {
currentUser = new CurrentUserAtom(filesystem);
currentUser = new CurrentUserAtom(directory);
} catch(IOException ie) {
logger.log(POILogger.ERROR, "Error finding Current User Atom:\n" + ie);
currentUser = new CurrentUserAtom();
@ -293,9 +308,9 @@ public class HSLFSlideShow extends POIDocument
byte[] pictstream;
try {
DocumentEntry entry = (DocumentEntry)filesystem.getRoot().getEntry("Pictures");
DocumentEntry entry = (DocumentEntry)directory.getEntry("Pictures");
pictstream = new byte[entry.getSize()];
DocumentInputStream is = filesystem.createDocumentInputStream("Pictures");
DocumentInputStream is = directory.createDocumentInputStream("Pictures");
is.read(pictstream);
} catch (FileNotFoundException e){
// Silently catch exceptions if the presentation doesn't

View File

@ -93,9 +93,15 @@ public class CurrentUserAtom
* Find the Current User in the filesystem, and create from that
*/
public CurrentUserAtom(POIFSFileSystem fs) throws IOException {
// Delegate to the DirectoryNode constructor, reading from the
// root directory of the supplied filesystem
this(fs.getRoot());
}
/**
* Find the Current User in the filesystem, and create from that
*/
public CurrentUserAtom(DirectoryNode dir) throws IOException {
// Decide how big it is
DocumentEntry docProps =
(DocumentEntry)fs.getRoot().getEntry("Current User");
(DocumentEntry)dir.getEntry("Current User");
_contents = new byte[docProps.getSize()];
// Check it's big enough - if it's not at least 28 bytes long, then
@ -105,7 +111,7 @@ public class CurrentUserAtom
}
// Grab the contents
InputStream in = fs.createDocumentInputStream("Current User");
InputStream in = dir.createDocumentInputStream("Current User");
in.read(_contents);
// Set everything up

View File

@ -21,6 +21,12 @@
package org.apache.poi.hslf.extractor;
import java.io.FileInputStream;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
/**
@ -35,6 +41,8 @@ public class TextExtractor extends TestCase {
private PowerPointExtractor ppe2;
/** Where to go looking for our test files */
private String dirname;
/** Where our embeded files live */
private String pdirname;
public TextExtractor() throws Exception {
dirname = System.getProperty("HSLF.testdata.path");
@ -42,6 +50,8 @@ public class TextExtractor extends TestCase {
ppe = new PowerPointExtractor(filename);
String filename2 = dirname + "/with_textbox.ppt";
ppe2 = new PowerPointExtractor(filename2);
pdirname = System.getProperty("POIFS.testdata.path");
}
public void testReadSheetText() throws Exception {
@ -123,9 +133,87 @@ public class TextExtractor extends TestCase {
char[] expC = exp.toCharArray();
char[] actC = act.toCharArray();
for(int i=0; i<expC.length; i++) {
System.out.println(i + "\t" + expC[i] + " " + actC[i]);
assertEquals(expC[i],actC[i]);
assertEquals("Char " + i, expC[i], actC[i]);
}
assertEquals(exp,act);
}
/**
 * Test that we can get text from the two PowerPoint documents
 * held in sub-directories of an Excel file, by opening each
 * directory directly as a slide show.
 */
public void testExtractFromEmbeded() throws Exception {
    String filename3 = pdirname + "/excel_with_embeded.xls";
    FileInputStream in = new FileInputStream(filename3);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        HSLFSlideShow ss;

        DirectoryNode dirA = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B6");
        DirectoryNode dirB = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B3");

        assertNotNull(dirA.getEntry("PowerPoint Document"));
        assertNotNull(dirB.getEntry("PowerPoint Document"));

        // Check the first file
        ss = new HSLFSlideShow(dirA, fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
            ppe.getText(true, false)
        );

        // And the second
        ss = new HSLFSlideShow(dirB, fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
            ppe.getText(true, false)
        );
    } finally {
        // Release the file handle even if an assertion fails
        in.close();
    }
}
/**
 * A powerpoint file with embedded powerpoint files
 * TODO - figure out how to handle this, as ppt
 *  appears to embed not as ole2 streams
 *
 * Disabled: the name lacks the "test" prefix, so JUnit's
 *  name-based discovery does not run it.
 */
public void DISABLEDtestExtractFromOwnEmbeded() throws Exception {
    String filename3 = pdirname + "/ppt_with_embeded.ppt";
    FileInputStream in = new FileInputStream(filename3);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        HSLFSlideShow ss;

        DirectoryNode dirA = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B6");
        DirectoryNode dirB = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B3");

        assertNotNull(dirA.getEntry("PowerPoint Document"));
        assertNotNull(dirB.getEntry("PowerPoint Document"));

        // Check the first file
        ss = new HSLFSlideShow(dirA, fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
            ppe.getText(true, false)
        );

        // And the second
        ss = new HSLFSlideShow(dirB, fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
            ppe.getText(true, false)
        );

        // Check the master doc two ways
        ss = new HSLFSlideShow(fs.getRoot(), fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("I have embeded files in me\n",
            ppe.getText(true, false)
        );

        ss = new HSLFSlideShow(fs);
        ppe = new PowerPointExtractor(ss);
        assertEquals("I have embeded files in me\n",
            ppe.getText(true, false)
        );
    } finally {
        // Release the file handle even if an assertion fails
        in.close();
    }
}
}

View File

@ -17,17 +17,13 @@
package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
import java.util.Iterator;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Test the different routes to extracting text
*
@ -110,22 +106,47 @@ public class TestWordExtractor extends TestCase {
/**
* Test that we can get data from an
* embeded word document
* Test that we can get data from two different
* embeded word documents
* @throws Exception
*/
public void testExtractFromEmbeded() throws Exception {
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
DirectoryNode dir = (DirectoryNode)
fs.getRoot().getEntry("MBD03F25D8D");
// Should have WordDocument and 1Table
assertNotNull(dir.getEntry("1Table"));
assertNotNull(dir.getEntry("WordDocument"));
HWPFDocument doc;
WordExtractor extractor3;
HWPFDocument doc = new HWPFDocument(dir, fs);
WordExtractor extractor3 = new WordExtractor(doc);
DirectoryNode dirA = (DirectoryNode)
fs.getRoot().getEntry("MBD0000A3B7");
DirectoryNode dirB = (DirectoryNode)
fs.getRoot().getEntry("MBD0000A3B2");
// Should have WordDocument and 1Table
assertNotNull(dirA.getEntry("1Table"));
assertNotNull(dirA.getEntry("WordDocument"));
assertNotNull(dirB.getEntry("1Table"));
assertNotNull(dirB.getEntry("WordDocument"));
// Check each in turn
doc = new HWPFDocument(dirA, fs);
extractor3 = new WordExtractor(doc);
assertNotNull(extractor3.getText());
assertTrue(extractor3.getText().length() > 20);
assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
extractor3.getText());
assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle());
assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject());
doc = new HWPFDocument(dirB, fs);
extractor3 = new WordExtractor(doc);
assertNotNull(extractor3.getText());
assertTrue(extractor3.getText().length() > 20);
assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
extractor3.getText());
assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
}
}

View File

@ -17,12 +17,15 @@
package org.apache.poi.hssf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import junit.framework.TestCase;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
*
@ -118,4 +121,72 @@ public final class TestExcelExtractor extends TestCase {
assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
}
/**
 * Excel workbooks embedded in a non-excel file (a word document),
 *  each read from its own directory under "ObjectPool"
 */
public void testWithEmbeded() throws Exception {
    String pdirname = System.getProperty("POIFS.testdata.path");
    String filename = pdirname + "/word_with_embeded.doc";
    FileInputStream in = new FileInputStream(filename);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);

        DirectoryNode objPool = (DirectoryNode)
            fs.getRoot().getEntry("ObjectPool");
        DirectoryNode dirA = (DirectoryNode)
            objPool.getEntry("_1269427460");
        DirectoryNode dirB = (DirectoryNode)
            objPool.getEntry("_1269427461");

        HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true);
        HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true);

        ExcelExtractor exA = new ExcelExtractor(wbA);
        ExcelExtractor exB = new ExcelExtractor(wbB);

        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n",
                exA.getText());
        assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());

        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n",
                exB.getText());
        assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
    } finally {
        // Release the file handle even if an assertion fails
        in.close();
    }
}
/**
 * Excel embedded in excel: checks the two embedded workbooks,
 *  and then the containing workbook itself
 */
public void testWithEmbededInOwn() throws Exception {
    String pdirname = System.getProperty("POIFS.testdata.path");
    String filename = pdirname + "/excel_with_embeded.xls";
    FileInputStream in = new FileInputStream(filename);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);

        DirectoryNode dirA = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B5");
        DirectoryNode dirB = (DirectoryNode)
            fs.getRoot().getEntry("MBD0000A3B4");

        HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true);
        HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true);

        ExcelExtractor exA = new ExcelExtractor(wbA);
        ExcelExtractor exB = new ExcelExtractor(wbB);

        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n",
                exA.getText());
        assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());

        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n",
                exB.getText());
        assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());

        // And the base file too
        ExcelExtractor ex = new ExcelExtractor(fs);
        assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n",
                ex.getText());
        assertEquals("Excel With Embeded", ex.getSummaryInformation().getTitle());
    } finally {
        // Release the file handle even if an assertion fails
        in.close();
    }
}
}