From 450c9754f3e6c1fce1daf58fc7199fa0c37b236a Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 10 Apr 2008 16:59:10 +0000 Subject: [PATCH] Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@646870 13f79535-47bb-0310-9956-ffa450edef68 --- build.xml | 8 +++++ src/documentation/content/xdocs/changes.xml | 1 + src/documentation/content/xdocs/status.xml | 1 + src/java/org/apache/poi/POIDocument.java | 14 +++++++- .../poi/hssf/usermodel/HSSFWorkbook.java | 3 +- .../poi/poifs/filesystem/DirectoryNode.java | 25 ++++++++++++++ .../poi/poifs/filesystem/POIFSFileSystem.java | 13 ++----- .../src/org/apache/poi/hdgf/HDGFDiagram.java | 2 +- .../org/apache/poi/hslf/HSLFSlideShow.java | 2 +- .../src/org/apache/poi/hwpf/HWPFDocument.java | 34 ++++++++++++++----- .../poi/hwpf/extractor/TestWordExtractor.java | 27 +++++++++++++++ 11 files changed, 107 insertions(+), 23 deletions(-) diff --git a/build.xml b/build.xml index b4e10a0085..4201e8155e 100644 --- a/build.xml +++ b/build.xml @@ -521,6 +521,8 @@ under the License. file="${main.src.test}/org/apache/poi/hwpf/data"/> + @@ -556,6 +558,8 @@ under the License. file="${main.src.test}/org/apache/poi/hpsf/data"/> + @@ -585,6 +589,7 @@ under the License. + @@ -601,6 +606,7 @@ under the License. + @@ -639,6 +645,7 @@ under the License. + @@ -673,6 +680,7 @@ under the License. 
+ diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index c3b1f72e84..2bfb46d0e2 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document Initial support for getting and changing chart and series titles Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it 44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord. diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 2ceb4ea9e5..9d50f53edb 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document Initial support for getting and changing chart and series titles Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it 44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord. 
diff --git a/src/java/org/apache/poi/POIDocument.java b/src/java/org/apache/poi/POIDocument.java index 075fa45381..01e50231ce 100644 --- a/src/java/org/apache/poi/POIDocument.java +++ b/src/java/org/apache/poi/POIDocument.java @@ -29,6 +29,7 @@ import org.apache.poi.hpsf.PropertySet; import org.apache.poi.hpsf.PropertySetFactory; import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; @@ -50,12 +51,23 @@ public abstract class POIDocument { protected DocumentSummaryInformation dsInf; /** The open POIFS FileSystem that contains our document */ protected POIFSFileSystem filesystem; + /** The directory that our document lives in */ + protected DirectoryNode directory; /** For our own logging use */ protected POILogger logger = POILogFactory.getLogger(this.getClass()); /* Have the property streams been read yet? 
(Only done on-demand) */ protected boolean initialized = false; + + + protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) { + this.filesystem = fs; + this.directory = dir; + } + protected POIDocument(POIFSFileSystem fs) { + this(fs.getRoot(), fs); + } /** * Fetch the Document Summary Information of the document @@ -110,7 +122,7 @@ public abstract class POIDocument { DocumentInputStream dis; try { // Find the entry, and get an input stream for it - dis = filesystem.createDocumentInputStream(setName); + dis = directory.createDocumentInputStream(setName); } catch(IOException ie) { // Oh well, doesn't exist logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie); diff --git a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java index 3838e634d9..1eb4f7c8cf 100644 --- a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java +++ b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java @@ -139,6 +139,7 @@ public class HSSFWorkbook extends POIDocument protected HSSFWorkbook( Workbook book ) { + super(null, null); workbook = book; sheets = new ArrayList( INITIAL_CAPACITY ); names = new ArrayList( INITIAL_CAPACITY ); @@ -164,8 +165,8 @@ public class HSSFWorkbook extends POIDocument public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes) throws IOException { + super(fs); this.preserveNodes = preserveNodes; - this.filesystem = fs; // If we're not preserving nodes, don't track the // POIFS any more diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java index cb8039033d..6805e5197b 100644 --- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java +++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java @@ -105,6 +105,31 @@ public class DirectoryNode { return _path; } + + /** + * open a document in the directory's entry's list of entries + * + * @param documentName the 
name of the document to be opened + * + * @return a newly opened DocumentInputStream + * + * @exception IOException if the document does not exist or the + * name is that of a DirectoryEntry + */ + + public DocumentInputStream createDocumentInputStream( + final String documentName) + throws IOException + { + Entry document = getEntry(documentName); + + if (!document.isDocumentEntry()) + { + throw new IOException("Entry '" + documentName + + "' is not a DocumentEntry"); + } + return new DocumentInputStream(( DocumentEntry ) document); + } /** * create a new DocumentEntry diff --git a/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java b/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java index 61774dc676..7c693a5de8 100644 --- a/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java +++ b/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java @@ -287,7 +287,7 @@ public class POIFSFileSystem { return getRoot().createDirectory(name); } - + /** * Write the filesystem out * @@ -422,7 +422,7 @@ public class POIFSFileSystem * @return the root entry */ - public DirectoryEntry getRoot() + public DirectoryNode getRoot() { if (_root == null) { @@ -446,14 +446,7 @@ public class POIFSFileSystem final String documentName) throws IOException { - Entry document = getRoot().getEntry(documentName); - - if (!document.isDocumentEntry()) - { - throw new IOException("Entry '" + documentName - + "' is not a DocumentEntry"); - } - return new DocumentInputStream(( DocumentEntry ) document); + return getRoot().createDocumentInputStream(documentName); } /** diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java index 955cbc5ab4..af66163072 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java @@ -53,7 +53,7 @@ public class HDGFDiagram extends POIDocument { private PointerFactory ptrFactory; public 
HDGFDiagram(POIFSFileSystem fs) throws IOException { - filesystem = fs; + super(fs); DocumentEntry docProps = (DocumentEntry)filesystem.getRoot().getEntry("VisioDocument"); diff --git a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java index 12afcc49f6..2c523c70a6 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java +++ b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java @@ -124,7 +124,7 @@ public class HSLFSlideShow extends POIDocument */ public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException { - this.filesystem = filesystem; + super(filesystem); // First up, grab the "Current User" stream // We need this before we can detect Encrypted Documents diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 557060aa50..a54e50de43 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -29,6 +29,7 @@ import java.io.ByteArrayInputStream; import java.util.Iterator; import org.apache.poi.POIDocument; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.common.POIFSConstants; @@ -95,7 +96,7 @@ public class HWPFDocument extends POIDocument protected HWPFDocument() { - + super(null, null); } /** @@ -132,7 +133,7 @@ public class HWPFDocument extends POIDocument //do Ole stuff this( verifyAndBuildPOIFS(istream) ); } - + /** * This constructor loads a Word document from a POIFSFileSystem * @@ -141,16 +142,31 @@ public class HWPFDocument extends POIDocument * in POIFSFileSystem. 
*/ public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException + { + this(pfilesystem.getRoot(), pfilesystem); + } + + /** + * This constructor loads a Word document from a specific point + * in a POIFSFileSystem, probably not the default. + * Typically used to open embedded documents. + * + * @param pfilesystem The POIFSFileSystem that contains the Word document. + * @throws IOException If there is an unexpected IOException from the passed + * in POIFSFileSystem. + */ + public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException { // Sort out the hpsf properties - filesystem = pfilesystem; + super(directory, pfilesystem); readProperties(); // read in the main stream. - DocumentEntry documentProps = - (DocumentEntry)filesystem.getRoot().getEntry("WordDocument"); + DocumentEntry documentProps = (DocumentEntry) + directory.getEntry("WordDocument"); _mainStream = new byte[documentProps.getSize()]; - filesystem.createDocumentInputStream("WordDocument").read(_mainStream); + + directory.createDocumentInputStream("WordDocument").read(_mainStream); // use the fib to determine the name of the table stream. _fib = new FileInformationBlock(_mainStream); @@ -165,14 +181,14 @@ public class HWPFDocument extends POIDocument DocumentEntry tableProps; try { tableProps = - (DocumentEntry)filesystem.getRoot().getEntry(name); + (DocumentEntry)directory.getEntry(name); } catch(FileNotFoundException fnfe) { throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)"); } // read in the table stream. 
_tableStream = new byte[tableProps.getSize()]; - filesystem.createDocumentInputStream(name).read(_tableStream); + directory.createDocumentInputStream(name).read(_tableStream); _fib.fillVariableFields(_mainStream, _tableStream); @@ -180,7 +196,7 @@ public class HWPFDocument extends POIDocument try { DocumentEntry dataProps = - (DocumentEntry) filesystem.getRoot().getEntry("Data"); + (DocumentEntry)directory.getEntry("Data"); _dataStream = new byte[dataProps.getSize()]; filesystem.createDocumentInputStream("Data").read(_dataStream); } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index cda33675f2..c78ccfa323 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -23,6 +23,8 @@ import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import junit.framework.TestCase; @@ -54,12 +56,16 @@ public class TestWordExtractor extends TestCase { private WordExtractor extractor; // Corrupted document - can't do paragraph based stuff private WordExtractor extractor2; + // A Word doc embedded in an Excel file + private String filename3; protected void setUp() throws Exception { String dirname = System.getProperty("HWPF.testdata.path"); + String pdirname = System.getProperty("POIFS.testdata.path"); String filename = dirname + "/test2.doc"; String filename2 = dirname + "/test.doc"; + filename3 = pdirname + "/excel_with_embeded.xls"; extractor = new WordExtractor(new FileInputStream(filename)); extractor2 = new WordExtractor(new FileInputStream(filename2)); @@ -101,4 +107,25 @@ public class TestWordExtractor 
extends TestCase { String text = extractor.getTextFromPieces(); assertEquals(p_text1_block, text); } + + + /** + * Test that we can get data from an + * embedded Word document + * @throws Exception + */ + public void testExtractFromEmbeded() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3)); + DirectoryNode dir = (DirectoryNode) + fs.getRoot().getEntry("MBD03F25D8D"); + // Should have WordDocument and 1Table + assertNotNull(dir.getEntry("1Table")); + assertNotNull(dir.getEntry("WordDocument")); + + HWPFDocument doc = new HWPFDocument(dir, fs); + WordExtractor extractor3 = new WordExtractor(doc); + + assertNotNull(extractor3.getText()); + assertTrue(extractor3.getText().length() > 20); + } }