Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@646870 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-10 16:59:10 +00:00
parent 636e3df7cf
commit 450c9754f3
11 changed files with 107 additions and 23 deletions

View File

@ -521,6 +521,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hwpf/data"/> file="${main.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HPSF.testdata.path" <sysproperty key="HPSF.testdata.path"
file="${main.src.test}/org/apache/poi/hpsf/data"/> file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="POIFS.testdata.path"
file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/> <formatter type="plain"/>
<formatter type="xml"/> <formatter type="xml"/>
@ -556,6 +558,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hpsf/data"/> file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="HWPF.testdata.path" <sysproperty key="HWPF.testdata.path"
file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/> file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="POIFS.testdata.path"
file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/> <formatter type="plain" usefile="no"/>
<batchtest todir="${main.reports.test}"> <batchtest todir="${main.reports.test}">
@ -585,6 +589,7 @@ under the License.
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/> <sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/> <sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/> <sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/> <formatter type="plain" usefile="no"/>
<formatter type="xml"/> <formatter type="xml"/>
@ -601,6 +606,7 @@ under the License.
<classpath refid="test.classpath"/> <classpath refid="test.classpath"/>
<sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/> <sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/>
<sysproperty key="HPSF.testdata.path" file="${main.src.test}/org/apache/poi/hpsf/data"/> <sysproperty key="HPSF.testdata.path" file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/> <formatter type="plain" usefile="no"/>
<test name="${testcase}"/> <test name="${testcase}"/>
@ -639,6 +645,7 @@ under the License.
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/> <sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/> <sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/> <sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/> <formatter type="plain"/>
<formatter type="xml"/> <formatter type="xml"/>
@ -673,6 +680,7 @@ under the License.
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/> <sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/> <sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/> <sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/> <formatter type="plain" usefile="no"/>

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.0.3-beta1" date="2008-04-??"> <release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action> <action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action> <action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action> <action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.0.3-beta1" date="2008-04-??"> <release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action> <action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action> <action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action> <action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>

View File

@ -29,6 +29,7 @@ import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory; import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Entry;
@ -50,6 +51,8 @@ public abstract class POIDocument {
protected DocumentSummaryInformation dsInf; protected DocumentSummaryInformation dsInf;
/** The open POIFS FileSystem that contains our document */ /** The open POIFS FileSystem that contains our document */
protected POIFSFileSystem filesystem; protected POIFSFileSystem filesystem;
/** The directory that our document lives in */
protected DirectoryNode directory;
/** For our own logging use */ /** For our own logging use */
protected POILogger logger = POILogFactory.getLogger(this.getClass()); protected POILogger logger = POILogFactory.getLogger(this.getClass());
@ -57,6 +60,15 @@ public abstract class POIDocument {
/* Have the property streams been read yet? (Only done on-demand) */ /* Have the property streams been read yet? (Only done on-demand) */
protected boolean initialized = false; protected boolean initialized = false;
/**
 * Creates a document backed by the given directory of the given
 * POIFS filesystem.
 *
 * @param dir the directory that the document lives in
 * @param fs the POIFS filesystem that contains the document
 */
protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) {
    this.directory = dir;
    this.filesystem = fs;
}

/**
 * Creates a document that lives in the root directory of the given
 * POIFS filesystem.
 *
 * @param fs the POIFS filesystem that contains the document
 */
protected POIDocument(POIFSFileSystem fs) {
    this(fs.getRoot(), fs);
}
/** /**
* Fetch the Document Summary Information of the document * Fetch the Document Summary Information of the document
*/ */
@ -110,7 +122,7 @@ public abstract class POIDocument {
DocumentInputStream dis; DocumentInputStream dis;
try { try {
// Find the entry, and get an input stream for it // Find the entry, and get an input stream for it
dis = filesystem.createDocumentInputStream(setName); dis = directory.createDocumentInputStream(setName);
} catch(IOException ie) { } catch(IOException ie) {
// Oh well, doesn't exist // Oh well, doesn't exist
logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie); logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie);

View File

@ -139,6 +139,7 @@ public class HSSFWorkbook extends POIDocument
protected HSSFWorkbook( Workbook book ) protected HSSFWorkbook( Workbook book )
{ {
super(null, null);
workbook = book; workbook = book;
sheets = new ArrayList( INITIAL_CAPACITY ); sheets = new ArrayList( INITIAL_CAPACITY );
names = new ArrayList( INITIAL_CAPACITY ); names = new ArrayList( INITIAL_CAPACITY );
@ -164,8 +165,8 @@ public class HSSFWorkbook extends POIDocument
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes) public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException throws IOException
{ {
super(fs);
this.preserveNodes = preserveNodes; this.preserveNodes = preserveNodes;
this.filesystem = fs;
// If we're not preserving nodes, don't track the // If we're not preserving nodes, don't track the
// POIFS any more // POIFS any more

View File

@ -106,6 +106,31 @@ public class DirectoryNode
return _path; return _path;
} }
/**
 * Opens the named document, looked up among this directory's
 * entries, for reading.
 *
 * @param documentName the name of the document to be opened
 *
 * @return a newly opened DocumentInputStream
 *
 * @exception IOException if the document does not exist or the
 *            name refers to an entry that is not a document
 */
public DocumentInputStream createDocumentInputStream(
        final String documentName)
    throws IOException
{
    final Entry entry = getEntry(documentName);

    // Only document entries can be streamed; anything else is rejected
    if (entry.isDocumentEntry())
    {
        return new DocumentInputStream(( DocumentEntry ) entry);
    }
    throw new IOException("Entry '" + documentName
                          + "' is not a DocumentEntry");
}
/** /**
* create a new DocumentEntry * create a new DocumentEntry
* *

View File

@ -422,7 +422,7 @@ public class POIFSFileSystem
* @return the root entry * @return the root entry
*/ */
public DirectoryEntry getRoot() public DirectoryNode getRoot()
{ {
if (_root == null) if (_root == null)
{ {
@ -446,14 +446,7 @@ public class POIFSFileSystem
final String documentName) final String documentName)
throws IOException throws IOException
{ {
Entry document = getRoot().getEntry(documentName); return getRoot().createDocumentInputStream(documentName);
if (!document.isDocumentEntry())
{
throw new IOException("Entry '" + documentName
+ "' is not a DocumentEntry");
}
return new DocumentInputStream(( DocumentEntry ) document);
} }
/** /**

View File

@ -53,7 +53,7 @@ public class HDGFDiagram extends POIDocument {
private PointerFactory ptrFactory; private PointerFactory ptrFactory;
public HDGFDiagram(POIFSFileSystem fs) throws IOException { public HDGFDiagram(POIFSFileSystem fs) throws IOException {
filesystem = fs; super(fs);
DocumentEntry docProps = DocumentEntry docProps =
(DocumentEntry)filesystem.getRoot().getEntry("VisioDocument"); (DocumentEntry)filesystem.getRoot().getEntry("VisioDocument");

View File

@ -124,7 +124,7 @@ public class HSLFSlideShow extends POIDocument
*/ */
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{ {
this.filesystem = filesystem; super(filesystem);
// First up, grab the "Current User" stream // First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents // We need this before we can detect Encrypted Documents

View File

@ -29,6 +29,7 @@ import java.io.ByteArrayInputStream;
import java.util.Iterator; import java.util.Iterator;
import org.apache.poi.POIDocument; import org.apache.poi.POIDocument;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.common.POIFSConstants;
@ -95,7 +96,7 @@ public class HWPFDocument extends POIDocument
protected HWPFDocument() protected HWPFDocument()
{ {
super(null, null);
} }
/** /**
@ -141,16 +142,31 @@ public class HWPFDocument extends POIDocument
* in POIFSFileSystem. * in POIFSFileSystem.
*/ */
public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException
{
this(pfilesystem.getRoot(), pfilesystem);
}
/**
* This constructor loads a Word document from a specific point
* in a POIFSFileSystem, probably not the default.
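* Typically used to open embedded documents.
*
* @param directory The directory within the POIFSFileSystem that holds the Word document's streams.
* @param pfilesystem The POIFSFileSystem that contains the Word document.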
* @throws IOException If there is an unexpected IOException from the passed
* in POIFSFileSystem.
*/
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{ {
// Sort out the hpsf properties // Sort out the hpsf properties
filesystem = pfilesystem; super(directory, pfilesystem);
readProperties(); readProperties();
// read in the main stream. // read in the main stream.
DocumentEntry documentProps = DocumentEntry documentProps = (DocumentEntry)
(DocumentEntry)filesystem.getRoot().getEntry("WordDocument"); directory.getEntry("WordDocument");
_mainStream = new byte[documentProps.getSize()]; _mainStream = new byte[documentProps.getSize()];
filesystem.createDocumentInputStream("WordDocument").read(_mainStream);
directory.createDocumentInputStream("WordDocument").read(_mainStream);
// use the fib to determine the name of the table stream. // use the fib to determine the name of the table stream.
_fib = new FileInformationBlock(_mainStream); _fib = new FileInformationBlock(_mainStream);
@ -165,14 +181,14 @@ public class HWPFDocument extends POIDocument
DocumentEntry tableProps; DocumentEntry tableProps;
try { try {
tableProps = tableProps =
(DocumentEntry)filesystem.getRoot().getEntry(name); (DocumentEntry)directory.getEntry(name);
} catch(FileNotFoundException fnfe) { } catch(FileNotFoundException fnfe) {
throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)"); throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
} }
// read in the table stream. // read in the table stream.
_tableStream = new byte[tableProps.getSize()]; _tableStream = new byte[tableProps.getSize()];
filesystem.createDocumentInputStream(name).read(_tableStream); directory.createDocumentInputStream(name).read(_tableStream);
_fib.fillVariableFields(_mainStream, _tableStream); _fib.fillVariableFields(_mainStream, _tableStream);
@ -180,7 +196,7 @@ public class HWPFDocument extends POIDocument
try try
{ {
DocumentEntry dataProps = DocumentEntry dataProps =
(DocumentEntry) filesystem.getRoot().getEntry("Data"); (DocumentEntry)directory.getEntry("Data");
_dataStream = new byte[dataProps.getSize()]; _dataStream = new byte[dataProps.getSize()];
filesystem.createDocumentInputStream("Data").read(_dataStream); filesystem.createDocumentInputStream("Data").read(_dataStream);
} }

View File

@ -23,6 +23,8 @@ import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase; import junit.framework.TestCase;
@ -54,12 +56,16 @@ public class TestWordExtractor extends TestCase {
private WordExtractor extractor; private WordExtractor extractor;
// Corrupted document - can't do paragraph based stuff // Corrupted document - can't do paragraph based stuff
private WordExtractor extractor2; private WordExtractor extractor2;
// A Word doc embedded in an Excel file
private String filename3;
protected void setUp() throws Exception { protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path"); String dirname = System.getProperty("HWPF.testdata.path");
String pdirname = System.getProperty("POIFS.testdata.path");
String filename = dirname + "/test2.doc"; String filename = dirname + "/test2.doc";
String filename2 = dirname + "/test.doc"; String filename2 = dirname + "/test.doc";
filename3 = pdirname + "/excel_with_embeded.xls";
extractor = new WordExtractor(new FileInputStream(filename)); extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2)); extractor2 = new WordExtractor(new FileInputStream(filename2));
@ -101,4 +107,25 @@ public class TestWordExtractor extends TestCase {
String text = extractor.getTextFromPieces(); String text = extractor.getTextFromPieces();
assertEquals(p_text1_block, text); assertEquals(p_text1_block, text);
} }
/**
 * Test that we can get data from a Word document
 *  that is embedded inside another file (here, the
 *  workbook named by filename3).
 * @throws Exception
 */
public void testExtractFromEmbeded() throws Exception {
    FileInputStream in = new FileInputStream(filename3);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        DirectoryNode dir = (DirectoryNode)
            fs.getRoot().getEntry("MBD03F25D8D");

        // Should have WordDocument and 1Table
        assertNotNull(dir.getEntry("1Table"));
        assertNotNull(dir.getEntry("WordDocument"));

        // Open the document from the sub-directory, not the root
        HWPFDocument doc = new HWPFDocument(dir, fs);
        WordExtractor extractor3 = new WordExtractor(doc);

        // Fetch the text once and check it looks sensible
        String text = extractor3.getText();
        assertNotNull(text);
        assertTrue(text.length() > 20);
    } finally {
        // Always release the file handle, even if an assertion fails
        in.close();
    }
}
} }