mirror of https://github.com/apache/poi.git
Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@646870 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
636e3df7cf
commit
450c9754f3
|
@ -521,6 +521,8 @@ under the License.
|
|||
file="${main.src.test}/org/apache/poi/hwpf/data"/>
|
||||
<sysproperty key="HPSF.testdata.path"
|
||||
file="${main.src.test}/org/apache/poi/hpsf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path"
|
||||
file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain"/>
|
||||
<formatter type="xml"/>
|
||||
|
@ -556,6 +558,8 @@ under the License.
|
|||
file="${main.src.test}/org/apache/poi/hpsf/data"/>
|
||||
<sysproperty key="HWPF.testdata.path"
|
||||
file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path"
|
||||
file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain" usefile="no"/>
|
||||
<batchtest todir="${main.reports.test}">
|
||||
|
@ -585,6 +589,7 @@ under the License.
|
|||
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
|
||||
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
|
||||
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain" usefile="no"/>
|
||||
<formatter type="xml"/>
|
||||
|
@ -601,6 +606,7 @@ under the License.
|
|||
<classpath refid="test.classpath"/>
|
||||
<sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/>
|
||||
<sysproperty key="HPSF.testdata.path" file="${main.src.test}/org/apache/poi/hpsf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain" usefile="no"/>
|
||||
<test name="${testcase}"/>
|
||||
|
@ -639,6 +645,7 @@ under the License.
|
|||
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
|
||||
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
|
||||
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain"/>
|
||||
<formatter type="xml"/>
|
||||
|
@ -673,6 +680,7 @@ under the License.
|
|||
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
|
||||
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
|
||||
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
|
||||
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<sysproperty key="java.awt.headless" value="true"/>
|
||||
<formatter type="plain" usefile="no"/>
|
||||
|
|
|
@ -37,6 +37,7 @@
|
|||
|
||||
<!-- Don't forget to update status.xml too! -->
|
||||
<release version="3.0.3-beta1" date="2008-04-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
<!-- Don't forget to update changes.xml too! -->
|
||||
<changes>
|
||||
<release version="3.0.3-beta1" date="2008-04-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded Word document</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.poi.hpsf.PropertySet;
|
|||
import org.apache.poi.hpsf.PropertySetFactory;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
|
@ -50,12 +51,23 @@ public abstract class POIDocument {
|
|||
protected DocumentSummaryInformation dsInf;
|
||||
/** The open POIFS FileSystem that contains our document */
|
||||
protected POIFSFileSystem filesystem;
|
||||
/** The directory that our document lives in */
|
||||
protected DirectoryNode directory;
|
||||
|
||||
/** For our own logging use */
|
||||
protected POILogger logger = POILogFactory.getLogger(this.getClass());
|
||||
|
||||
/* Have the property streams been read yet? (Only done on-demand) */
|
||||
protected boolean initialized = false;
|
||||
|
||||
|
||||
protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) {
|
||||
this.filesystem = fs;
|
||||
this.directory = dir;
|
||||
}
|
||||
protected POIDocument(POIFSFileSystem fs) {
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch the Document Summary Information of the document
|
||||
|
@ -110,7 +122,7 @@ public abstract class POIDocument {
|
|||
DocumentInputStream dis;
|
||||
try {
|
||||
// Find the entry, and get an input stream for it
|
||||
dis = filesystem.createDocumentInputStream(setName);
|
||||
dis = directory.createDocumentInputStream(setName);
|
||||
} catch(IOException ie) {
|
||||
// Oh well, doesn't exist
|
||||
logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie);
|
||||
|
|
|
@ -139,6 +139,7 @@ public class HSSFWorkbook extends POIDocument
|
|||
|
||||
protected HSSFWorkbook( Workbook book )
|
||||
{
|
||||
super(null, null);
|
||||
workbook = book;
|
||||
sheets = new ArrayList( INITIAL_CAPACITY );
|
||||
names = new ArrayList( INITIAL_CAPACITY );
|
||||
|
@ -164,8 +165,8 @@ public class HSSFWorkbook extends POIDocument
|
|||
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
|
||||
throws IOException
|
||||
{
|
||||
super(fs);
|
||||
this.preserveNodes = preserveNodes;
|
||||
this.filesystem = fs;
|
||||
|
||||
// If we're not preserving nodes, don't track the
|
||||
// POIFS any more
|
||||
|
|
|
@ -105,6 +105,31 @@ public class DirectoryNode
|
|||
{
|
||||
return _path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a document from this directory's list of entries
|
||||
*
|
||||
* @param documentName the name of the document to be opened
|
||||
*
|
||||
* @return a newly opened DocumentInputStream
|
||||
*
|
||||
* @exception IOException if the document does not exist or the
|
||||
* name is that of a DirectoryEntry
|
||||
*/
|
||||
|
||||
public DocumentInputStream createDocumentInputStream(
|
||||
final String documentName)
|
||||
throws IOException
|
||||
{
|
||||
Entry document = getEntry(documentName);
|
||||
|
||||
if (!document.isDocumentEntry())
|
||||
{
|
||||
throw new IOException("Entry '" + documentName
|
||||
+ "' is not a DocumentEntry");
|
||||
}
|
||||
return new DocumentInputStream(( DocumentEntry ) document);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a new DocumentEntry
|
||||
|
|
|
@ -287,7 +287,7 @@ public class POIFSFileSystem
|
|||
{
|
||||
return getRoot().createDirectory(name);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Write the filesystem out
|
||||
*
|
||||
|
@ -422,7 +422,7 @@ public class POIFSFileSystem
|
|||
* @return the root entry
|
||||
*/
|
||||
|
||||
public DirectoryEntry getRoot()
|
||||
public DirectoryNode getRoot()
|
||||
{
|
||||
if (_root == null)
|
||||
{
|
||||
|
@ -446,14 +446,7 @@ public class POIFSFileSystem
|
|||
final String documentName)
|
||||
throws IOException
|
||||
{
|
||||
Entry document = getRoot().getEntry(documentName);
|
||||
|
||||
if (!document.isDocumentEntry())
|
||||
{
|
||||
throw new IOException("Entry '" + documentName
|
||||
+ "' is not a DocumentEntry");
|
||||
}
|
||||
return new DocumentInputStream(( DocumentEntry ) document);
|
||||
return getRoot().createDocumentInputStream(documentName);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -53,7 +53,7 @@ public class HDGFDiagram extends POIDocument {
|
|||
private PointerFactory ptrFactory;
|
||||
|
||||
public HDGFDiagram(POIFSFileSystem fs) throws IOException {
|
||||
filesystem = fs;
|
||||
super(fs);
|
||||
|
||||
DocumentEntry docProps =
|
||||
(DocumentEntry)filesystem.getRoot().getEntry("VisioDocument");
|
||||
|
|
|
@ -124,7 +124,7 @@ public class HSLFSlideShow extends POIDocument
|
|||
*/
|
||||
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
|
||||
{
|
||||
this.filesystem = filesystem;
|
||||
super(filesystem);
|
||||
|
||||
// First up, grab the "Current User" stream
|
||||
// We need this before we can detect Encrypted Documents
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.io.ByteArrayInputStream;
|
|||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
|
@ -95,7 +96,7 @@ public class HWPFDocument extends POIDocument
|
|||
|
||||
protected HWPFDocument()
|
||||
{
|
||||
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -132,7 +133,7 @@ public class HWPFDocument extends POIDocument
|
|||
//do Ole stuff
|
||||
this( verifyAndBuildPOIFS(istream) );
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This constructor loads a Word document from a POIFSFileSystem
|
||||
*
|
||||
|
@ -141,16 +142,31 @@ public class HWPFDocument extends POIDocument
|
|||
* in POIFSFileSystem.
|
||||
*/
|
||||
public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
this(pfilesystem.getRoot(), pfilesystem);
|
||||
}
|
||||
|
||||
/**
|
||||
* This constructor loads a Word document from a specific point
|
||||
* in a POIFSFileSystem, probably not the default.
|
||||
* Typically used to open embedded documents.
|
||||
*
|
||||
* @param directory The directory within the POIFSFileSystem that contains the Word document.
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||
* @throws IOException If there is an unexpected IOException from the passed
|
||||
* in POIFSFileSystem.
|
||||
*/
|
||||
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
// Sort out the hpsf properties
|
||||
filesystem = pfilesystem;
|
||||
super(directory, pfilesystem);
|
||||
readProperties();
|
||||
|
||||
// read in the main stream.
|
||||
DocumentEntry documentProps =
|
||||
(DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
|
||||
DocumentEntry documentProps = (DocumentEntry)
|
||||
directory.getEntry("WordDocument");
|
||||
_mainStream = new byte[documentProps.getSize()];
|
||||
filesystem.createDocumentInputStream("WordDocument").read(_mainStream);
|
||||
|
||||
directory.createDocumentInputStream("WordDocument").read(_mainStream);
|
||||
|
||||
// use the fib to determine the name of the table stream.
|
||||
_fib = new FileInformationBlock(_mainStream);
|
||||
|
@ -165,14 +181,14 @@ public class HWPFDocument extends POIDocument
|
|||
DocumentEntry tableProps;
|
||||
try {
|
||||
tableProps =
|
||||
(DocumentEntry)filesystem.getRoot().getEntry(name);
|
||||
(DocumentEntry)directory.getEntry(name);
|
||||
} catch(FileNotFoundException fnfe) {
|
||||
throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
|
||||
}
|
||||
|
||||
// read in the table stream.
|
||||
_tableStream = new byte[tableProps.getSize()];
|
||||
filesystem.createDocumentInputStream(name).read(_tableStream);
|
||||
directory.createDocumentInputStream(name).read(_tableStream);
|
||||
|
||||
_fib.fillVariableFields(_mainStream, _tableStream);
|
||||
|
||||
|
@ -180,7 +196,7 @@ public class HWPFDocument extends POIDocument
|
|||
try
|
||||
{
|
||||
DocumentEntry dataProps =
|
||||
(DocumentEntry) filesystem.getRoot().getEntry("Data");
|
||||
(DocumentEntry)directory.getEntry("Data");
|
||||
_dataStream = new byte[dataProps.getSize()];
|
||||
filesystem.createDocumentInputStream("Data").read(_dataStream);
|
||||
}
|
||||
|
|
|
@ -23,6 +23,8 @@ import org.apache.poi.hwpf.HWPFDocument;
|
|||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -54,12 +56,16 @@ public class TestWordExtractor extends TestCase {
|
|||
private WordExtractor extractor;
|
||||
// Corrupted document - can't do paragraph based stuff
|
||||
private WordExtractor extractor2;
|
||||
// A Word doc embedded in an Excel file
|
||||
private String filename3;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
String pdirname = System.getProperty("POIFS.testdata.path");
|
||||
|
||||
String filename = dirname + "/test2.doc";
|
||||
String filename2 = dirname + "/test.doc";
|
||||
filename3 = pdirname + "/excel_with_embeded.xls";
|
||||
extractor = new WordExtractor(new FileInputStream(filename));
|
||||
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
||||
|
||||
|
@ -101,4 +107,25 @@ public class TestWordExtractor extends TestCase {
|
|||
String text = extractor.getTextFromPieces();
|
||||
assertEquals(p_text1_block, text);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test that we can get data from an
|
||||
* embedded Word document
|
||||
* @throws Exception
|
||||
*/
|
||||
public void testExtractFromEmbeded() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
|
||||
DirectoryNode dir = (DirectoryNode)
|
||||
fs.getRoot().getEntry("MBD03F25D8D");
|
||||
// Should have WordDocument and 1Table
|
||||
assertNotNull(dir.getEntry("1Table"));
|
||||
assertNotNull(dir.getEntry("WordDocument"));
|
||||
|
||||
HWPFDocument doc = new HWPFDocument(dir, fs);
|
||||
WordExtractor extractor3 = new WordExtractor(doc);
|
||||
|
||||
assertNotNull(extractor3.getText());
|
||||
assertTrue(extractor3.getText().length() > 20);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue