Basic text extraction support for old Word 6 and Word 95 documents via some HWPF extensions

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-06-30 15:13:10 +00:00
parent 7ae1a20f07
commit 999aecbaa1
11 changed files with 531 additions and 72 deletions

View File

@ -17,26 +17,43 @@
package org.apache.poi.hwpf; package org.apache.poi.hwpf;
import java.io.InputStream; import java.io.ByteArrayInputStream;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PushbackInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.ByteArrayInputStream;
import java.util.Iterator; import java.util.Iterator;
import org.apache.poi.EncryptedDocumentException; import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.POIDocument; import org.apache.poi.hwpf.model.CPSplitCalculator;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.hwpf.model.DocumentProperties;
import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.hwpf.model.EscherRecordHolder;
import org.apache.poi.hwpf.model.FSPATable;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.GenericPropertyNode;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.model.PAPBinTable;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.PlexOfCps;
import org.apache.poi.hwpf.model.PropertyNode;
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.SectionTable;
import org.apache.poi.hwpf.model.ShapesTable;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.hwpf.usermodel.HWPFList;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.hwpf.model.*; import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.hwpf.model.io.*; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hwpf.usermodel.*;
/** /**
@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*;
* *
* @author Ryan Ackley * @author Ryan Ackley
*/ */
public final class HWPFDocument extends POIDocument public final class HWPFDocument extends HWPFDocumentCore
// implements Cloneable
{ {
/** The FIB */
protected FileInformationBlock _fib;
/** And for making sense of CP lengths in the FIB */ /** And for making sense of CP lengths in the FIB */
protected CPSplitCalculator _cpSplit; protected CPSplitCalculator _cpSplit;
/** main document stream buffer*/
protected byte[] _mainStream;
/** table stream buffer*/ /** table stream buffer*/
protected byte[] _tableStream; protected byte[] _tableStream;
@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument
protected HWPFDocument() protected HWPFDocument()
{ {
super(null, null); super();
}
/**
* Takens an InputStream, verifies that it's not RTF, builds a
* POIFSFileSystem from it, and returns that.
*/
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
// Open a PushbackInputStream, so we can peek at the first few bytes
PushbackInputStream pis = new PushbackInputStream(istream,6);
byte[] first6 = new byte[6];
pis.read(first6);
// Does it start with {\rtf ? If so, it's really RTF
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
&& first6[3] == 't' && first6[4] == 'f') {
throw new IllegalArgumentException("The document is really a RTF file");
}
// OK, so it's not RTF
// Open a POIFSFileSystem on the (pushed back) stream
pis.unread(first6);
return new POIFSFileSystem(pis);
} }
/** /**
@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument
*/ */
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{ {
// Sort out the hpsf properties // Load the main stream and FIB
// Also handles HPSF bits
super(directory, pfilesystem); super(directory, pfilesystem);
// read in the main stream. // Do the CP Split
DocumentEntry documentProps = (DocumentEntry)
directory.getEntry("WordDocument");
_mainStream = new byte[documentProps.getSize()];
directory.createDocumentInputStream("WordDocument").read(_mainStream);
// Create our FIB, and check for the doc being encrypted
_fib = new FileInformationBlock(_mainStream);
_cpSplit = new CPSplitCalculator(_fib); _cpSplit = new CPSplitCalculator(_fib);
if(_fib.isFEncrypted()) {
throw new EncryptedDocumentException("Cannot process encrypted word files!"); // Is this document too old for us?
if(_fib.getNFib() < 106) {
throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
} }
// use the fib to determine the name of the table stream. // use the fib to determine the name of the table stream.
@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument
t.printStackTrace(); t.printStackTrace();
} }
} }
// public Object clone()
// throws CloneNotSupportedException
// {
// _tpt;
//
// _cbt;
//
// _pbt;
//
// _st;
//
// }
} }

View File

@ -0,0 +1,130 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDocument;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * This class holds much of the core of a Word document, but
 *  without some of the table structure information.
 * It loads the "WordDocument" stream into memory and builds the
 *  {@link FileInformationBlock} from it.
 * You generally want to work with one of
 *  {@link HWPFDocument} or {@link HWPFOldDocument}
 */
public abstract class HWPFDocumentCore extends POIDocument
{
    /** Name of the POIFS entry holding the main document stream */
    private static final String WORD_DOCUMENT_ENTRY = "WordDocument";

    /** How many leading bytes we peek at when sniffing for RTF */
    private static final int RTF_SNIFF_LENGTH = 6;

    /** The FIB */
    protected FileInformationBlock _fib;

    /** main document stream buffer*/
    protected byte[] _mainStream;

    /**
     * Creates an empty core document, which is not backed by
     *  any directory or filesystem.
     */
    protected HWPFDocumentCore()
    {
        super(null, null);
    }

    /**
     * Takes an InputStream, verifies that it's not RTF, builds a
     *  POIFSFileSystem from it, and returns that.
     *
     * @param istream The InputStream to check and then read from.
     * @return A POIFSFileSystem built from the (un-consumed) stream.
     * @throws IOException If reading from the stream fails.
     * @throws IllegalArgumentException If the stream starts with
     *  <code>{\rtf</code>, ie it is really a RTF file.
     */
    public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
        // Open a PushbackInputStream, so we can peek at the first few bytes
        PushbackInputStream pis = new PushbackInputStream(istream, RTF_SNIFF_LENGTH);
        byte[] first6 = new byte[RTF_SNIFF_LENGTH];

        // A single read() may return fewer bytes than asked for, so
        //  keep track of how many we really got
        int bytesRead = readFully(pis, first6);

        // Does it start with {\rtf ? If so, it's really RTF
        if(bytesRead >= 5
                && first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
                && first6[3] == 't' && first6[4] == 'f') {
            throw new IllegalArgumentException("The document is really a RTF file");
        }

        // OK, so it's not RTF
        // Open a POIFSFileSystem on the (pushed back) stream, taking
        //  care to push back only the bytes which were really read
        if(bytesRead > 0) {
            pis.unread(first6, 0, bytesRead);
        }
        return new POIFSFileSystem(pis);
    }

    /**
     * This constructor loads a Word document from an InputStream.
     *
     * @param istream The InputStream that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in InputStream.
     */
    public HWPFDocumentCore(InputStream istream) throws IOException
    {
        //do Ole stuff
        this( verifyAndBuildPOIFS(istream) );
    }

    /**
     * This constructor loads a Word document from a POIFSFileSystem
     *
     * @param pfilesystem The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in POIFSFileSystem.
     */
    public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
    {
        this(pfilesystem.getRoot(), pfilesystem);
    }

    /**
     * This constructor loads a Word document from a specific point
     *  in a POIFSFileSystem, probably not the default.
     * Used typically to open embedded documents.
     *
     * @param directory The directory that holds the "WordDocument" entry.
     * @param pfilesystem The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in POIFSFileSystem.
     * @throws EncryptedDocumentException If the FIB reports the document
     *         as being encrypted.
     */
    public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
    {
        // Sort out the hpsf properties
        super(directory, pfilesystem);

        // read in the main stream.
        DocumentEntry documentProps = (DocumentEntry)
            directory.getEntry(WORD_DOCUMENT_ENTRY);
        _mainStream = new byte[documentProps.getSize()];

        // Keep reading until the buffer is full (or the stream ends),
        //  and make sure the stream is released afterwards
        InputStream mainIn = directory.createDocumentInputStream(WORD_DOCUMENT_ENTRY);
        try {
            readFully(mainIn, _mainStream);
        } finally {
            mainIn.close();
        }

        // Create our FIB, and check for the doc being encrypted
        _fib = new FileInformationBlock(_mainStream);
        if(_fib.isFEncrypted()) {
            throw new EncryptedDocumentException("Cannot process encrypted word files!");
        }
    }

    /**
     * Returns the File Information Block built from the main stream.
     *
     * @return the FIB of this document
     */
    public FileInformationBlock getFileInformationBlock()
    {
        return _fib;
    }

    /**
     * Reads from the stream until the buffer is full, or the
     *  end of the stream is reached, whichever comes first.
     *
     * @param in The stream to read from
     * @param buffer The buffer to fill
     * @return The number of bytes actually placed in the buffer
     * @throws IOException If the underlying read fails
     */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while(total < buffer.length) {
            int count = in.read(buffer, total, buffer.length - total);
            if(count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }
}

View File

@ -0,0 +1,135 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.OldCHPBinTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
 * Provides very simple support for old (Word 6 / Word 95)
 *  files.
 * Everything is read from the main document stream, and the
 *  result is exposed as a read-only list of text runs.
 * TODO Provide a way to get at the properties associated
 *  with each block of text
 */
public class HWPFOldDocument extends HWPFDocumentCore {
    /** Position in the main stream of the offset of the CHP bin table */
    private static final int CHP_TABLE_OFFSET_POS = 0xb8;
    /** Position in the main stream of the size of the CHP bin table */
    private static final int CHP_TABLE_SIZE_POS = 0xbc;
    /** Position in the main stream of the offset of the complex file table */
    private static final int COMPLEX_TABLE_OFFSET_POS = 0x160;

    /** The text runs of the document, in the order the CHP table gave them */
    private final List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();

    /**
     * Loads an old Word document from the root of a POIFSFileSystem.
     *
     * @param fs The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the
     *         passed in POIFSFileSystem.
     */
    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
        this(fs.getRoot(), fs);
    }

    /**
     * Loads an old Word document from a specific directory
     *  of a POIFSFileSystem.
     *
     * @param directory The directory that holds the "WordDocument" entry.
     * @param fs The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the
     *         passed in POIFSFileSystem.
     */
    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
            throws IOException {
        super(directory, fs);

        // Where are things?
        int chpTableOffset = LittleEndian.getInt(_mainStream, CHP_TABLE_OFFSET_POS);
        int chpTableSize = LittleEndian.getInt(_mainStream, CHP_TABLE_SIZE_POS);
        int complexTableOffset = LittleEndian.getInt(_mainStream, COMPLEX_TABLE_OFFSET_POS);

        // We need to get hold of the text that makes up the
        //  document, which might be regular or fast-saved
        // (Only used locally, so no need for a synchronised buffer)
        StringBuilder text = new StringBuilder();
        TextPieceTable tpt;
        if(_fib.isFComplex()) {
            // The old format keeps everything in the one stream, so the
            //  main stream is passed for both stream arguments
            ComplexFileTable cft = new ComplexFileTable(
                    _mainStream, _mainStream,
                    complexTableOffset, _fib.getFcMin()
            );
            tpt = cft.getTextPieceTable();

            for(TextPiece tp : tpt.getTextPieces()) {
                text.append( tp.getStringBuffer() );
            }
        } else {
            // TODO Build the Piece Descriptor properly
            // TODO Can these old documents ever contain Unicode strings?
            // NOTE(review): the 127 byte presumably flags the piece as
            //  non-unicode text - confirm against PieceDescriptor
            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
            pd.setFilePosition(_fib.getFcMin());

            // The text is the single run of bytes between fcMin and fcMac,
            //  which we wrap up as one big text piece
            tpt = new TextPieceTable();
            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
            TextPiece tp = new TextPiece(
                    0, textData.length, textData, pd, 0
            );
            tpt.getTextPieces().add(tp);

            text.append(tp.getStringBuffer());
        }

        // Now we can fetch the character and paragraph properties
        OldCHPBinTable chpTable = new OldCHPBinTable(
                _mainStream, chpTableOffset, chpTableSize,
                _fib.getFcMin(), tpt
        );

        // Finally build up runs
        for(CHPX chpx : chpTable.getTextRuns()) {
            String str = text.substring(chpx.getStart(), chpx.getEnd());
            contents.add(new TextAndCHPX(str,chpx));
        }
    }

    /**
     * Not supported, the older file formats are read only.
     *
     * @throws IllegalStateException always
     */
    @Override
    public void write(OutputStream out) throws IOException {
        throw new IllegalStateException("Writing is not available for the older file formats");
    }

    /**
     * Retrieves all our text, in order, along with the
     *  CHPX information on each bit.
     * Every entry has the same formatting, but as yet
     *  we've no way to tell what the formatting is...
     * The returned list is a read-only view, as this
     *  document type does not support modification.
     * Warnings - this will change as soon as we support
     *  text formatting!
     *
     * @return an unmodifiable view of the text runs
     */
    public List<TextAndCHPX> getContents() {
        return java.util.Collections.unmodifiableList(contents);
    }

    /**
     * A piece of text, paired with the CHPX that covers it.
     * Warnings - this will change as soon as we support
     *  text formatting!
     */
    public static class TextAndCHPX {
        private final String text;
        private final CHPX chpx;

        private TextAndCHPX(String text, CHPX chpx) {
            this.text = text;
            this.chpx = chpx;
        }

        /**
         * @return the text of this run
         */
        public String getText() {
            return text;
        }

        /**
         * @return the CHPX this run's text was cut out with
         */
        public CHPX getChpx() {
            return chpx;
        }
    }
}

View File

@ -0,0 +1,25 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import org.apache.poi.OldFileFormatException;
/**
 * Thrown when a Word document is in a format too old for the
 *  main HWPF code to handle, eg {@link HWPFDocument} throws it when
 *  the FIB version is below the supported minimum.
 */
public class OldWordFileFormatException extends OldFileFormatException {
/**
 * @param s the detail message, describing why the file was rejected
 */
public OldWordFileFormatException(String s) {
super(s);
}
}

View File

@ -0,0 +1,79 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * Class to extract the text from old (Word 6 / Word 95) Word Documents.
 *
 * This should only be used on the older files, for most uses you
 *  should call {@link WordExtractor} which deals properly
 *  with HWPF.
 *
 * @author Nick Burch
 */
public final class Word6Extractor extends POIOLE2TextExtractor {
    /** The old-format document that the text is pulled from */
    private final HWPFOldDocument doc;

    /**
     * Create a new Word Extractor
     * @param is InputStream containing the word file
     * @throws IOException if the stream can't be read or parsed
     */
    public Word6Extractor(InputStream is) throws IOException {
        this( new POIFSFileSystem(is) );
    }

    /**
     * Create a new Word Extractor
     * @param fs POIFSFileSystem containing the word file
     * @throws IOException if the document can't be read or parsed
     */
    public Word6Extractor(POIFSFileSystem fs) throws IOException {
        this(fs.getRoot(), fs);
    }

    /**
     * Create a new Word Extractor, for a document held in a
     *  specific directory of the filesystem
     * @param dir The directory holding the word file's streams
     * @param fs POIFSFileSystem containing the word file
     * @throws IOException if the document can't be read or parsed
     */
    public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
        this(new HWPFOldDocument(dir,fs));
    }

    /**
     * Create a new Word Extractor
     * @param doc The HWPFOldDocument to extract from
     */
    public Word6Extractor(HWPFOldDocument doc) {
        super(doc);
        this.doc = doc;
    }

    /**
     * Builds the text of the document, by joining together each
     *  text run in order, with any fields stripped out of it.
     *
     * @return the text of the whole document
     */
    @Override
    public String getText() {
        // Only used locally, so no need for a synchronised buffer
        StringBuilder text = new StringBuilder();

        for(TextAndCHPX tchpx : doc.getContents()) {
            text.append( Range.stripFields(tchpx.getText()) );
        }

        return text.toString();
    }
}

View File

@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* You should use either getParagraphText() or getText() unless * You should use either getParagraphText() or getText() unless
* you have a strong reason otherwise. * you have a strong reason otherwise.
* *
* @author Nick Burch (nick at torchbox dot com) * @author Nick Burch
*/ */
public final class WordExtractor extends POIOLE2TextExtractor { public final class WordExtractor extends POIOLE2TextExtractor {
private POIFSFileSystem fs; private POIFSFileSystem fs;

View File

@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0); CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
return props; return props;
} }
/**
 * Describes this CHPX by its start and end positions, both
 *  as reported by getStart()/getEnd() and in bytes.
 *
 * @return a human readable summary, intended for debugging
 */
@Override
public String toString() {
    return "CHPX from " + getStart() + " to " + getEnd() +
        " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
}
} }

View File

@ -0,0 +1,77 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian;
/**
 * This class holds all of the character formatting
 *  properties from Old (Word 6 / Word 95) documents.
 * Unlike with Word 97+, it all gets held in the
 *  same stream.
 * In common with the rest of the old support, it
 *  is read only
 */
public final class OldCHPBinTable
{
    /** List of character properties.*/
    protected final ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();

    /**
     * Constructor used to read an old-style binTable
     *  in from a Word document.
     *
     * @param documentStream The stream holding both the bin table and
     *         the formatted disk pages that it points to
     * @param offset Where in the stream the bin table starts
     * @param size The size of the bin table
     * @param fcMin Handed on to each {@link CHPFormattedDiskPage}
     * @param tpt The text piece table of the document, handed on to
     *         each {@link CHPFormattedDiskPage}
     */
    public OldCHPBinTable(byte[] documentStream, int offset,
                          int size, int fcMin, TextPieceTable tpt)
    {
        // Each entry in the plex carries a 2 byte page number
        PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);

        int length = binTable.length();
        for (int x = 0; x < length; x++)
        {
            GenericPropertyNode node = binTable.getProperty(x);

            // Treat the page number as unsigned, as otherwise any page
            //  from 32768 onwards turns into a negative stream offset
            int pageNum = LittleEndian.getShort(node.getBytes()) & 0xFFFF;
            int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

            CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
                pageOffset, fcMin, tpt);

            // Collect up every CHPX from the page, in order
            int fkpSize = cfkp.size();
            for (int y = 0; y < fkpSize; y++)
            {
                _textRuns.add(cfkp.getCHPX(y));
            }
        }
    }

    /**
     * @return the character runs, in the order they were read
     */
    public List<CHPX> getTextRuns()
    {
        return _textRuns;
    }
}

View File

@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples; import org.apache.poi.hwpf.HWPFTestDataSamples;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.POIDataSamples;
import java.io.FileInputStream;
/** /**
* Test the different routes to extracting text * Test the different routes to extracting text
@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {
assertTrue(b.toString().contains("TestComment")); assertTrue(b.toString().contains("TestComment"));
} }
/**
 * Word 95 files should be rejected by the normal extractor,
 *  but have their text available through Word6Extractor
 */
public void testWord95() throws Exception {
    // Too old for the default
    try {
        extractor = new WordExtractor(
                POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
        );
        fail("WordExtractor should reject Word 95 files as too old");
    } catch(OldWordFileFormatException expected) {
        // Good - this is the rejection we were looking for
    }

    // Can work with the special one
    Word6Extractor w6e = new Word6Extractor(
            POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
    );
    String text = w6e.getText();

    assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
    assertTrue(text.contains("Paragraph 2"));
    assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
    assertTrue(text.contains("Last (4th) paragraph"));
}
/**
 * Word 6 files should be rejected by the normal extractor,
 *  but have their text available through Word6Extractor
 */
public void testWord6() throws Exception {
    // Too old for the default
    try {
        extractor = new WordExtractor(
                POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
        );
        fail("WordExtractor should reject Word 6 files as too old");
    } catch(OldWordFileFormatException expected) {
        // Good - this is the rejection we were looking for
    }

    // Can work with the special one
    Word6Extractor w6e = new Word6Extractor(
            POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
    );
    String text = w6e.getText();

    assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
}
} }

Binary file not shown.

Binary file not shown.