diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index f0d8b1d8ee..bd31f6253d 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -17,26 +17,43 @@ package org.apache.poi.hwpf; -import java.io.InputStream; +import java.io.ByteArrayInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.PushbackInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; -import java.io.ByteArrayInputStream; - import java.util.Iterator; -import org.apache.poi.EncryptedDocumentException; -import org.apache.poi.POIDocument; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.hwpf.model.CHPBinTable; +import org.apache.poi.hwpf.model.CPSplitCalculator; +import org.apache.poi.hwpf.model.ComplexFileTable; +import org.apache.poi.hwpf.model.DocumentProperties; +import org.apache.poi.hwpf.model.EscherRecordHolder; +import org.apache.poi.hwpf.model.FSPATable; +import org.apache.poi.hwpf.model.FileInformationBlock; +import org.apache.poi.hwpf.model.FontTable; +import org.apache.poi.hwpf.model.GenericPropertyNode; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.model.PAPBinTable; +import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.model.PlexOfCps; +import org.apache.poi.hwpf.model.PropertyNode; +import org.apache.poi.hwpf.model.RevisionMarkAuthorTable; +import org.apache.poi.hwpf.model.SavedByTable; +import org.apache.poi.hwpf.model.SectionTable; +import org.apache.poi.hwpf.model.ShapesTable; +import org.apache.poi.hwpf.model.StyleSheet; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.model.TextPieceTable; +import 
org.apache.poi.hwpf.model.io.HWPFFileSystem; +import org.apache.poi.hwpf.model.io.HWPFOutputStream; +import org.apache.poi.hwpf.usermodel.HWPFList; +import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.common.POIFSConstants; - -import org.apache.poi.hwpf.model.*; -import org.apache.poi.hwpf.model.io.*; -import org.apache.poi.hwpf.usermodel.*; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*; * * @author Ryan Ackley */ -public final class HWPFDocument extends POIDocument -// implements Cloneable +public final class HWPFDocument extends HWPFDocumentCore { - /** The FIB */ - protected FileInformationBlock _fib; /** And for making sense of CP lengths in the FIB */ protected CPSplitCalculator _cpSplit; - /** main document stream buffer*/ - protected byte[] _mainStream; - /** table stream buffer*/ protected byte[] _tableStream; @@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument protected HWPFDocument() { - super(null, null); - } - - /** - * Takens an InputStream, verifies that it's not RTF, builds a - * POIFSFileSystem from it, and returns that. - */ - public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException { - // Open a PushbackInputStream, so we can peek at the first few bytes - PushbackInputStream pis = new PushbackInputStream(istream,6); - byte[] first6 = new byte[6]; - pis.read(first6); - - // Does it start with {\rtf ? 
If so, it's really RTF - if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r' - && first6[3] == 't' && first6[4] == 'f') { - throw new IllegalArgumentException("The document is really a RTF file"); - } - - // OK, so it's not RTF - // Open a POIFSFileSystem on the (pushed back) stream - pis.unread(first6); - return new POIFSFileSystem(pis); + super(); } /** @@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument */ public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException { - // Sort out the hpsf properties + // Load the main stream and FIB + // Also handles HPSF bits super(directory, pfilesystem); - // read in the main stream. - DocumentEntry documentProps = (DocumentEntry) - directory.getEntry("WordDocument"); - _mainStream = new byte[documentProps.getSize()]; - - directory.createDocumentInputStream("WordDocument").read(_mainStream); - - // Create our FIB, and check for the doc being encrypted - _fib = new FileInformationBlock(_mainStream); + // Do the CP Split _cpSplit = new CPSplitCalculator(_fib); - if(_fib.isFEncrypted()) { - throw new EncryptedDocumentException("Cannot process encrypted word files!"); + + // Is this document too old for us? + if(_fib.getNFib() < 106) { + throw new OldWordFileFormatException("The document is too old (Word 95 or older) "); } // use the fib to determine the name of the table stream. 
@@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument t.printStackTrace(); } } - -// public Object clone() -// throws CloneNotSupportedException -// { -// _tpt; -// -// _cbt; -// -// _pbt; -// -// _st; -// -// } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java new file mode 100644 index 0000000000..af17cc2ed2 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java @@ -0,0 +1,130 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * This class holds much of the core of a Word document, but
+ *  without some of the table structure information.
+ * You generally want to work with one of
+ *  {@link HWPFDocument} or {@link HWPFOldDocument}
+ */
+public abstract class HWPFDocumentCore extends POIDocument
+{
+  /** The FIB */
+  protected FileInformationBlock _fib;
+
+  /** main document stream buffer*/
+  protected byte[] _mainStream;
+
+  protected HWPFDocumentCore()
+  {
+    super(null, null);
+  }
+
+  /**
+   * Takes an InputStream, verifies that it's not RTF, builds a
+   *  POIFSFileSystem from it, and returns that.
+   */
+  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
+    // Open a PushbackInputStream, so we can peek at the first few bytes
+    PushbackInputStream pis = new PushbackInputStream(istream,6);
+    byte[] first6 = new byte[6];
+    int peeked = 0; // read() may return short, so loop until 6 bytes or EOF
+    for (int n; peeked < first6.length && (n = pis.read(first6, peeked, first6.length - peeked)) > 0; ) {
+      peeked += n;
+    }
+    // Does it start with {\rtf ? If so, it's really RTF
+    if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r' && first6[3] == 't' && first6[4] == 'f') {
+      throw new IllegalArgumentException("The document is really a RTF file");
+    }
+
+    // Not RTF: push back only the bytes really read, then open a POIFSFileSystem
+    pis.unread(first6, 0, peeked);
+    return new POIFSFileSystem(pis);
+  }
+
+  /**
+   * This constructor loads a Word document from an InputStream.
+   *
+   * @param istream The InputStream that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in InputStream.
+   */
+  public HWPFDocumentCore(InputStream istream) throws IOException
+  {
+    //do Ole stuff
+    this( verifyAndBuildPOIFS(istream) );
+  }
+
+  /**
+   * This constructor loads a Word document from a POIFSFileSystem
+   *
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   */
+  public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
+  {
+    this(pfilesystem.getRoot(), pfilesystem);
+  }
+
+  /**
+   * This constructor loads a Word document from a specific point in a
+   *  POIFSFileSystem, probably not the default; typically an embedded document.
+   *
+   * @param directory The directory that holds the "WordDocument" entry.
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If the "WordDocument" entry cannot be read in full.
+   * @throws EncryptedDocumentException If the FIB flags the document as encrypted.
+   */
+  public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
+  {
+    // Sort out the hpsf properties
+    super(directory, pfilesystem);
+
+    // read in the main stream.
+    DocumentEntry documentProps = (DocumentEntry) directory.getEntry("WordDocument");
+    _mainStream = new byte[documentProps.getSize()];
+    // read() may stop early; readFully() fills the buffer or fails with EOFException
+    new DataInputStream(directory.createDocumentInputStream("WordDocument")).readFully(_mainStream);
+
+    // Create our FIB, and check for the doc being encrypted
+    _fib = new FileInformationBlock(_mainStream);
+    if(_fib.isFEncrypted()) {
+      throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    }
+  }
+
+  /** Returns the FIB that the loading constructor parsed from the main stream. */
+  public FileInformationBlock getFileInformationBlock()
+  {
+    return _fib;
+  }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
new file mode 100644
index 0000000000..42cff2ace8
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -0,0 +1,135 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Provides very simple support for old (Word 6 / Word 95)
+ *  files.
+ * TODO Provide a way to get at the properties associated
+ *  with each block of text
+ */
+public class HWPFOldDocument extends HWPFDocumentCore {
+    /** The document text cut into runs, one entry per CHPX returned by the bin table. */
+    private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
+
+    /** Opens the old-format document stored at the root of {@code fs}. */
+    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    /**
+     * Opens the old-format document stored in {@code directory}, gathers its
+     *  text and cuts it into one run per CHPX.
+     */
+    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs) throws IOException {
+        super(directory, fs);
+
+        // Where are things? (little-endian ints at fixed main-stream offsets)
+        int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
+        int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+        int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
+
+        // We need to get hold of the text that makes up the
+        //  document, which might be regular or fast-saved
+        StringBuffer text = new StringBuffer();
+        TextPieceTable tpt;
+        if(_fib.isFComplex()) {
+            ComplexFileTable cft = new ComplexFileTable(
+                    _mainStream, _mainStream, complexTableOffset, _fib.getFcMin());
+            tpt = cft.getTextPieceTable();
+
+            for(TextPiece tp : tpt.getTextPieces()) {
+                text.append( tp.getStringBuffer() );
+            }
+        } else {
+            // TODO Build the Piece Descriptor properly
+            // TODO Can these old documents ever contain Unicode strings?
+            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
+            pd.setFilePosition(_fib.getFcMin());
+
+            tpt = new TextPieceTable();
+            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
+            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
+            TextPiece tp = new TextPiece(0, textData.length, textData, pd, 0);
+            tpt.getTextPieces().add(tp);
+
+            text.append(tp.getStringBuffer());
+        }
+
+        // Now we can fetch the character and paragraph properties
+        OldCHPBinTable chpTable = new OldCHPBinTable(
+                _mainStream, chpTableOffset, chpTableSize, _fib.getFcMin(), tpt);
+
+        // Finally build up runs
+        for(CHPX chpx : chpTable.getTextRuns()) {
+            String str = text.substring(chpx.getStart(), chpx.getEnd());
+            contents.add(new TextAndCHPX(str,chpx));
+        }
+    }
+
+    /** Always fails: writing the older file formats is not supported. */
+    @Override
+    public void write(OutputStream out) throws IOException {
+        throw new IllegalStateException("Writing is not available for the older file formats");
+    }
+
+    /**
+     * Retrieves all our text, in order, along with the
+     *  CHPX information on each bit.
+     * Every entry has the same formatting, but as yet
+     *  we've no way to tell what the formatting is...
+     * Warnings - this will change as soon as we support
+     *  text formatting!
+     */
+    public List<TextAndCHPX> getContents() {
+        return contents;
+    }
+
+    /**
+     * Warnings - this will change as soon as we support
+     *  text formatting!
+     */
+    public static class TextAndCHPX {
+        private final String text;
+        private final CHPX chpx;
+        private TextAndCHPX(String text, CHPX chpx) {
+            this.text = text;
+            this.chpx = chpx;
+        }
+        public String getText() {
+            return text;
+        }
+        public CHPX getChpx() {
+            return chpx;
+        }
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
new file mode 100644
index 0000000000..cfa97bc70e
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
@@ -0,0 +1,25 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import org.apache.poi.OldFileFormatException;
+
+/** Signals a Word file too old for {@link HWPFDocument}, which throws this when the FIB's nFib is below 106. */
+public class OldWordFileFormatException extends OldFileFormatException {
+    /** @param s the detail message, handed unchanged to {@link OldFileFormatException} */
+    public OldWordFileFormatException(String s) { super(s); }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
new file mode 100644
index 0000000000..3ea6d42d46
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@@ -0,0 +1,79 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from old (Word 6 / Word 95) Word Documents.
+ *
+ * This should only be used on the older files, for most uses you
+ *  should call {@link WordExtractor} which deals properly
+ *  with HWPF.
+ *
+ * @author Nick Burch
+ */
+public final class Word6Extractor extends POIOLE2TextExtractor {
+    private final HWPFOldDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * @param is InputStream containing the word file
+     */
+    public Word6Extractor(InputStream is) throws IOException {
+        this( new POIFSFileSystem(is) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param fs POIFSFileSystem containing the word file
+     */
+    public Word6Extractor(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+    /** Create a new Word Extractor for the word file held in directory {@code dir} of {@code fs}. */
+    public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this(new HWPFOldDocument(dir,fs));
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param doc The HWPFOldDocument to extract from
+     */
+    public Word6Extractor(HWPFOldDocument doc) {
+        super(doc);
+        this.doc = doc;
+    }
+
+    @Override
+    public String getText() {
+        StringBuilder text = new StringBuilder(); // method-local, so no synchronisation is needed
+        for(TextAndCHPX tchpx : doc.getContents()) {
+            text.append( Range.stripFields(tchpx.getText()) ); // presumably drops field markup - confirm in Range
+        }
+        return text.toString();
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
index 1be78ab1a6..fe57f7c474 100644
--- 
a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * You should use either getParagraphText() or getText() unless * you have a strong reason otherwise. * - * @author Nick Burch (nick at torchbox dot com) + * @author Nick Burch */ public final class WordExtractor extends POIOLE2TextExtractor { private POIFSFileSystem fs; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java index b78cdffc57..f56621afad 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java @@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0); return props; } + + public String toString() { + return "CHPX from " + getStart() + " to " + getEnd() + + " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")"; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java new file mode 100644 index 0000000000..3c97652e55 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java @@ -0,0 +1,77 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the character formatting
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it
+ *  is read only
+ */
+public final class OldCHPBinTable
+{
+  /** List of character properties.*/
+  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
+
+  /**
+   * Constructor used to read an old-style binTable in from a Word document.
+   *
+   * @param documentStream bytes holding both the bin table and the disk pages it points to
+   * @param offset position in documentStream handed to PlexOfCps as the start of the bin table
+   * @param size size handed to PlexOfCps for the bin table
+   * @param fcMin passed through unchanged to every CHPFormattedDiskPage
+   * @param tpt text piece table, passed through unchanged to every CHPFormattedDiskPage
+   */
+  public OldCHPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+      // Each entry is a 2-byte page number; mask it so values above 32767 stay positive
+      int pageNum = LittleEndian.getShort(node.getBytes()) & 0xFFFF;
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin, tpt);
+
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  /** Returns the CHPX entries gathered from every disk page, in read order (the live internal list). */
+  public List<CHPX> getTextRuns()
+  {
+    return _textRuns;
+  }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index 47e0b431f6..9634ab9d40 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor; import junit.framework.TestCase; +import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFTestDataSamples; +import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.POIDataSamples; - -import java.io.FileInputStream; /** * Test the different routes to extracting text @@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase { assertTrue(b.toString().contains("TestComment")); } + + public void testWord95() throws Exception { + // Too old for the default + try { + extractor = new WordExtractor( + 
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") + ); + fail(); + } catch(OldWordFileFormatException e) {} + + // Can work with the special one + Word6Extractor w6e = new Word6Extractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") + ); + String text = w6e.getText(); + + assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); + assertTrue(text.contains("Paragraph 2")); + assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); + assertTrue(text.contains("Last (4th) paragraph")); + } + + public void testWord6() throws Exception { + // Too old for the default + try { + extractor = new WordExtractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") + ); + fail(); + } catch(OldWordFileFormatException e) {} + + Word6Extractor w6e = new Word6Extractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") + ); + String text = w6e.getText(); + + assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); + } } diff --git a/test-data/document/Word6.doc b/test-data/document/Word6.doc new file mode 100644 index 0000000000..a614a0783f Binary files /dev/null and b/test-data/document/Word6.doc differ diff --git a/test-data/document/Word95.doc b/test-data/document/Word95.doc new file mode 100644 index 0000000000..a214f29258 Binary files /dev/null and b/test-data/document/Word95.doc differ