Basic text extraction support for old Word 6 and Word 95 documents via some HWPF extensions

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68
2010-06-30 15:13:10 +00:00 · 2010-06-30 15:13:10 +00:00 · 999aecbaa1
parent 7ae1a20f07
commit 999aecbaa1
11 changed files with 531 additions and 72 deletions
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@ -17,26 +17,43 @@

 package org.apache.poi.hwpf;

-import java.io.InputStream;
+import java.io.ByteArrayInputStream;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.PushbackInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.ByteArrayInputStream;
-
 import java.util.Iterator;

-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.POIDocument;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CPSplitCalculator;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.DocumentProperties;
+import org.apache.poi.hwpf.model.EscherRecordHolder;
+import org.apache.poi.hwpf.model.FSPATable;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.GenericPropertyNode;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.PlexOfCps;
+import org.apache.poi.hwpf.model.PropertyNode;
+import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.SavedByTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.ShapesTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.model.io.HWPFFileSystem;
+import org.apache.poi.hwpf.model.io.HWPFOutputStream;
+import org.apache.poi.hwpf.usermodel.HWPFList;
+import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.common.POIFSConstants;
-
-import org.apache.poi.hwpf.model.*;
-import org.apache.poi.hwpf.model.io.*;
-import org.apache.poi.hwpf.usermodel.*;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;


 /**
@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*;
 *
 * @author Ryan Ackley
 */
-public final class HWPFDocument extends POIDocument
-//  implements Cloneable
+public final class HWPFDocument extends HWPFDocumentCore
 {
-  /** The FIB */
-  protected FileInformationBlock _fib;
  /** And for making sense of CP lengths in the FIB */
  protected CPSplitCalculator _cpSplit;

-  /** main document stream buffer*/
-  protected byte[] _mainStream;
-
  /** table stream buffer*/
  protected byte[] _tableStream;

@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument

  protected HWPFDocument()
  {
-     super(null, null);
-  }
-
-  /**
-   * Takens an InputStream, verifies that it's not RTF, builds a
-   *  POIFSFileSystem from it, and returns that.
-   */
-  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
-	// Open a PushbackInputStream, so we can peek at the first few bytes
-	PushbackInputStream pis = new PushbackInputStream(istream,6);
-	byte[] first6 = new byte[6];
-	pis.read(first6);
-
-	// Does it start with {\rtf ? If so, it's really RTF
-	if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
-		&& first6[3] == 't' && first6[4] == 'f') {
-		throw new IllegalArgumentException("The document is really a RTF file");
-	}
-
-	// OK, so it's not RTF
-	// Open a POIFSFileSystem on the (pushed back) stream
-	pis.unread(first6);
-	return new POIFSFileSystem(pis);
+     super();
  }

  /**
@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument
   */
  public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
  {
-    // Sort out the hpsf properties
+    // Load the main stream and FIB
+    // Also handles HPSF bits
 	super(directory, pfilesystem);

-    // read in the main stream.
-    DocumentEntry documentProps = (DocumentEntry)
-       directory.getEntry("WordDocument");
-    _mainStream = new byte[documentProps.getSize()];
-
-    directory.createDocumentInputStream("WordDocument").read(_mainStream);
-
-    // Create our FIB, and check for the doc being encrypted
-    _fib = new FileInformationBlock(_mainStream);
+    // Do the CP Split
    _cpSplit = new CPSplitCalculator(_fib);
-    if(_fib.isFEncrypted()) {
-    	throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    
+    // Is this document too old for us?
+    if(_fib.getNFib() < 106) {
+        throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
    }

    // use the fib to determine the name of the table stream.
@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument
      t.printStackTrace();
    }
  }
-
-//  public Object clone()
-//    throws CloneNotSupportedException
-//  {
-//    _tpt;
-//
-//    _cbt;
-//
-//    _pbt;
-//
-//    _st;
-//
-//  }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@ -0,0 +1,130 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+
+/**
+ * This class holds much of the core of a Word document, but
+ *  without some of the table structure information.
+ * It loads the "WordDocument" stream into memory and builds
+ *  the {@link FileInformationBlock} from it.
+ * You generally want to work with one of
+ *  {@link HWPFDocument} or {@link HWPFOldDocument}
+ */
+public abstract class HWPFDocumentCore extends POIDocument
+{
+  /** The FIB */
+  protected FileInformationBlock _fib;
+
+  /** main document stream buffer*/
+  protected byte[] _mainStream;
+
+  protected HWPFDocumentCore()
+  {
+     super(null, null);
+  }
+
+  /**
+   * Takes an InputStream, verifies that it's not RTF, builds a
+   *  POIFSFileSystem from it, and returns that.
+   *
+   * @param istream The InputStream to check and then read the file system from.
+   * @return A POIFSFileSystem built from the (pushed back) stream.
+   * @throws IllegalArgumentException If the stream starts with the RTF signature.
+   * @throws IOException If reading from the stream fails.
+   */
+  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
+    // Open a PushbackInputStream, so we can peek at the first few bytes
+    PushbackInputStream pis = new PushbackInputStream(istream, 6);
+    byte[] first6 = new byte[6];
+    // A single read() may legally return fewer bytes than asked for,
+    //  so keep going until the buffer is full or the stream ends
+    int peeked = readFully(pis, first6);
+
+    // Does it start with {\rtf ? If so, it's really RTF
+    // (Bytes that were never read are still zero, so can't match)
+    if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
+        && first6[3] == 't' && first6[4] == 'f') {
+      throw new IllegalArgumentException("The document is really a RTF file");
+    }
+
+    // OK, so it's not RTF
+    // Open a POIFSFileSystem on the (pushed back) stream. Only push
+    //  back what was actually read, otherwise a short stream would
+    //  gain padding bytes it never contained
+    if(peeked > 0) {
+      pis.unread(first6, 0, peeked);
+    }
+    return new POIFSFileSystem(pis);
+  }
+
+  /**
+   * Reads from the stream until the buffer is full, or the
+   *  stream ends, whichever comes first.
+   *
+   * @return The number of bytes placed into the buffer.
+   */
+  private static int readFully(InputStream in, byte[] buffer) throws IOException {
+    int total = 0;
+    while(total < buffer.length) {
+      int count = in.read(buffer, total, buffer.length - total);
+      if(count < 0) {
+        break;
+      }
+      total += count;
+    }
+    return total;
+  }
+
+  /**
+   * This constructor loads a Word document from an InputStream.
+   *
+   * @param istream The InputStream that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in InputStream.
+   */
+  public HWPFDocumentCore(InputStream istream) throws IOException
+  {
+    //do Ole stuff
+    this( verifyAndBuildPOIFS(istream) );
+  }
+
+  /**
+   * This constructor loads a Word document from a POIFSFileSystem
+   *
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   */
+  public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
+  {
+    this(pfilesystem.getRoot(), pfilesystem);
+  }
+
+  /**
+   * This constructor loads a Word document from a specific point
+   *  in a POIFSFileSystem, probably not the default.
+   * Used typically to open embedded documents.
+   *
+   * @param directory The directory holding the "WordDocument" entry.
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   * @throws EncryptedDocumentException If the FIB says the document is encrypted.
+   */
+  public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
+  {
+    // Sort out the hpsf properties
+    super(directory, pfilesystem);
+
+    // read in the main stream.
+    DocumentEntry documentProps = (DocumentEntry)
+       directory.getEntry("WordDocument");
+    _mainStream = new byte[documentProps.getSize()];
+
+    // Loop until the whole buffer is filled, and release the
+    //  stream again once we're done with it
+    InputStream wordStream = directory.createDocumentInputStream("WordDocument");
+    try {
+      readFully(wordStream, _mainStream);
+    } finally {
+      wordStream.close();
+    }
+
+    // Create our FIB, and check for the doc being encrypted
+    _fib = new FileInformationBlock(_mainStream);
+    if(_fib.isFEncrypted()) {
+      throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    }
+  }
+
+  /**
+   * @return The FIB that was built from the main document stream.
+   */
+  public FileInformationBlock getFileInformationBlock()
+  {
+    return _fib;
+  }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@ -0,0 +1,135 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Provides very simple support for old (Word 6 / Word 95)
+ *  files.
+ * The text is read from the main stream, and is then split up
+ *  according to the runs reported by {@link OldCHPBinTable}.
+ * TODO Provide a way to get at the properties associated
+ *  with each block of text
+ */
+public class HWPFOldDocument extends HWPFDocumentCore {
+    /** Position in the main stream of the CHP table offset */
+    private static final int CHP_TABLE_OFFSET_POS = 0xb8;
+    /** Position in the main stream of the CHP table size */
+    private static final int CHP_TABLE_SIZE_POS = 0xbc;
+    /** Position in the main stream of the complex table offset */
+    private static final int COMPLEX_TABLE_OFFSET_POS = 0x160;
+
+    /** The text of the document, one entry per CHPX run */
+    private final List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
+
+    /**
+     * Loads an old Word document from the root of the file system.
+     *
+     * @param fs The POIFSFileSystem that contains the Word document.
+     * @throws IOException If the underlying file system can't be read.
+     */
+    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    /**
+     * Loads an old Word document from the given directory
+     *  of the file system.
+     *
+     * @param directory The directory holding the "WordDocument" entry.
+     * @param fs The POIFSFileSystem that contains the Word document.
+     * @throws IOException If the underlying file system can't be read.
+     */
+    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
+            throws IOException {
+        super(directory, fs);
+
+        // Where are things?
+        int chpTableOffset = LittleEndian.getInt(_mainStream, CHP_TABLE_OFFSET_POS);
+        int chpTableSize = LittleEndian.getInt(_mainStream, CHP_TABLE_SIZE_POS);
+        int complexTableOffset = LittleEndian.getInt(_mainStream, COMPLEX_TABLE_OFFSET_POS);
+
+        // We need to get hold of the text that makes up the
+        //  document, which might be regular or fast-saved
+        StringBuilder text = new StringBuilder();
+        TextPieceTable tpt;
+        if(_fib.isFComplex()) {
+            // Unlike with the newer formats, the table lives in
+            //  the main stream too, so that is passed in twice
+            ComplexFileTable cft = new ComplexFileTable(
+                    _mainStream, _mainStream,
+                    complexTableOffset, _fib.getFcMin()
+            );
+            tpt = cft.getTextPieceTable();
+
+            for(TextPiece tp : tpt.getTextPieces()) {
+                text.append( tp.getStringBuffer() );
+            }
+        } else {
+            // No piece table, so treat everything between fcMin
+            //  and fcMac as one single piece of text
+            // TODO Build the Piece Descriptor properly
+            // TODO Can these old documents ever contain Unicode strings?
+            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
+            pd.setFilePosition(_fib.getFcMin());
+
+            tpt = new TextPieceTable();
+            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
+            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
+            TextPiece tp = new TextPiece(
+                    0, textData.length, textData, pd, 0
+            );
+            tpt.getTextPieces().add(tp);
+
+            text.append(tp.getStringBuffer());
+        }
+
+        // Now we can fetch the character and paragraph properties
+        OldCHPBinTable chpTable = new OldCHPBinTable(
+                _mainStream, chpTableOffset, chpTableSize,
+                _fib.getFcMin(), tpt
+        );
+
+        // Finally build up runs
+        for(CHPX chpx : chpTable.getTextRuns()) {
+            String str = text.substring(chpx.getStart(), chpx.getEnd());
+            contents.add(new TextAndCHPX(str,chpx));
+        }
+    }
+
+    /**
+     * Not supported, the old formats are read only.
+     *
+     * @throws IllegalStateException Always.
+     */
+    @Override
+    public void write(OutputStream out) throws IOException {
+        throw new IllegalStateException("Writing is not available for the older file formats");
+    }
+
+    /**
+     * Retrieves all our text, in order, along with the
+     *  CHPX information on each bit.
+     * Every entry has the same formatting, but as yet
+     *  we've no way to tell what the formatting is...
+     * Warnings - this will change as soon as we support
+     *  text formatting!
+     */
+    public List<TextAndCHPX> getContents() {
+        return contents;
+    }
+
+    /**
+     * A piece of text, along with the CHPX it was cut out with.
+     * Warnings - this will change as soon as we support
+     *  text formatting!
+     */
+    public static class TextAndCHPX {
+        private final String text;
+        private final CHPX chpx;
+        private TextAndCHPX(String text, CHPX chpx) {
+            this.text = text;
+            this.chpx = chpx;
+        }
+        public String getText() {
+            return text;
+        }
+        public CHPX getChpx() {
+            return chpx;
+        }
+    }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
@ -0,0 +1,25 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import org.apache.poi.OldFileFormatException;
+
+/**
+ * Thrown when a Word document is in a format too old to be handled.
+ */
+public class OldWordFileFormatException extends OldFileFormatException {
+    /**
+     * @param message Description of why the document was rejected.
+     */
+    public OldWordFileFormatException(String message) {
+        super(message);
+    }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@ -0,0 +1,79 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from old (Word 6 / Word 95) Word Documents.
+ *
+ * This should only be used on the older files, for most uses you
+ *  should call {@link WordExtractor} which deals properly
+ *  with HWPF.
+ *
+ * @author Nick Burch
+ */
+public final class Word6Extractor extends POIOLE2TextExtractor {
+	/** The document the text gets pulled from */
+	private final HWPFOldDocument doc;
+
+	/**
+	 * Create a new Word Extractor
+	 * @param is InputStream containing the word file
+	 */
+	public Word6Extractor(InputStream is) throws IOException {
+		this( new POIFSFileSystem(is) );
+	}
+
+	/**
+	 * Create a new Word Extractor
+	 * @param fs POIFSFileSystem containing the word file
+	 */
+	public Word6Extractor(POIFSFileSystem fs) throws IOException {
+		this(fs.getRoot(), fs);
+	}
+
+	/**
+	 * Create a new Word Extractor
+	 * @param dir Directory within the file system holding the word file
+	 * @param fs POIFSFileSystem containing the word file
+	 */
+	public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+		this(new HWPFOldDocument(dir,fs));
+	}
+
+	/**
+	 * Create a new Word Extractor
+	 * @param doc The HWPFOldDocument to extract from
+	 */
+	public Word6Extractor(HWPFOldDocument doc) {
+		super(doc);
+		this.doc = doc;
+	}
+
+	/**
+	 * Joins up the text of every entry in the document's contents,
+	 *  passing each one through {@link Range#stripFields(String)} first.
+	 */
+	@Override
+	public String getText() {
+		StringBuilder text = new StringBuilder();
+		for(TextAndCHPX tchpx : doc.getContents()) {
+			text.append( Range.stripFields(tchpx.getText()) );
+		}
+		return text.toString();
+	}
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * You should use either getParagraphText() or getText() unless
 *  you have a strong reason otherwise.
 *
- * @author Nick Burch (nick at torchbox dot com)
+ * @author Nick Burch
 */
 public final class WordExtractor extends POIOLE2TextExtractor {
 	private POIFSFileSystem fs;
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
    CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
    return props;
  }
+  
+  /**
+   * Gives a short description of this CHPX, made up of its
+   *  start and end, plus its start and end in bytes.
+   */
+  @Override
+  public String toString() {
+      return "CHPX from " + getStart() + " to " + getEnd() + 
+         " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
+  }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
@ -0,0 +1,77 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the character formatting
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it
+ *  is read only
+ */
+public final class OldCHPBinTable
+{
+  /** List of character properties.*/
+  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
+
+  /**
+   * Constructor used to read an old-style binTable
+   *  in from a Word document.
+   *
+   * @param documentStream The stream holding both the bin table and
+   *         the formatted disk pages it points to
+   * @param offset Where in the stream the bin table starts
+   * @param size The size of the bin table
+   * @param fcMin Passed on to each {@link CHPFormattedDiskPage}
+   * @param tpt The text piece table, passed on to each
+   *         {@link CHPFormattedDiskPage}
+   */
+  public OldCHPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    // Each entry in the bin table holds a 2 byte page number
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      // The page number is unsigned, so mask off the sign extension,
+      //  otherwise pages from 0x8000 onwards give a negative offset
+      int pageNum = LittleEndian.getShort(node.getBytes()) & 0xFFFF;
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin, tpt);
+
+      // Collect every CHPX on the page, in order
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  /**
+   * @return All the CHPXs found, in the order they were read
+   */
+  public List<CHPX> getTextRuns()
+  {
+    return _textRuns;
+  }
+}
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;

 import junit.framework.TestCase;

+import org.apache.poi.POIDataSamples;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.POIDataSamples;
-
-import java.io.FileInputStream;

 /**
 * Test the different routes to extracting text
@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {

 		assertTrue(b.toString().contains("TestComment"));
 	}
+	
+	/**
+	 * Word 95 files should be rejected by WordExtractor,
+	 *  but have their text available via Word6Extractor
+	 */
+	public void testWord95() throws Exception {
+		// Too old for the default
+		try {
+			extractor = new WordExtractor(
+					POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
+			);
+			fail("WordExtractor should have rejected Word95.doc as too old");
+		} catch(OldWordFileFormatException expected) {
+			// Good, that is the exception we wanted
+		}
+
+		// Can work with the special one
+		Word6Extractor w6e = new Word6Extractor(
+				POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
+		);
+		String text = w6e.getText();
+
+		assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+		assertTrue(text.contains("Paragraph 2"));
+		assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
+		assertTrue(text.contains("Last (4th) paragraph"));
+	}
+	
+	/**
+	 * Word 6 files should be rejected by WordExtractor,
+	 *  but have their text available via Word6Extractor
+	 */
+	public void testWord6() throws Exception {
+		// Too old for the default
+		try {
+			extractor = new WordExtractor(
+					POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
+			);
+			fail("WordExtractor should have rejected Word6.doc as too old");
+		} catch(OldWordFileFormatException expected) {
+			// Good, that is the exception we wanted
+		}
+
+		// Can work with the special one
+		Word6Extractor w6e = new Word6Extractor(
+				POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
+		);
+		String text = w6e.getText();
+
+		assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+	}
 }
--- a/test-data/document/Word6.doc
+++ b/test-data/document/Word6.doc
--- a/test-data/document/Word95.doc
+++ b/test-data/document/Word95.doc