More Word 6 / Word 95 Support

HWPFOldDocument now processes a few more table sections, and so we can fake up some basic Ranges. This allows us to do paragraph-level text extraction. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@960102 13f79535-47bb-0310-9956-ffa450edef68
2010-07-02 20:59:30 +00:00 · 2010-07-02 20:59:30 +00:00 · 2d9df14178
parent c1d139babd
commit 2d9df14178
15 changed files with 308 additions and 143 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@

    <changes>
        <release version="3.7-beta2" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
           <action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
           <action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
           <action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable;
 import org.apache.poi.hwpf.model.DocumentProperties;
 import org.apache.poi.hwpf.model.EscherRecordHolder;
 import org.apache.poi.hwpf.model.FSPATable;
-import org.apache.poi.hwpf.model.FileInformationBlock;
 import org.apache.poi.hwpf.model.FontTable;
 import org.apache.poi.hwpf.model.GenericPropertyNode;
 import org.apache.poi.hwpf.model.ListTables;
@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore

  protected TextPieceTable _tpt;

-  /** Contains formatting properties for text*/
-  protected CHPBinTable _cbt;
-
-  /** Contains formatting properties for paragraphs*/
-  protected PAPBinTable _pbt;
-
-  /** Contains formatting properties for sections.*/
-  protected SectionTable _st;
-
-  /** Holds styles for this document.*/
-  protected StyleSheet _ss;
-
-  /** Holds fonts for this document.*/
-  protected FontTable _ft;
-
-  /** Hold list tables */
-  protected ListTables _lt;
-
  /** Holds the save history for this document. */
  protected SavedByTable _sbt;
  
@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore
    }
  }

-  public StyleSheet getStyleSheet()
+  public TextPieceTable getTextTable()
  {
-    return _ss;
+    return _cft.getTextPieceTable();
  }

-  public FileInformationBlock getFileInformationBlock()
-  {
-    return _fib;
-  }
  public CPSplitCalculator getCPSplitCalculator()
  {
 	return _cpSplit;
@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore
    return length;
  }

-  public ListTables getListTables()
-  {
-    return _lt;
-  }
-
  /**
   * Gets a reference to the saved -by table, which holds the save history for the document.
   *
@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore
    pfs.writeFilesystem(out);
  }

-  public CHPBinTable getCharacterTable()
-  {
-    return _cbt;
-  }
-
-  public PAPBinTable getParagraphTable()
-  {
-    return _pbt;
-  }
-
-  public SectionTable getSectionTable()
-  {
-    return _st;
-  }
-
-  public TextPieceTable getTextTable()
-  {
-    return _cft.getTextPieceTable();
-  }
-
  public byte[] getDataStream()
  {
    return _dataStream;
@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore
    return _lt.addList(list.getListData(), list.getOverride());
  }

-  public FontTable getFontTable()
-  {
-    return _ft;
-  }
-
  public void delete(int start, int length)
  {
    Range r = new Range(start, start + length, this);
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@ -23,7 +23,15 @@ import java.io.PushbackInputStream;

 import org.apache.poi.EncryptedDocumentException;
 import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.CHPBinTable;
 import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument
  /** The FIB */
  protected FileInformationBlock _fib;

+  /** Holds styles for this document.*/
+  protected StyleSheet _ss;
+
+  /** Contains formatting properties for text*/
+  protected CHPBinTable _cbt;
+
+  /** Contains formatting properties for paragraphs*/
+  protected PAPBinTable _pbt;
+
+  /** Contains formatting properties for sections.*/
+  protected SectionTable _st;
+
+  /** Holds fonts for this document.*/
+  protected FontTable _ft;
+
+  /** Hold list tables */
+  protected ListTables _lt;
+
  /** main document stream buffer*/
  protected byte[] _mainStream;

@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument
    }
  }

+  /**
+   * Returns the range which covers the whole of the
+   *  document, but excludes any headers and footers.
+   */
+  public abstract Range getRange();
+  
+  public abstract TextPieceTable getTextTable();
+  
+  public CHPBinTable getCharacterTable()
+  {
+    return _cbt;
+  }
+
+  public PAPBinTable getParagraphTable()
+  {
+    return _pbt;
+  }
+
+  public SectionTable getSectionTable()
+  {
+    return _st;
+  }
+
+  public StyleSheet getStyleSheet()
+  {
+    return _ss;
+  }
+
+  public ListTables getListTables()
+  {
+    return _lt;
+  }
+
+  public FontTable getFontTable()
+  {
+    return _ft;
+  }
+
  public FileInformationBlock getFileInformationBlock()
  {
    return _fib;
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@ -18,15 +18,15 @@ package org.apache.poi.hwpf;

 import java.io.IOException;
 import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;

-import org.apache.poi.hwpf.model.CHPX;
 import org.apache.poi.hwpf.model.ComplexFileTable;
 import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.OldPAPBinTable;
+import org.apache.poi.hwpf.model.OldSectionTable;
 import org.apache.poi.hwpf.model.PieceDescriptor;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.LittleEndian;
@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian;
 /**
 * Provides very simple support for old (Word 6 / Word 95)
 *  files.
- * TODO Provide a way to get at the properties associated
- *  with each block of text
 */
 public class HWPFOldDocument extends HWPFDocumentCore {
-    private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>(); 
+    private TextPieceTable tpt;
    
    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
        this(fs.getRoot(), fs);
@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
        super(directory, fs);
        
        // Where are things?
+        int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
+        int sedTableSize   = LittleEndian.getInt(_mainStream, 0x8c);
        int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
-        int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+        int chpTableSize   = LittleEndian.getInt(_mainStream, 0xbc);
+        int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
+        int papTableSize   = LittleEndian.getInt(_mainStream, 0xc4);
+        //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
+        //int shfTableSize   = LittleEndian.getInt(_mainStream, 0x64);
        int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
        
        // We need to get hold of the text that makes up the
        //  document, which might be regular or fast-saved
        StringBuffer text = new StringBuffer();
-        TextPieceTable tpt;
        if(_fib.isFComplex()) {
            ComplexFileTable cft = new ComplexFileTable(
                    _mainStream, _mainStream,
@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
                text.append( tp.getStringBuffer() );
            }
        } else {
+            // TODO Discover if these older documents can ever hold Unicode Strings?
+            //  (We think not, because they seem to lack a Piece table)
            // TODO Build the Piece Descriptor properly
-            // TODO Can these old documents ever contain Unicode strings?
+            //  (We have to fake it, as they don't seem to have a proper Piece table)
            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
            pd.setFilePosition(_fib.getFcMin());

+            // Generate a single Text Piece Table, with a single Text Piece
+            //  which covers all the (8 bit only) text in the file
            tpt = new TextPieceTable();
            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore {
        }
        
        // Now we can fetch the character and paragraph properties
-        OldCHPBinTable chpTable = new OldCHPBinTable(
+        _cbt = new OldCHPBinTable(
                _mainStream, chpTableOffset, chpTableSize,
                _fib.getFcMin(), tpt
        );
-        
-        // Finally build up runs
-        for(CHPX chpx : chpTable.getTextRuns()) {
-            String str = text.substring(chpx.getStart(), chpx.getEnd());
-            contents.add(new TextAndCHPX(str,chpx));
-        }
+        _pbt = new OldPAPBinTable(
+                _mainStream, papTableOffset, papTableSize,
+                _fib.getFcMin(), tpt
+        );
+        _st = new OldSectionTable(
+                _mainStream, sedTableOffset, sedTableSize,
+                _fib.getFcMin(), tpt
+        );
+    }
+    
+    public Range getRange() {
+        // Life is easy when we have no footers, headers or unicode!
+        return new Range(
+                0, _fib.getFcMac() - _fib.getFcMin(), this
+        );
+    }
+
+    public TextPieceTable getTextTable()
+    {
+      return tpt;
    }

    @Override
    public void write(OutputStream out) throws IOException {
        throw new IllegalStateException("Writing is not available for the older file formats");
    }
-    
-    /**
-     * Retrieves all our text, in order, along with the
-     *  CHPX information on each bit.
-     * Every entry has the same formatting, but as yet 
-     *  we've no way to tell what the formatting is...
-     * Warnings - this will change as soon as we support
-     *  text formatting!
-     */
-    public List<TextAndCHPX> getContents() {
-        return contents;
-    }
-    
-    /**
-     * Warnings - this will change as soon as we support
-     *  text formatting!
-     */
-    public static class TextAndCHPX {
-        private String text;
-        private CHPX chpx;
-        private TextAndCHPX(String text, CHPX chpx) {
-            this.text = text;
-            this.chpx = chpx;
-        }
-        public String getText() {
-            return text;
-        }
-        public CHPX getChpx() {
-            return chpx;
-        }
-    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@ -22,7 +22,6 @@ import java.io.InputStream;

 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFOldDocument;
-import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
 		this.doc = doc;
 	}

-    @Override
+    /**
+     * Get the text from the word file, as an array with one String
+     *  per paragraph. If the paragraph model can't be built, falls
+     *  back to one String per text piece, with CR turned into CR LF.
+     */
+	public String[] getParagraphText() {
+	    String[] ret;
+
+	    // Extract using the model code
+	    try {
+	        Range r = doc.getRange();
+	        ret = WordExtractor.getParagraphText(r);
+	    } catch (Exception e) {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+	        ret = new String[doc.getTextTable().getTextPieces().size()];
+	        for(int i=0; i<ret.length; i++) {
+	            String piece = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();
+
+	            // Fix the line endings. Strings are immutable, so the
+	            //  result of the replacement has to be stored
+	            ret[i] = piece.replace("\r", "\r\n");
+	        }
+	    }
+
+	    return ret;
+	}
+
    public String getText() {
        StringBuffer text = new StringBuffer();
-        for(TextAndCHPX tchpx : doc.getContents()) {
-            text.append( Range.stripFields(tchpx.getText()) );
+        
+        for(String t : getParagraphText()) {
+            text.append(t);
        }
+
        return text.toString();
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -17,13 +17,12 @@

 package org.apache.poi.hwpf.extractor;

+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.FileInputStream;
 import java.io.UnsupportedEncodingException;
-import java.util.Iterator;
-import java.util.Arrays;
 import java.util.ArrayList;
+import java.util.Arrays;

 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
@ -133,7 +132,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
                return getParagraphText(r);
        }

-        private String[] getParagraphText(Range r) {
+        protected static String[] getParagraphText(Range r) {
                String[] ret;
                ret = new String[r.numParagraphs()];
                for (int i = 0; i < ret.length; i++) {
@ -215,10 +214,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
 	public String getTextFromPieces() {
    	StringBuffer textBuf = new StringBuffer();

-    	Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
-    	while (textPieces.hasNext()) {
-    		TextPiece piece = (TextPiece) textPieces.next();
-
+    	for(TextPiece piece : doc.getTextTable().getTextPieces()) {
    		String encoding = "Cp1252";
    		if (piece.isUnicode()) {
    			encoding = "UTF-16LE";
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@ -32,7 +32,7 @@ import org.apache.poi.hwpf.sprm.SprmBuffer;
 *
 * @author Ryan Ackley
 */
-public final class CHPBinTable
+public class CHPBinTable
 {
  /** List of character properties.*/
  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
@ -17,9 +17,6 @@

 package org.apache.poi.hwpf.model;

-import java.util.ArrayList;
-import java.util.List;
-
 import org.apache.poi.poifs.common.POIFSConstants;
 import org.apache.poi.util.LittleEndian;

@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian;
 * In common with the rest of the old support, it 
 *  is read only
 */
-public final class OldCHPBinTable
+public final class OldCHPBinTable extends CHPBinTable
 {
-  /** List of character properties.*/
-  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
-
  /**
   * Constructor used to read an old-style binTable
   *  in from a Word document.
@ -69,9 +63,4 @@ public final class OldCHPBinTable
      }
    }
  }
-
-  public List<CHPX> getTextRuns()
-  {
-    return _textRuns;
-  }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java
@ -0,0 +1,59 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the paragraph formatting 
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it 
+ *  is read only
+ */
+public final class OldPAPBinTable extends PAPBinTable
+{
+  public OldPAPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      int pageNum = LittleEndian.getShort(node.getBytes());
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
+        documentStream, pageOffset, fcMin, tpt);
+
+      int fkpSize = pfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+    	PAPX papx = pfkp.getPAPX(y);
+        _paragraphs.add(papx);
+      }
+    }
+  }
+}
+
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
@ -0,0 +1,65 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the section formatting 
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it 
+ *  is read only
+ */
+public final class OldSectionTable extends SectionTable
+{
+  public OldSectionTable(byte[] documentStream, int offset,
+                      int size, int fcMin,
+                      TextPieceTable tpt)
+  {
+    PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+
+    int length = sedPlex.length();
+
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = sedPlex.getProperty(x);
+      SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
+
+      int fileOffset = sed.getFc();
+      int startAt = node.getStart();
+      int endAt = node.getEnd();
+
+      // check for the optimization
+      if (fileOffset == 0xffffffff)
+      {
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+      }
+      else
+      {
+        // The first short at the offset is the size of the grpprl.
+        int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
+        byte[] buf = new byte[sepxSize];
+        fileOffset += LittleEndian.SHORT_SIZE;
+        System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+      }
+    }
+  }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian;
 *
 * @author Ryan Ackley
 */
-public final class PAPBinTable
+public class PAPBinTable
 {
  protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
  byte[] _dataStream;
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode {
    {
      return 0;
    }
-      return LittleEndian.getShort(buf);
+    if (buf.length == 1)
+    {
+      return (short)LittleEndian.getUnsignedByte(buf, 0);
+    }
+    return LittleEndian.getShort(buf);
  }

  public SprmBuffer getSprmBuf()
@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode {

  public ParagraphProperties getParagraphProperties(StyleSheet ss)
  {
+    if(ss == null) {
+        // TODO Fix up for Word 6/95
+        return new ParagraphProperties();
+    }
+      
    short istd = getIstd();
    ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
    ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*;
 /**
 * @author Ryan Ackley
 */
-public final class SectionTable
+public class SectionTable
 {
  private static final int SED_SIZE = 12;

-  protected ArrayList _sections = new ArrayList();
-  protected List _text;
+  protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
+  protected List<TextPiece> _text;

  /** So we can know if things are unicode or not */
  private TextPieceTable tpt;
@ -84,7 +84,7 @@ public final class SectionTable
    boolean matchAt = false;
    boolean matchHalf = false;
    for(int i=0; i<_sections.size(); i++) {
-    	SEPX s = (SEPX)_sections.get(i);
+    	SEPX s = _sections.get(i);
    	if(s.getEnd() == mainEndsAt) {
    		matchAt = true;
    	} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
@ -94,7 +94,7 @@ public final class SectionTable
    if(! matchAt && matchHalf) {
    	System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
        for(int i=0; i<_sections.size(); i++) {
-        	SEPX s = (SEPX)_sections.get(i);
+        	SEPX s = _sections.get(i);
            GenericPropertyNode node = sedPlex.getProperty(i);

        	s.setStart( CPtoFC(node.getStart()) );
@ -106,12 +106,12 @@ public final class SectionTable
  public void adjustForInsert(int listIndex, int length)
  {
    int size = _sections.size();
-    SEPX sepx = (SEPX)_sections.get(listIndex);
+    SEPX sepx = _sections.get(listIndex);
    sepx.setEnd(sepx.getEnd() + length);

    for (int x = listIndex + 1; x < size; x++)
    {
-      sepx = (SEPX)_sections.get(x);
+      sepx = _sections.get(x);
      sepx.setStart(sepx.getStart() + length);
      sepx.setEnd(sepx.getEnd() + length);
    }
@ -129,7 +129,7 @@ public final class SectionTable

      for(int i=_text.size()-1; i>-1; i--)
      {
-        TP = (TextPiece)_text.get(i);
+        TP = _text.get(i);

        if(CP >= TP.getCP()) break;
      }
@ -142,7 +142,7 @@ public final class SectionTable
      return FC;
    }

-  public ArrayList getSections()
+  public ArrayList<SEPX> getSections()
  {
    return _sections;
  }
@ -159,7 +159,7 @@ public final class SectionTable

    for (int x = 0; x < len; x++)
    {
-      SEPX sepx = (SEPX)_sections.get(x);
+      SEPX sepx = _sections.get(x);
      byte[] grpprl = sepx.getGrpprl();

      // write the sepx to the document stream. starts with a 2 byte size
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel;
 import org.apache.poi.util.LittleEndian;

 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;

 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass
 	protected int _end;

 	/** The document this range blongs to. */
-	protected HWPFDocument _doc;
+	protected HWPFDocumentCore _doc;

 	/** Have we loaded the section indexes yet */
 	boolean _sectionRangeFound;
@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass
 	 * @param doc
 	 *            The HWPFDocument the range is based on.
 	 */
-	public Range(int start, int end, HWPFDocument doc) {
+	public Range(int start, int end, HWPFDocumentCore doc) {
 		_start = start;
 		_end = end;
 		_doc = doc;
@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass
 	 *            The (signed) value that should be added to the FIB CCP fields
 	 */
 	protected void adjustFIB(int adjustment) {
+	    assert (_doc instanceof HWPFDocument);
+	    
 		// update the FIB.CCPText field (this should happen once per adjustment,
 		// so we don't want it in
 		// adjustForInsert() or it would get updated multiple times if the range
@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass
 		// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
 		// the document

-		CPSplitCalculator cpS = _doc.getCPSplitCalculator();
+		CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
 		FileInformationBlock fib = _doc.getFileInformationBlock();

 		// Do for each affected part
@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass
 		return _end;
 	}

-	protected HWPFDocument getDocument() {
+	protected HWPFDocumentCore getDocument() {

 		return _doc;
 	}
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase {
        assertTrue(text.contains("Paragraph 2"));
        assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
        assertTrue(text.contains("Last (4th) paragraph"));
+        
+        String[] tp = w6e.getParagraphText();
+        assertEquals(7, tp.length);
+        assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
+        assertEquals("\r\n", tp[1]);
+        assertEquals("Paragraph 2\r\n", tp[2]);
+        assertEquals("\r\n", tp[3]);
+        assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
+        assertEquals("\r\n", tp[5]);
+        assertEquals("Last (4th) paragraph.\r\n", tp[6]);
 	}
 	
 	public void testWord6() throws Exception {
@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase {
        String text = w6e.getText();
        
        assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+        
+        String[] tp = w6e.getParagraphText();
+        assertEquals(1, tp.length);
+        assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
 	}
 }