replace ComplexFileTable with single-element-one right after load; replace text piece table as well

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1150675 13f79535-47bb-0310-9956-ffa450edef68
2011-07-25 12:58:09 +00:00 · 2011-07-25 12:58:09 +00:00 · 4c724bf71c
parent 23d2678a0e
commit 4c724bf71c
15 changed files with 323 additions and 391 deletions
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@ -23,8 +23,6 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.Iterator;
-import java.util.List;

 import org.apache.poi.hwpf.model.BookmarksTables;
 import org.apache.poi.hwpf.model.CHPBinTable;
@ -40,6 +38,7 @@ import org.apache.poi.hwpf.model.NoteType;
 import org.apache.poi.hwpf.model.NotesTables;
 import org.apache.poi.hwpf.model.PAPBinTable;
 import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
 import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
 import org.apache.poi.hwpf.model.SavedByTable;
 import org.apache.poi.hwpf.model.SectionTable;
@ -92,7 +91,7 @@ public final class HWPFDocument extends HWPFDocumentCore
  * structure*/
  protected ComplexFileTable _cft;

-  protected TextPieceTable _tpt;
+  protected final StringBuilder _text;

  /** Holds the save history for this document. */
  protected SavedByTable _sbt;
@ -139,6 +138,7 @@ public final class HWPFDocument extends HWPFDocumentCore
  protected HWPFDocument()
  {
     super();
+     this._text = new StringBuilder("\r");
  }

  /**
@ -246,15 +246,35 @@ public final class HWPFDocument extends HWPFDocumentCore
    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.getFcDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
-    _tpt = _cft.getTextPieceTable();
+    TextPieceTable _tpt = _cft.getTextPieceTable();

    // Now load the rest of the properties, which need to be adjusted
    //  for where text really begin
    _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt);
    _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);

+        _text = _tpt.getText();
        _cbt.rebuild( _cft );
-        _pbt.rebuild( _dataStream, _cft );
+        _pbt.rebuild( _text, _dataStream, _cft );
+
+        boolean preserve = false;
+        try
+        {
+            preserve = Boolean.parseBoolean( System
+                    .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
+        }
+        catch ( Exception exc )
+        {
+            // ignore: if the property cannot be read, keep the default (do not preserve)
+        }
+        if ( !preserve )
+        {
+            _cft = new ComplexFileTable();
+            _tpt = _cft.getTextPieceTable();
+            _tpt.add( new TextPiece( 0, _text.length(), _text.toString()
+                    .getBytes( "UTF-16LE" ), new PieceDescriptor( new byte[8],
+                    0 ) ) );
+        }

    // Read FSPA and Escher information
    _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
@ -314,6 +334,12 @@ public final class HWPFDocument extends HWPFDocumentCore
    return _cft.getTextPieceTable();
  }

+    @Override
+    public StringBuilder getText()
+    {
+        return _text;
+    }
+
  @Deprecated
  public CPSplitCalculator getCPSplitCalculator()
  {
@ -326,10 +352,7 @@ public final class HWPFDocument extends HWPFDocumentCore
  }

  public Range getOverallRange() {
-	  // hack to get the ending cp of the document, Have to revisit this.
-      TextPiece p =  _tpt.getTextPieces().get(_tpt.getTextPieces().size() - 1);
-
-      return new Range(0, p.getEnd(), this);
+      return new Range(0, _text.length(), this);
  }

    /**
@ -445,16 +468,7 @@ public final class HWPFDocument extends HWPFDocumentCore
   */
  public int characterLength()
  {
-    List<TextPiece> textPieces = _tpt.getTextPieces();
-    Iterator<TextPiece> textIt = textPieces.iterator();
-
-    int length = 0;
-    while(textIt.hasNext())
-    {
-      TextPiece tp = textIt.next();
-      length += tp.characterLength();
-    }
-    return length;
+      return _text.length();
  }

  /**
@ -643,7 +657,7 @@ public final class HWPFDocument extends HWPFDocumentCore

    // write out the PAPBinTable.
    _fib.setFcPlcfbtePapx(tableOffset);
-    _pbt.writeTo(docSys, fcMin);
+    _pbt.writeTo(docSys, fcMin, _cft.getTextPieceTable());
    _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();

--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@ -35,6 +35,7 @@ import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.Internal;


 /**
@ -161,8 +162,20 @@ public abstract class HWPFDocumentCore extends POIDocument
     */
    public abstract Range getOverallRange();

-  public abstract TextPieceTable getTextTable();
-  
+    /**
+     * Returns document text, i.e. text information from all text pieces,
+     * including OLE descriptions and field codes
+     */
+    public String getDocumentText() {
+        return getText().toString();
+    }
+
+    /**
+     * Internal method to access document text
+     */
+    @Internal
+    public abstract StringBuilder getText();
+
  public CHPBinTable getCharacterTable()
  {
    return _cbt;
@ -197,4 +210,6 @@ public abstract class HWPFDocumentCore extends POIDocument
  {
    return _fib;
  }
+
+    public abstract TextPieceTable getTextTable();
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@ -38,6 +38,8 @@ import org.apache.poi.util.LittleEndian;
 public class HWPFOldDocument extends HWPFDocumentCore {
    private TextPieceTable tpt;
    
+    private StringBuilder _text;
+    
    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
        this(fs.getRoot());
    }
@ -88,13 +90,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
            TextPiece tp = new TextPiece(
-                    0, textData.length, textData, pd, 0
+                    0, textData.length, textData, pd
            );
            tpt.add(tp);
            
            text.append(tp.getStringBuffer());
        }
        
+        _text = tpt.getText();
+
        // Now we can fetch the character and paragraph properties
        _cbt = new OldCHPBinTable(
                _mainStream, chpTableOffset, chpTableSize,
@ -126,6 +130,12 @@ public class HWPFOldDocument extends HWPFDocumentCore {
      return tpt;
    }

+    @Override
+    public StringBuilder getText()
+    {
+        return _text;
+    }
+
    @Override
    public void write(OutputStream out) throws IOException {
        throw new IllegalStateException("Writing is not available for the older file formats");
--- a/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java
@ -23,9 +23,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@ -37,10 +35,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.model.CHPX;
 import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.FileInformationBlock;
-import org.apache.poi.hwpf.model.GenericPropertyNode;
-import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
 import org.apache.poi.hwpf.model.PAPX;
-import org.apache.poi.hwpf.model.PlexOfCps;
 import org.apache.poi.hwpf.model.StyleSheet;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.sprm.SprmIterator;
@ -51,10 +46,8 @@ import org.apache.poi.hwpf.usermodel.Field;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hwpf.usermodel.Range;
-import org.apache.poi.poifs.common.POIFSConstants;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;

 /**
 * Used by developers to list out key information on a HWPF file. End users will
@ -241,13 +234,10 @@ public final class HWPFLister

    private LinkedHashMap<Integer, String> paragraphs;

-    private String text;
-
    public HWPFLister( HWPFDocumentCore doc )
    {
        _doc = doc;

-        buildText();
        buildParagraphs();
    }

@ -256,6 +246,7 @@ public final class HWPFLister
        paragraphs = new LinkedHashMap<Integer, String>();

        StringBuilder part = new StringBuilder();
+        String text = _doc.getDocumentText();
        for ( int charIndex = 0; charIndex < text.length(); charIndex++ )
        {
            char c = text.charAt( charIndex );
@ -268,24 +259,6 @@ public final class HWPFLister
        }
    }

-    private void buildText()
-    {
-        StringBuilder builder = new StringBuilder();
-        for ( TextPiece textPiece : _doc.getTextTable().getTextPieces() )
-        {
-            String toAppend = textPiece.getStringBuffer().toString();
-
-            if ( toAppend.length() != ( textPiece.getEnd() - textPiece
-                    .getStart() ) )
-            {
-                throw new AssertionError();
-            }
-
-            builder.replace( textPiece.getStart(), textPiece.getEnd(), toAppend );
-        }
-        this.text = builder.toString();
-    }
-
    private void dumpBookmarks()
    {
        if ( !( _doc instanceof HWPFDocument ) )
@ -379,69 +352,69 @@ public final class HWPFLister

    public void dumpPapx( boolean withProperties ) throws Exception
    {
-        if ( _doc instanceof HWPFDocument )
-        {
-            System.out.println( "binary PAP pages " );
-
-            HWPFDocument doc = (HWPFDocument) _doc;
-
-            java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
-                    .getDeclaredField( "_mainStream" );
-            fMainStream.setAccessible( true );
-            byte[] mainStream = (byte[]) fMainStream.get( _doc );
-
-            PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
-                    .getFileInformationBlock().getFcPlcfbtePapx(), doc
-                    .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
-
-            List<PAPX> papxs = new ArrayList<PAPX>();
-
-            int length = binTable.length();
-            for ( int x = 0; x < length; x++ )
-            {
-                GenericPropertyNode node = binTable.getProperty( x );
-
-                int pageNum = LittleEndian.getInt( node.getBytes() );
-                int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
-                        * pageNum;
-
-                PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
-                        mainStream, doc.getDataStream(), pageOffset,
-                        doc.getTextTable() );
-
-                System.out.println( "* PFKP: " + pfkp );
-
-                for ( PAPX papx : pfkp.getPAPXs() )
-                {
-                    System.out.println( "** " + papx );
-                    papxs.add( papx );
-                    if ( papx != null && true )
-                    {
-                        SprmIterator sprmIt = new SprmIterator(
-                                papx.getGrpprl(), 2 );
-                        while ( sprmIt.hasNext() )
-                        {
-                            SprmOperation sprm = sprmIt.next();
-                            System.out.println( "*** " + sprm.toString() );
-                        }
-                    }
-
-                }
-            }
-
-            Collections.sort( papxs );
-            System.out.println( "* Sorted by END" );
-            for ( PAPX papx : papxs )
-            {
-                System.out.println( "** " + papx );
-                SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
-                while ( sprmIt.hasNext() )
-                {
-                    SprmOperation sprm = sprmIt.next();
-                    System.out.println( "*** " + sprm.toString() );
-                }
-            }
-        }
+//        if ( _doc instanceof HWPFDocument )
+//        {
+//            System.out.println( "binary PAP pages " );
+//
+//            HWPFDocument doc = (HWPFDocument) _doc;
+//
+//            java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
+//                    .getDeclaredField( "_mainStream" );
+//            fMainStream.setAccessible( true );
+//            byte[] mainStream = (byte[]) fMainStream.get( _doc );
+//
+//            PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
+//                    .getFileInformationBlock().getFcPlcfbtePapx(), doc
+//                    .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
+//
+//            List<PAPX> papxs = new ArrayList<PAPX>();
+//
+//            int length = binTable.length();
+//            for ( int x = 0; x < length; x++ )
+//            {
+//                GenericPropertyNode node = binTable.getProperty( x );
+//
+//                int pageNum = LittleEndian.getInt( node.getBytes() );
+//                int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
+//                        * pageNum;
+//
+//                PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
+//                        mainStream, doc.getDataStream(), pageOffset,
+//                        doc.getTextTable() );
+//
+//                System.out.println( "* PFKP: " + pfkp );
+//
+//                for ( PAPX papx : pfkp.getPAPXs() )
+//                {
+//                    System.out.println( "** " + papx );
+//                    papxs.add( papx );
+//                    if ( papx != null && true )
+//                    {
+//                        SprmIterator sprmIt = new SprmIterator(
+//                                papx.getGrpprl(), 2 );
+//                        while ( sprmIt.hasNext() )
+//                        {
+//                            SprmOperation sprm = sprmIt.next();
+//                            System.out.println( "*** " + sprm.toString() );
+//                        }
+//                    }
+//
+//                }
+//            }
+//
+//            Collections.sort( papxs );
+//            System.out.println( "* Sorted by END" );
+//            for ( PAPX papx : papxs )
+//            {
+//                System.out.println( "** " + papx );
+//                SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+//                while ( sprmIt.hasNext() )
+//                {
+//                    SprmOperation sprm = sprmIt.next();
+//                    System.out.println( "*** " + sprm.toString() );
+//                }
+//            }
+//        }

        // for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
        // {
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -20,13 +20,11 @@ package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.Arrays;

 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
-import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
@ -218,22 +216,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
 	 *  mapping is broken. Fast too.
 	 */
 	public String getTextFromPieces() {
-    	StringBuffer textBuf = new StringBuffer();
-
-    	for(TextPiece piece : doc.getTextTable().getTextPieces()) {
-    		String encoding = "Cp1252";
-    		if (piece.isUnicode()) {
-    			encoding = "UTF-16LE";
-    		}
-    		try {
-    			String text = new String(piece.getRawBytes(), encoding);
-    			textBuf.append(text);
-    		} catch(UnsupportedEncodingException e) {
-    			throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
-    		}
-    	}
-
-    	String text = textBuf.toString();
+    	String text = doc.getDocumentText();

    	// Fix line endings (Note - won't get all of them
    	text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@ -179,34 +179,6 @@ public class CHPBinTable
            start = System.currentTimeMillis();
        }

-        // rebuild document paragraphs structure
-        StringBuilder docText = new StringBuilder();
-        for ( TextPiece textPiece : tpt.getTextPieces() )
-        {
-            String toAppend = textPiece.getStringBuffer().toString();
-            int toAppendLength = toAppend.length();
-
-            if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
-            {
-                logger.log(
-                        POILogger.WARN,
-                        "Text piece has boundaries [",
-                        Integer.valueOf( textPiece.getStart() ),
-                        "; ",
-                        Integer.valueOf( textPiece.getEnd() ),
-                        ") but length ",
-                        Integer.valueOf( textPiece.getEnd()
-                                - textPiece.getStart() ) );
-            }
-
-            docText.replace( textPiece.getStart(), textPiece.getStart()
-                    + toAppendLength, toAppend );
-        }
-        logger.log( POILogger.DEBUG, "Document text rebuilded in ",
-                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
-                Integer.valueOf( docText.length() ), " chars)" );
-        start = System.currentTimeMillis();
-
        List<CHPX> oldChpxSortedByStartPos = new ArrayList<CHPX>( _textRuns );
        Collections.sort( oldChpxSortedByStartPos,
                PropertyNode.StartComparator.instance );
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@ -54,9 +54,6 @@ public class PAPBinTable
  protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
  byte[] _dataStream;

-  /** So we can know if things are unicode or not */
-  private TextPieceTable tpt;
-
  public PAPBinTable()
  {
  }
@ -81,7 +78,6 @@ public class PAPBinTable

        {
            PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
-            this.tpt = tpt;

            int length = binTable.length();
            for ( int x = 0; x < length; x++ )
@ -112,7 +108,8 @@ public class PAPBinTable
                Integer.valueOf( _paragraphs.size() ), " elements)" );
    }

-    public void rebuild( byte[] dataStream, ComplexFileTable complexFileTable )
+    public void rebuild( final StringBuilder docText, byte[] dataStream,
+            ComplexFileTable complexFileTable )
    {
        long start = System.currentTimeMillis();

@ -121,7 +118,8 @@ public class PAPBinTable
            SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();

            // adding PAPX from fast-saved SPRMs
-            for ( TextPiece textPiece : tpt.getTextPieces() )
+            for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
+                    .getTextPieces() )
            {
                PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
                if ( !prm.isComplex() )
@ -167,34 +165,6 @@ public class PAPBinTable
            start = System.currentTimeMillis();
        }

-        // rebuild document paragraphs structure
-        StringBuilder docText = new StringBuilder();
-        for ( TextPiece textPiece : tpt.getTextPieces() )
-        {
-            String toAppend = textPiece.getStringBuffer().toString();
-            int toAppendLength = toAppend.length();
-
-            if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
-            {
-                logger.log(
-                        POILogger.WARN,
-                        "Text piece has boundaries [",
-                        Integer.valueOf( textPiece.getStart() ),
-                        "; ",
-                        Integer.valueOf( textPiece.getEnd() ),
-                        ") but length ",
-                        Integer.valueOf( textPiece.getEnd()
-                                - textPiece.getStart() ) );
-            }
-
-            docText.replace( textPiece.getStart(), textPiece.getStart()
-                    + toAppendLength, toAppend );
-        }
-        logger.log( POILogger.DEBUG, "Document text rebuilded in ",
-                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
-                Integer.valueOf( docText.length() ), " chars)" );
-        start = System.currentTimeMillis();
-
        List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
        Collections.sort( oldPapxSortedByEndPos,
                PropertyNode.EndComparator.instance );
@ -274,7 +244,8 @@ public class PAPBinTable
            {
                // can we reuse existing?
                PAPX existing = papxs.get( 0 );
-                if ( existing.getStart() == startInclusive && existing.getEnd() == endExclusive )
+                if ( existing.getStart() == startInclusive
+                        && existing.getEnd() == endExclusive )
                {
                    newPapxs.add( existing );
                    lastParStart = endExclusive;
@ -311,7 +282,8 @@ public class PAPBinTable
        this._paragraphs = new ArrayList<PAPX>( newPapxs );

        logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
-                Long.valueOf( System.currentTimeMillis() - start ), " ms" );
+                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+                Integer.valueOf( _paragraphs.size() ), " elements)" );
        start = System.currentTimeMillis();

        _dataStream = dataStream;
@ -320,7 +292,7 @@ public class PAPBinTable
  public void insert(int listIndex, int cpStart, SprmBuffer buf)
  {

-    PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
+    PAPX forInsert = new PAPX(0, 0, buf, _dataStream);

    // Ensure character offsets are really characters
    forInsert.setStart(cpStart);
@ -350,7 +322,7 @@ public class PAPBinTable
    	//  Original, until insert at point
    	//  New one
    	//  Clone of original, on to the old end
-        PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
+        PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream);
        // Again ensure contains character based offsets no matter what
        clone.setStart(cpStart);
        clone.setEnd(currentPap.getEnd());
@ -427,9 +399,8 @@ public class PAPBinTable
    return _paragraphs;
  }

-  public void writeTo(HWPFFileSystem sys, int fcMin)
-    throws IOException
-  {
+    public void writeTo( HWPFFileSystem sys, int fcMin, CharIndexTranslator translator ) throws IOException
+    {

    HWPFOutputStream docStream = sys.getStream("WordDocument");
    OutputStream tableStream = sys.getStream("1Table");
@ -463,7 +434,7 @@ public class PAPBinTable
      PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(_dataStream);
      pfkp.fill(overflow);

-      byte[] bufFkp = pfkp.toByteArray(tpt, fcMin);
+      byte[] bufFkp = pfkp.toByteArray(translator, fcMin);
      docStream.write(bufFkp);
      overflow = pfkp.getOverflow();

--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
@ -19,6 +19,9 @@ package org.apache.poi.hwpf.model;


 import java.io.UnsupportedEncodingException;
+
+import org.apache.poi.util.Internal;
+
 /**
 * Lightweight representation of a text piece.
 * Works in the character domain, not the byte domain, so you
@ -27,19 +30,39 @@ import java.io.UnsupportedEncodingException;
 *
 * @author Ryan Ackley
 */
-
+@Internal
 public final class TextPiece extends PropertyNode<TextPiece>
 {
  private boolean _usesUnicode;

  private PieceDescriptor _pd;

-  /**
-   * @param start Beginning offset in main document stream, in characters.
-   * @param end Ending offset in main document stream, in characters.
-   * @param text The raw bytes of our text
-   */
-  public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
+    /**
+     * @param start Beginning offset in main document stream, in characters.
+     * @param end Ending offset in main document stream, in characters.
+     * @param text The raw bytes of our text
+     * @param pd The piece descriptor of this text piece
+     * @param cpStart ignored; this constructor only delegates without it
+     * @deprecated Use {@link #TextPiece(int,int,byte[],PieceDescriptor)}
+     *             instead
+     */
+    @Deprecated
+    public TextPiece( int start, int end, byte[] text, PieceDescriptor pd,
+            int cpStart )
+    {
+        this( start, end, text, pd );
+    }
+
+    /**
+     * @param start
+     *            Beginning offset in main document stream, in characters.
+     * @param end
+     *            Ending offset in main document stream, in characters.
+     * @param text
+     *            The raw bytes of our text
+     */
+    public TextPiece( int start, int end, byte[] text, PieceDescriptor pd )
+    {
 	  super(start, end, buildInitSB(text, pd));
 	  _usesUnicode = pd.isUnicode();
 	  _pd = pd;
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@ -24,6 +24,8 @@ import java.util.List;

 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
 import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;

 /**
 * The piece table for matching up character positions to bits of text. This
@ -34,6 +36,9 @@ import org.apache.poi.poifs.common.POIFSConstants;
 */
 public class TextPieceTable implements CharIndexTranslator
 {
+    private static final POILogger logger = POILogFactory
+            .getLogger( TextPieceTable.class );
+
    // int _multiple;
    int _cpMin;
    protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
@ -101,7 +106,7 @@ public class TextPieceTable implements CharIndexTranslator

            // And now build the piece
            _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
-                    pieces[x], node.getStart() ) );
+                    pieces[x] ) );
        }

        // In the interest of our sanity, now sort the text pieces
@ -251,6 +256,41 @@ public class TextPieceTable implements CharIndexTranslator
        return _cpMin;
    }

+    /**
+     * Rebuilds the complete document text by writing every text piece into
+     * one buffer at the piece's start offset.
+     *
+     * @return a newly built buffer with the text of all pieces
+     */
+    public StringBuilder getText()
+    {
+        final long start = System.currentTimeMillis();
+
+        StringBuilder docText = new StringBuilder();
+        for ( TextPiece textPiece : _textPieces )
+        {
+            String toAppend = textPiece.getStringBuffer().toString();
+            int toAppendLength = toAppend.length();
+            if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
+            {
+                // report the actual decoded length, not the declared span
+                logger.log( POILogger.WARN, "Text piece has boundaries [",
+                        Integer.valueOf( textPiece.getStart() ), "; ",
+                        Integer.valueOf( textPiece.getEnd() ), ") but length ",
+                        Integer.valueOf( toAppendLength ) );
+            }
+
+            docText.replace( textPiece.getStart(), textPiece.getStart()
+                    + toAppendLength, toAppend );
+        }
+
+        logger.log( POILogger.DEBUG, "Document text were rebuilded in ",
+                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+                Integer.valueOf( docText.length() ), " chars)" );
+
+        return docText;
+    }
+
    public List<TextPiece> getTextPieces()
    {
        return _textPieces;
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@ -31,7 +31,7 @@ import org.apache.poi.hwpf.model.PropertyNode;
 import org.apache.poi.hwpf.model.SEPX;
 import org.apache.poi.hwpf.model.StyleSheet;
 import org.apache.poi.hwpf.model.SubdocumentType;
-import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
 import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
 import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -108,18 +108,8 @@ public class Range { // TODO -instantiable superclass
 	/** The end index in the characterRuns list for this Range. */
 	protected int _charEnd;

-	/** Have we loaded the Text indexes yet */
-	protected boolean _textRangeFound;
-
-	/** All text pieces that belong to the document this Range belongs to. */
-	protected List<TextPiece> _text;
-
-	/** The start index in the text list for this Range. */
-	protected int _textStart;
-
-	/** The end index in the text list for this Range. */
-	protected int _textEnd;
-
+	protected StringBuilder _text;
+	
 	// protected Range()
 	// {
 	//
@ -144,7 +134,7 @@ public class Range { // TODO -instantiable superclass
 		_sections = _doc.getSectionTable().getSections();
 		_paragraphs = _doc.getParagraphTable().getParagraphs();
 		_characters = _doc.getCharacterTable().getTextRuns();
-		_text = _doc.getTextTable().getTextPieces();
+		_text = _doc.getText();
 		_parent = new WeakReference<Range>(null);

 		sanityCheckStartEnd();
@ -171,6 +161,7 @@ public class Range { // TODO -instantiable superclass
 		_parent = new WeakReference<Range>(parent);

 		sanityCheckStartEnd();
+		assert sanityCheck();
 	}

 	/**
@ -212,23 +203,17 @@ public class Range { // TODO -instantiable superclass
 		}
 	}

-	/**
-	 * Does any <code>TextPiece</code> in this Range use unicode?
-	 *
-	 * @return true if it does and false if it doesn't
-	 */
-	public boolean usesUnicode() {
-
-		initText();
-
-		for (int i = _textStart; i < _textEnd; i++) {
-			TextPiece piece = _text.get(i);
-			if (piece.isUnicode())
-				return true;
-		}
-
-		return false;
-	}
+    /**
+     * @return always {@code true}; the range no longer inspects text pieces
+     * @deprecated Range is not linked to any text piece anymore, so to check if
+     *             unicode is used please access {@link TextPieceTable} during
+     *             document load time
+     */
+    @Deprecated
+    public boolean usesUnicode()
+    {
+        return true;
+    }

 	/**
 	 * Gets the text that this Range contains.
@ -236,29 +221,7 @@ public class Range { // TODO -instantiable superclass
 	 * @return The text for this range.
 	 */
 	public String text() {
-		initText();
-
-		StringBuffer sb = new StringBuffer();
-
-		for (int x = _textStart; x < _textEnd; x++) {
-			TextPiece piece = _text.get(x);
-
-			// Figure out where in this piece the text
-			// we're after lives
-			int rStart = 0;
-			int rEnd = piece.characterLength();
-			if (_start > piece.getStart()) {
-				rStart = _start - piece.getStart();
-			}
-			if (_end < piece.getEnd()) {
-				rEnd -= (piece.getEnd() - _end);
-			}
-
-			// Luckily TextPieces work in characters, so we don't
-			// need to worry about unicode here
-			sb.append(piece.substring(rStart, rEnd));
-		}
-		return sb.toString();
+	    return _text.substring( _start, _end );
 	}

 	/**
@ -346,67 +309,52 @@ public class Range { // TODO -instantiable superclass
 		return _charEnd - _charStart;
 	}

-	/**
-	 * Inserts text into the front of this range.
-	 *
-	 * @param text
-	 *            The text to insert
-	 * @return The character run that text was inserted into.
-	 */
-	public CharacterRun insertBefore(String text)
-	// throws UnsupportedEncodingException
-	{
-		initAll();
+    /**
+     * Inserts text into the front of this range.
+     * 
+     * @param text
+     *            The text to insert
+     * @return The character run that text was inserted into.
+     */
+    public CharacterRun insertBefore( String text )
+    {
+        initAll();

-		TextPiece tp = _text.get(_textStart);
-		StringBuffer sb = tp.getStringBuffer();
+        _text.insert( _start, text );
+        _doc.getCharacterTable().adjustForInsert( _charStart, text.length() );
+        _doc.getParagraphTable().adjustForInsert( _parStart, text.length() );
+        _doc.getSectionTable().adjustForInsert( _sectionStart, text.length() );
+        adjustForInsert( text.length() );

-		// Since this is the first item in our list, it is safe to assume that
-		// _start >= tp.getStart()
-		int insertIndex = _start - tp.getStart();
-		sb.insert(insertIndex, text);
+        // update the FIB.CCPText + friends fields
+        adjustFIB( text.length() );

-		int adjustedLength = _doc.getTextTable().adjustForInsert(_textStart, text.length());
-		_doc.getCharacterTable().adjustForInsert(_charStart, adjustedLength);
-		_doc.getParagraphTable().adjustForInsert(_parStart, adjustedLength);
-		_doc.getSectionTable().adjustForInsert(_sectionStart, adjustedLength);
-		adjustForInsert(adjustedLength);
+        assert sanityCheck();

-		// update the FIB.CCPText + friends fields
-		adjustFIB(text.length());
+        return getCharacterRun( 0 );
+    }

-		return getCharacterRun(0);
-	}
+    /**
+     * Inserts text onto the end of this range
+     * 
+     * @param text
+     *            The text to insert
+     * @return The character run the text was inserted into.
+     */
+    public CharacterRun insertAfter( String text )
+    {
+        initAll();

-	/**
-	 * Inserts text onto the end of this range
-	 *
-	 * @param text
-	 *            The text to insert
-	 * @return The character run the text was inserted into.
-	 */
-	public CharacterRun insertAfter(String text) {
-		initAll();
+        _text.insert( _end, text );

-		int listIndex = _textEnd - 1;
-		TextPiece tp = _text.get(listIndex);
-		StringBuffer sb = tp.getStringBuffer();
+        _doc.getCharacterTable().adjustForInsert( _charEnd - 1, text.length() );
+        _doc.getParagraphTable().adjustForInsert( _parEnd - 1, text.length() );
+        _doc.getSectionTable().adjustForInsert( _sectionEnd - 1, text.length() );
+        adjustForInsert( text.length() );

-		int insertIndex = _end - tp.getStart();
-
-		if (tp.getStringBuffer().charAt(_end - 1) == '\r' && text.charAt(0) != '\u0007') {
-			insertIndex--;
-		}
-		sb.insert(insertIndex, text);
-		int adjustedLength = _doc.getTextTable().adjustForInsert(listIndex, text.length());
-		_doc.getCharacterTable().adjustForInsert(_charEnd - 1, adjustedLength);
-		_doc.getParagraphTable().adjustForInsert(_parEnd - 1, adjustedLength);
-		_doc.getSectionTable().adjustForInsert(_sectionEnd - 1, adjustedLength);
-		adjustForInsert(text.length());
-
-		return getCharacterRun(numCharacterRuns() - 1);
-
-	}
+        assert sanityCheck();
+        return getCharacterRun( numCharacterRuns() - 1 );
+    }

 	/**
 	 * Inserts text into the front of this range and it gives that text the
@ -580,7 +528,6 @@ public class Range { // TODO -instantiable superclass
 		int numSections = _sections.size();
 		int numRuns = _characters.size();
 		int numParagraphs = _paragraphs.size();
-		int numTextPieces = _text.size();

 		for (int x = _charStart; x < numRuns; x++) {
 			CHPX chpx = _characters.get(x);
@ -605,10 +552,12 @@ public class Range { // TODO -instantiable superclass
 			// + " -> " + sepx.getEnd());
 		}

-		for (int x = _textStart; x < numTextPieces; x++) {
-			TextPiece piece = _text.get(x);
-			piece.adjustForDelete(_start, _end - _start);
-		}
+        _text.delete( _start, _end );
+        Range parent = _parent.get();
+        if ( parent != null )
+        {
+            parent.adjustForInsert( -( _end - _start ) );
+        }

 		// update the FIB.CCPText + friends field
 		adjustFIB(-(_end - _start));
@ -623,7 +572,7 @@ public class Range { // TODO -instantiable superclass
 	 * @param rows
 	 *            The number of rows.
 	 * @return The empty Table that is now part of the document.
-     * @deprecated Use code shall not work with {@link ParagraphProperties}
+     * @deprecated User code shall not work with {@link TableProperties}
 	 */
 	@Deprecated
 	public Table insertBefore(TableProperties props, int rows) {
@ -631,19 +580,28 @@ public class Range { // TODO -instantiable superclass
 		parProps.setFInTable(true);
 		parProps.setItap( 1 );

+		final int oldEnd = this._end;
+		
 		int columns = props.getItcMac();
-		for (int x = 0; x < rows; x++) {
-			Paragraph cell = this.insertBefore(parProps, StyleSheet.NIL_STYLE);
-			cell.insertAfter(String.valueOf('\u0007'));
-			for (int y = 1; y < columns; y++) {
-				cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE);
-				cell.insertAfter(String.valueOf('\u0007'));
-			}
-			cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE, String.valueOf('\u0007'));
-			cell.setTableRowEnd(props);
-		}
-		return new Table(_start, _start + (rows * (columns + 1)) * 2, this, 1);
-	}
+        for ( int x = 0; x < rows; x++ )
+        {
+            Paragraph cell = this.insertBefore( parProps, StyleSheet.NIL_STYLE );
+            cell.insertAfter( String.valueOf( '\u0007' ) );
+            for ( int y = 1; y < columns; y++ )
+            {
+                cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE );
+                cell.insertAfter( String.valueOf( '\u0007' ) );
+            }
+            cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE,
+                    String.valueOf( '\u0007' ) );
+            cell.setTableRowEnd( props );
+        }
+
+        final int newEnd = this._end;
+        final int diff = newEnd - oldEnd;
+
+        return new Table( _start, _start + diff, this, 1 );
+    }

 	/**
 	 * Inserts a list into the beginning of this range.
@ -715,23 +673,14 @@ public class Range { // TODO -instantiable superclass
 	 */
 	public void replaceText(String pPlaceHolder, String pValue, int pOffset) {
 		int absPlaceHolderIndex = getStartOffset() + pOffset;
+
 		Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder
-				.length()), getDocument());
-
-		// this Range isn't a proper parent of the subRange() so we'll have to
-		// keep
-		// track of an updated endOffset on our own
-		int previousEndOffset = subRange.getEndOffset();
-
+				.length()), this);
 		subRange.insertBefore(pValue);

-		if (subRange.getEndOffset() != previousEndOffset) {
-			adjustForInsert(subRange.getEndOffset() - previousEndOffset);
-		}
-
 		// re-create the sub-range so we can delete it
 		subRange = new Range((absPlaceHolderIndex + pValue.length()), (absPlaceHolderIndex
-				+ pPlaceHolder.length() + pValue.length()), getDocument());
+				+ pPlaceHolder.length() + pValue.length()), this);

 		// deletes are automagically propagated
 		subRange.delete();
@ -921,7 +870,6 @@ public class Range { // TODO -instantiable superclass
 	 * loads all of the list indexes.
 	 */
 	protected void initAll() {
-		initText();
 		initCharacterRuns();
 		initParagraphs();
 		initSections();
@ -951,18 +899,6 @@ public class Range { // TODO -instantiable superclass
 		}
 	}

-	/**
-	 * inits the text piece list indexes.
-	 */
-	private void initText() {
-		if (!_textRangeFound) {
-			int[] point = findRange(_text, _textStart, _start, _end);
-			_textStart = point[0];
-			_textEnd = point[1];
-			_textRangeFound = true;
-		}
-	}
-
 	/**
 	 * inits the section list indexes.
 	 */
@ -1038,7 +974,6 @@ public class Range { // TODO -instantiable superclass
 	 * resets the list indexes.
 	 */
 	protected void reset() {
-		_textRangeFound = false;
 		_charRangeFound = false;
 		_parRangeFound = false;
 		_sectionRangeFound = false;
@ -1153,8 +1088,19 @@ public class Range { // TODO -instantiable superclass
     * Method for debug purposes. Checks that all resolved elements are inside
     * of current range.
     */
-    public void sanityCheck()
+    public boolean sanityCheck()
    {
+        if ( _start < 0 )
+            throw new AssertionError();
+        if ( _start >= _text.length() )
+            throw new AssertionError();
+        if ( _end < 0 )
+            throw new AssertionError();
+        if ( _end > _text.length() )
+            throw new AssertionError();
+        if ( _start > _end )
+            throw new AssertionError();
+
        if ( _charRangeFound )
        {
            for ( int c = _charStart; c < _charEnd; c++ )
@ -1181,5 +1127,7 @@ public class Range { // TODO -instantiable superclass
                    throw new AssertionError();
            }
        }
+
+        return true;
    }
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
@ -17,16 +17,13 @@

 package org.apache.poi.hwpf.extractor;

-import java.util.Iterator;
+import junit.framework.TestCase;

 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
-import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;

-import junit.framework.TestCase;
-
 /**
 * Test the different routes to extracting text
 *
@ -78,24 +75,10 @@ public final class TestDifferentRoutes extends TestCase {
 	 * Test textPieces based extraction
 	 */
 	public void testExtractFromTextPieces() throws Exception {
-		StringBuffer textBuf = new StringBuffer();
-
-		Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
-		while (textPieces.hasNext()) {
-			TextPiece piece = (TextPiece) textPieces.next();
-
-			String encoding = "Cp1252";
-			if (piece.isUnicode()) {
-				encoding = "UTF-16LE";
-			}
-			String text = new String(piece.getRawBytes(), encoding);
-			textBuf.append(text);
-		}
-
 		StringBuffer exp = new StringBuffer();
 		for (int i = 0; i < p_text.length; i++) {
 			exp.append(p_text[i]);
 		}
-		assertEquals(exp.toString(), textBuf.toString());
+		assertEquals(exp.toString(), doc.getDocumentText());
 	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java
@ -53,7 +53,7 @@ public final class TestPAPBinTable extends TestCase

        HWPFFileSystem fileSys = new HWPFFileSystem();

-        _pAPBinTable.writeTo( fileSys, 0 );
+        _pAPBinTable.writeTo( fileSys, 0, fakeTPT );
        ByteArrayOutputStream tableOut = fileSys.getStream( "1Table" );
        ByteArrayOutputStream mainOut = fileSys.getStream( "WordDocument" );

--- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
@ -169,6 +169,7 @@ public final class TestTextPieceTable extends TestCase {
    throws Exception
  {
    super.setUp();
+    System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.TRUE.toString() );

    _hWPFDocFixture = new HWPFDocFixture(this, HWPFDocFixture.DEFAULT_TEST_FILE);
    _hWPFDocFixture.setUp();
@ -178,8 +179,9 @@ public final class TestTextPieceTable extends TestCase {
    throws Exception
  {
    _hWPFDocFixture.tearDown();
-
    _hWPFDocFixture = null;
+
+    System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.FALSE.toString() );
    super.tearDown();
  }

--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
@ -103,10 +103,6 @@ public final class TestProblems extends HWPFTestCase {
      assertEquals("One paragraph is ok\7", r.getParagraph(3).text());
      assertEquals("\7", r.getParagraph(4).text());
      assertEquals("\r", r.getParagraph(5).text());
-      for(int i=0; i<=5; i++) {
-         assertFalse(r.getParagraph(i).usesUnicode());
-      }
-

      // Get the table
      Table t = r.getTable(p);
@ -304,9 +300,6 @@ public final class TestProblems extends HWPFTestCase {
      assertEquals("Row 3/Cell 3\u0007", r.getParagraph(10).text());
      assertEquals("\u0007", r.getParagraph(11).text());
      assertEquals("\r", r.getParagraph(12).text());
-      for(int i=0; i<=12; i++) {
-         assertFalse(r.getParagraph(i).usesUnicode());
-      }

      Paragraph p;

@ -791,7 +784,9 @@ public final class TestProblems extends HWPFTestCase {
            Paragraph actParagraph = actual.getParagraph( p );

            assertEquals( expParagraph.text(), actParagraph.text() );
-            assertEquals( expParagraph.isInTable(), actParagraph.isInTable() );
+            assertEquals( "Diffent isInTable flags for paragraphs #" + p
+                    + " -- " + expParagraph + " -- " + actParagraph + ".",
+                    expParagraph.isInTable(), actParagraph.isInTable() );
            assertEquals( expParagraph.isTableRowEnd(),
                    actParagraph.isTableRowEnd() );

--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java
@ -150,6 +150,8 @@ public final class TestRangeDelete extends TestCase {
 		assertEquals(searchText, subRange.text());

 		subRange.delete();
+		daDoc.getOverallRange().sanityCheck();
+		daDoc.getRange().sanityCheck();

 		// we need to let the model re-calculate the Range before we evaluate it
 		range = daDoc.getRange();
@ -166,6 +168,7 @@ public final class TestRangeDelete extends TestCase {
 		// this can lead to a StringBufferOutOfBoundsException, so we will add it
 		// even though we don't have an assertion for it
 		Range daRange = daDoc.getRange();
+		daRange.sanityCheck();
 		daRange.text();
 	}