diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 296136ccc9..72843973fb 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -76,6 +76,9 @@ import org.apache.poi.util.Internal; */ public final class HWPFDocument extends HWPFDocumentCore { + private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables"; + private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable"; + /** And for making sense of CP lengths in the FIB */ @Deprecated protected CPSplitCalculator _cpSplit; @@ -267,20 +270,43 @@ public final class HWPFDocument extends HWPFDocumentCore _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt); _text = _tpt.getText(); - _cbt.rebuild( _cft ); - _pbt.rebuild( _text, _cft ); - boolean preserve = false; + /* + * in this mode we preserving PAPX/CHPX structure from file, so text may + * miss from output, and text order may be corrupted + */ + boolean preserveBinTables = false; try { - preserve = Boolean.parseBoolean( System - .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) ); + preserveBinTables = Boolean.parseBoolean( System + .getProperty( PROPERTY_PRESERVE_BIN_TABLES ) ); } catch ( Exception exc ) { // ignore; } - if ( !preserve ) + + if ( !preserveBinTables ) + { + _cbt.rebuild( _cft ); + _pbt.rebuild( _text, _cft ); + } + + /* + * Property to disable text rebuilding. In this mode changing the text + * will lead to unpredictable behavior + */ + boolean preserveTextTable = false; + try + { + preserveTextTable = Boolean.parseBoolean( System + .getProperty( PROPERTY_PRESERVE_TEXT_TABLE ) ); + } + catch ( Exception exc ) + { + // ignore; + } + if ( !preserveTextTable ) { _cft = new ComplexFileTable(); _tpt = _cft.getTextPieceTable(); @@ -350,11 +376,13 @@ public final class HWPFDocument extends HWPFDocumentCore _fields = new FieldsImpl(_fieldsTables); } + @Internal public TextPieceTable getTextTable() { return _cft.getTextPieceTable(); } + @Internal @Override public StringBuilder getText() { @@ -497,6 +525,7 @@ public final class HWPFDocument extends HWPFDocumentCore * * @return the saved-by table. */ + @Internal public SavedByTable getSavedByTable() { return _sbt; @@ -507,6 +536,7 @@ public final class HWPFDocument extends HWPFDocumentCore * * @return the saved-by table. */ + @Internal public RevisionMarkAuthorTable getRevisionMarkAuthorTable() { return _rmat; @@ -519,6 +549,7 @@ public final class HWPFDocument extends HWPFDocumentCore return _pictures; } + @Internal public EscherRecordHolder getEscherRecordHolder() { return _escherRecordHolder; } @@ -529,6 +560,7 @@ public final class HWPFDocument extends HWPFDocumentCore * @deprecated use {@link #getOfficeDrawingsMain()} instead */ @Deprecated + @Internal public ShapesTable getShapesTable() { return _officeArts; @@ -908,10 +940,12 @@ public final class HWPFDocument extends HWPFDocumentCore this._dataStream = dataBuf; } + @Internal public byte[] getDataStream() { return _dataStream; } + @Internal public byte[] getTableStream() { return _tableStream; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java b/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java index 8328fe9c66..028dbf4ef8 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java @@ -23,7 +23,9 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -35,7 +37,10 @@ import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.FieldsDocumentPart; import org.apache.poi.hwpf.model.FileInformationBlock; +import org.apache.poi.hwpf.model.GenericPropertyNode; +import org.apache.poi.hwpf.model.PAPFormattedDiskPage; import org.apache.poi.hwpf.model.PAPX; +import org.apache.poi.hwpf.model.PlexOfCps; import org.apache.poi.hwpf.model.StyleSheet; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.sprm.SprmIterator; @@ -47,9 +52,11 @@ import org.apache.poi.hwpf.usermodel.OfficeDrawing; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.Beta; import org.apache.poi.util.IOUtils; +import org.apache.poi.util.LittleEndian; /** * Used by developers to list out key information on a HWPF file. End users will @@ -94,16 +101,14 @@ public final class HWPFLister if ( args.length == 0 ) { System.err.println( "Use:" ); - System.err - .println( "\tHWPFLister \n" - + "\t\t[--textPieces] [--textPiecesText]\n" - + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n" - + "\t\t[--papx] [--papxProperties]\n" - + "\t\t[--paragraphs] [--paragraphsSprms] [--paragraphsText]\n" - + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n" - + "\t\t[--fields]\n" + "\t\t[--pictures]\n" - + "\t\t[--officeDrawings]\n" - + "\t\t[--writereadback]\n" ); + System.err.println( "\tHWPFLister \n" + + "\t\t[--textPieces] [--textPiecesText]\n" + + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n" + + "\t\t[--papx] [--papxProperties] [--papxSprms]\n" + + "\t\t[--paragraphs] [--paragraphsText]\n" + + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n" + + "\t\t[--fields]\n" + "\t\t[--pictures]\n" + + "\t\t[--officeDrawings]\n" + "\t\t[--writereadback]\n" ); System.exit( 1 ); } @@ -115,10 +120,10 @@ public final class HWPFLister boolean outputChpxSprms = false; boolean outputParagraphs = false; - boolean outputParagraphsSprms = false; boolean outputParagraphsText = false; boolean outputPapx = false; + boolean outputPapxSprms = false; boolean outputPapxProperties = false; boolean outputBookmarks = false; @@ -145,8 +150,6 @@ public final class HWPFLister if ( "--paragraphs".equals( arg ) ) outputParagraphs = true; - if ( "--paragraphsSprms".equals( arg ) ) - outputParagraphsSprms = true; if ( "--paragraphsText".equals( arg ) ) outputParagraphsText = true; @@ -154,6 +157,8 @@ public final class HWPFLister outputPapx = true; if ( "--papxProperties".equals( arg ) ) outputPapxProperties = true; + if ( "--papxSprms".equals( arg ) ) + outputPapxSprms = true; if ( "--bookmarks".equals( arg ) ) outputBookmarks = true; @@ -174,65 +179,85 @@ public final class HWPFLister if ( writereadback ) doc = writeOutAndReadBack( doc ); - HWPFLister lister = new HWPFLister( doc ); - lister.dumpFIB(); + HWPFDocumentCore original; + { + System.setProperty( "org.apache.poi.hwpf.preserveBinTables", + Boolean.TRUE.toString() ); + System.setProperty( "org.apache.poi.hwpf.preserveTextTable", + Boolean.TRUE.toString() ); + + original = loadDoc( new File( args[0] ) ); + if ( writereadback ) + original = writeOutAndReadBack( original ); + } + + HWPFLister listerOriginal = new HWPFLister( original ); + HWPFLister listerRebuilded = new HWPFLister( doc ); + + System.out.println( "== FIB (original) ==" ); + listerOriginal.dumpFIB(); if ( outputTextPieces ) { - System.out.println( "== Text pieces ==" ); - lister.dumpTextPieces( outputTextPiecesText ); + System.out.println( "== Text pieces (original) ==" ); + listerOriginal.dumpTextPieces( outputTextPiecesText ); } if ( outputChpx ) { - System.out.println( "== CHPX ==" ); - lister.dumpChpx( outputChpxProperties, outputChpxSprms ); + System.out.println( "== CHPX (original) ==" ); + listerOriginal.dumpChpx( outputChpxProperties, outputChpxSprms ); + + System.out.println( "== CHPX (rebuilded) ==" ); + listerRebuilded.dumpChpx( outputChpxProperties, outputChpxSprms ); } if ( outputPapx ) { - System.out.println( "== PAPX ==" ); - lister.dumpPapx( outputPapxProperties ); + System.out.println( "== PAPX (original) ==" ); + listerOriginal.dumpPapx( outputPapxProperties, outputPapxSprms ); + + System.out.println( "== PAPX (rebuilded) ==" ); + listerRebuilded.dumpPapx( outputPapxProperties, outputPapxSprms ); } if ( outputParagraphs ) { - System.out.println( "== Text paragraphs ==" ); - lister.dumpParagraphs( true ); + System.out.println( "== Text paragraphs (original) ==" ); + listerRebuilded.dumpParagraphs( true ); - System.out.println( "== DOM paragraphs ==" ); - lister.dumpParagraphsDom( outputParagraphsSprms, outputPapx, - outputParagraphsText ); + System.out.println( "== DOM paragraphs (rebuilded) ==" ); + listerRebuilded.dumpParagraphsDom( outputParagraphsText ); } if ( outputBookmarks ) { - System.out.println( "== BOOKMARKS ==" ); - lister.dumpBookmarks(); + System.out.println( "== BOOKMARKS (rebuilded) ==" ); + listerRebuilded.dumpBookmarks(); } if ( outputEscher ) { - System.out.println( "== ESCHER PROPERTIES ==" ); - lister.dumpEscher(); + System.out.println( "== ESCHER PROPERTIES (rebuilded) ==" ); + listerRebuilded.dumpEscher(); } if ( outputFields ) { - System.out.println( "== FIELDS ==" ); - lister.dumpFields(); + System.out.println( "== FIELDS (rebuilded) ==" ); + listerRebuilded.dumpFields(); } if ( outputOfficeDrawings ) { - System.out.println( "== OFFICE DRAWINGS ==" ); - lister.dumpOfficeDrawings(); + System.out.println( "== OFFICE DRAWINGS (rebuilded) ==" ); + listerRebuilded.dumpOfficeDrawings(); } if ( outputPictures ) { - System.out.println( "== PICTURES ==" ); - lister.dumpPictures(); + System.out.println( "== PICTURES (rebuilded) ==" ); + listerRebuilded.dumpPictures(); } } @@ -402,90 +427,94 @@ public final class HWPFLister } } - public void dumpPapx( boolean withProperties ) throws Exception + public void dumpPapx( boolean withProperties, boolean withSprms ) + throws Exception { - // if ( _doc instanceof HWPFDocument ) - // { - // System.out.println( "binary PAP pages " ); - // - // HWPFDocument doc = (HWPFDocument) _doc; - // - // java.lang.reflect.Field fMainStream = HWPFDocumentCore.class - // .getDeclaredField( "_mainStream" ); - // fMainStream.setAccessible( true ); - // byte[] mainStream = (byte[]) fMainStream.get( _doc ); - // - // PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc - // .getFileInformationBlock().getFcPlcfbtePapx(), doc - // .getFileInformationBlock().getLcbPlcfbtePapx(), 4 ); - // - // List papxs = new ArrayList(); - // - // int length = binTable.length(); - // for ( int x = 0; x < length; x++ ) - // { - // GenericPropertyNode node = binTable.getProperty( x ); - // - // int pageNum = LittleEndian.getInt( node.getBytes() ); - // int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE - // * pageNum; - // - // PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage( - // mainStream, doc.getDataStream(), pageOffset, - // doc.getTextTable() ); - // - // System.out.println( "* PFKP: " + pfkp ); - // - // for ( PAPX papx : pfkp.getPAPXs() ) - // { - // System.out.println( "** " + papx ); - // papxs.add( papx ); - // if ( papx != null && true ) - // { - // SprmIterator sprmIt = new SprmIterator( - // papx.getGrpprl(), 2 ); - // while ( sprmIt.hasNext() ) - // { - // SprmOperation sprm = sprmIt.next(); - // System.out.println( "*** " + sprm.toString() ); - // } - // } - // - // } - // } - // - // Collections.sort( papxs ); - // System.out.println( "* Sorted by END" ); - // for ( PAPX papx : papxs ) - // { - // System.out.println( "** " + papx ); - // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); - // while ( sprmIt.hasNext() ) - // { - // SprmOperation sprm = sprmIt.next(); - // System.out.println( "*** " + sprm.toString() ); - // } - // } - // } + if ( _doc instanceof HWPFDocument ) + { + System.out.println( "binary PAP pages " ); - // for ( PAPX papx : _doc.getParagraphTable().getParagraphs() ) - // { - // System.out.println( papx ); - // - // if ( withProperties ) - // System.out.println( papx.getParagraphProperties( _doc - // .getStyleSheet() ) ); - // - // if ( true ) - // { - // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); - // while ( sprmIt.hasNext() ) - // { - // SprmOperation sprm = sprmIt.next(); - // System.out.println( "\t" + sprm.toString() ); - // } - // } - // } + HWPFDocument doc = (HWPFDocument) _doc; + + java.lang.reflect.Field fMainStream = HWPFDocumentCore.class + .getDeclaredField( "_mainStream" ); + fMainStream.setAccessible( true ); + byte[] mainStream = (byte[]) fMainStream.get( _doc ); + + PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc + .getFileInformationBlock().getFcPlcfbtePapx(), doc + .getFileInformationBlock().getLcbPlcfbtePapx(), 4 ); + + List papxs = new ArrayList(); + + int length = binTable.length(); + for ( int x = 0; x < length; x++ ) + { + GenericPropertyNode node = binTable.getProperty( x ); + + int pageNum = LittleEndian.getInt( node.getBytes() ); + int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE + * pageNum; + + PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage( + mainStream, doc.getDataStream(), pageOffset, + doc.getTextTable() ); + + System.out.println( "* PFKP: " + pfkp ); + + for ( PAPX papx : pfkp.getPAPXs() ) + { + System.out.println( "** " + papx ); + papxs.add( papx ); + if ( papx != null && withSprms ) + { + SprmIterator sprmIt = new SprmIterator( + papx.getGrpprl(), 2 ); + while ( sprmIt.hasNext() ) + { + SprmOperation sprm = sprmIt.next(); + System.out.println( "*** " + sprm.toString() ); + } + } + + } + } + + Collections.sort( papxs ); + System.out.println( "* Sorted by END" ); + for ( PAPX papx : papxs ) + { + System.out.println( "** " + papx ); + if ( papx != null && withSprms ) + { + SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); + while ( sprmIt.hasNext() ) + { + SprmOperation sprm = sprmIt.next(); + System.out.println( "*** " + sprm.toString() ); + } + } + } + } + + for ( PAPX papx : _doc.getParagraphTable().getParagraphs() ) + { + System.out.println( papx ); + + if ( withProperties ) + System.out.println( papx.getParagraphProperties( _doc + .getStyleSheet() ) ); + + if ( true ) + { + SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); + while ( sprmIt.hasNext() ) + { + SprmOperation sprm = sprmIt.next(); + System.out.println( "\t" + sprm.toString() ); + } + } + } } public void dumpParagraphs( boolean dumpAssotiatedPapx ) @@ -526,8 +555,7 @@ public final class HWPFLister } } - public void dumpParagraphsDom( boolean withSprms, boolean withPapx, - boolean withText ) + public void dumpParagraphsDom( boolean withText ) { Range range = _doc.getOverallRange(); for ( int p = 0; p < range.numParagraphs(); p++ )