mirror of https://github.com/apache/poi.git
Replace ComplexFileTable with a single-element one right after load; replace the text piece table as well
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1150675 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
23d2678a0e
commit
4c724bf71c
|
@ -23,8 +23,6 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.model.BookmarksTables;
|
||||
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||
|
@ -40,6 +38,7 @@ import org.apache.poi.hwpf.model.NoteType;
|
|||
import org.apache.poi.hwpf.model.NotesTables;
|
||||
import org.apache.poi.hwpf.model.PAPBinTable;
|
||||
import org.apache.poi.hwpf.model.PicturesTable;
|
||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
|
||||
import org.apache.poi.hwpf.model.SavedByTable;
|
||||
import org.apache.poi.hwpf.model.SectionTable;
|
||||
|
@ -92,7 +91,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
* structure*/
|
||||
protected ComplexFileTable _cft;
|
||||
|
||||
protected TextPieceTable _tpt;
|
||||
protected final StringBuilder _text;
|
||||
|
||||
/** Holds the save history for this document. */
|
||||
protected SavedByTable _sbt;
|
||||
|
@ -139,6 +138,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
protected HWPFDocument()
{
    super();
    // Start the text buffer with a single carriage-return character
    // rather than leaving it empty.
    _text = new StringBuilder( "\r" );
}
|
||||
|
||||
/**
|
||||
|
@ -246,15 +246,35 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
// Start to load up our standard structures.
|
||||
_dop = new DocumentProperties(_tableStream, _fib.getFcDop());
|
||||
_cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
|
||||
_tpt = _cft.getTextPieceTable();
|
||||
TextPieceTable _tpt = _cft.getTextPieceTable();
|
||||
|
||||
// Now load the rest of the properties, which need to be adjusted
|
||||
// for where text really begin
|
||||
_cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt);
|
||||
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
|
||||
|
||||
_text = _tpt.getText();
|
||||
_cbt.rebuild( _cft );
|
||||
_pbt.rebuild( _dataStream, _cft );
|
||||
_pbt.rebuild( _text, _dataStream, _cft );
|
||||
|
||||
boolean preserve = false;
|
||||
try
|
||||
{
|
||||
preserve = Boolean.parseBoolean( System
|
||||
.getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
// ignore;
|
||||
}
|
||||
if ( !preserve )
|
||||
{
|
||||
_cft = new ComplexFileTable();
|
||||
_tpt = _cft.getTextPieceTable();
|
||||
_tpt.add( new TextPiece( 0, _text.length(), _text.toString()
|
||||
.getBytes( "UTF-16LE" ), new PieceDescriptor( new byte[8],
|
||||
0 ) ) );
|
||||
}
|
||||
|
||||
// Read FSPA and Escher information
|
||||
_fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
|
||||
|
@ -314,6 +334,12 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
return _cft.getTextPieceTable();
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder getText()
|
||||
{
|
||||
return _text;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public CPSplitCalculator getCPSplitCalculator()
|
||||
{
|
||||
|
@ -326,10 +352,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
}
|
||||
|
||||
public Range getOverallRange() {
|
||||
// hack to get the ending cp of the document, Have to revisit this.
|
||||
TextPiece p = _tpt.getTextPieces().get(_tpt.getTextPieces().size() - 1);
|
||||
|
||||
return new Range(0, p.getEnd(), this);
|
||||
return new Range(0, _text.length(), this);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -445,16 +468,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
*/
|
||||
public int characterLength()
|
||||
{
|
||||
List<TextPiece> textPieces = _tpt.getTextPieces();
|
||||
Iterator<TextPiece> textIt = textPieces.iterator();
|
||||
|
||||
int length = 0;
|
||||
while(textIt.hasNext())
|
||||
{
|
||||
TextPiece tp = textIt.next();
|
||||
length += tp.characterLength();
|
||||
}
|
||||
return length;
|
||||
return _text.length();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -643,7 +657,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
|
||||
// write out the PAPBinTable.
|
||||
_fib.setFcPlcfbtePapx(tableOffset);
|
||||
_pbt.writeTo(docSys, fcMin);
|
||||
_pbt.writeTo(docSys, fcMin, _cft.getTextPieceTable());
|
||||
_fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
|
||||
tableOffset = tableStream.getOffset();
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ import org.apache.poi.hwpf.usermodel.Range;
|
|||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.Internal;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -161,7 +162,19 @@ public abstract class HWPFDocumentCore extends POIDocument
|
|||
*/
|
||||
public abstract Range getOverallRange();
|
||||
|
||||
public abstract TextPieceTable getTextTable();
|
||||
/**
|
||||
* Returns document text, i.e. text information from all text pieces,
|
||||
* including OLE descriptions and field codes
|
||||
*/
|
||||
public String getDocumentText() {
|
||||
return getText().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method to access document text
|
||||
*/
|
||||
@Internal
|
||||
public abstract StringBuilder getText();
|
||||
|
||||
public CHPBinTable getCharacterTable()
|
||||
{
|
||||
|
@ -197,4 +210,6 @@ public abstract class HWPFDocumentCore extends POIDocument
|
|||
{
|
||||
return _fib;
|
||||
}
|
||||
|
||||
public abstract TextPieceTable getTextTable();
|
||||
}
|
||||
|
|
|
@ -38,6 +38,8 @@ import org.apache.poi.util.LittleEndian;
|
|||
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
private TextPieceTable tpt;
|
||||
|
||||
private StringBuilder _text;
|
||||
|
||||
/**
 * Opens an old-format document from a POIFS file system by delegating to
 * the constructor that takes the file system's root directory.
 *
 * @param fs the file system whose root is passed on
 * @throws IOException declared by the delegated constructor
 */
public HWPFOldDocument( POIFSFileSystem fs ) throws IOException
{
    this( fs.getRoot() );
}
|
||||
|
@ -88,13 +90,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
||||
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
||||
TextPiece tp = new TextPiece(
|
||||
0, textData.length, textData, pd, 0
|
||||
0, textData.length, textData, pd
|
||||
);
|
||||
tpt.add(tp);
|
||||
|
||||
text.append(tp.getStringBuffer());
|
||||
}
|
||||
|
||||
_text = tpt.getText();
|
||||
|
||||
// Now we can fetch the character and paragraph properties
|
||||
_cbt = new OldCHPBinTable(
|
||||
_mainStream, chpTableOffset, chpTableSize,
|
||||
|
@ -126,6 +130,12 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
return tpt;
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder getText()
|
||||
{
|
||||
return _text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(OutputStream out) throws IOException {
|
||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||
|
|
|
@ -23,9 +23,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -37,10 +35,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
|
|||
import org.apache.poi.hwpf.model.CHPX;
|
||||
import org.apache.poi.hwpf.model.FieldsDocumentPart;
|
||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
||||
import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
|
||||
import org.apache.poi.hwpf.model.PAPX;
|
||||
import org.apache.poi.hwpf.model.PlexOfCps;
|
||||
import org.apache.poi.hwpf.model.StyleSheet;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.sprm.SprmIterator;
|
||||
|
@ -51,10 +46,8 @@ import org.apache.poi.hwpf.usermodel.Field;
|
|||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Picture;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* Used by developers to list out key information on a HWPF file. End users will
|
||||
|
@ -241,13 +234,10 @@ public final class HWPFLister
|
|||
|
||||
private LinkedHashMap<Integer, String> paragraphs;
|
||||
|
||||
private String text;
|
||||
|
||||
public HWPFLister( HWPFDocumentCore doc )
|
||||
{
|
||||
_doc = doc;
|
||||
|
||||
buildText();
|
||||
buildParagraphs();
|
||||
}
|
||||
|
||||
|
@ -256,6 +246,7 @@ public final class HWPFLister
|
|||
paragraphs = new LinkedHashMap<Integer, String>();
|
||||
|
||||
StringBuilder part = new StringBuilder();
|
||||
String text = _doc.getDocumentText();
|
||||
for ( int charIndex = 0; charIndex < text.length(); charIndex++ )
|
||||
{
|
||||
char c = text.charAt( charIndex );
|
||||
|
@ -268,24 +259,6 @@ public final class HWPFLister
|
|||
}
|
||||
}
|
||||
|
||||
private void buildText()
|
||||
{
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for ( TextPiece textPiece : _doc.getTextTable().getTextPieces() )
|
||||
{
|
||||
String toAppend = textPiece.getStringBuffer().toString();
|
||||
|
||||
if ( toAppend.length() != ( textPiece.getEnd() - textPiece
|
||||
.getStart() ) )
|
||||
{
|
||||
throw new AssertionError();
|
||||
}
|
||||
|
||||
builder.replace( textPiece.getStart(), textPiece.getEnd(), toAppend );
|
||||
}
|
||||
this.text = builder.toString();
|
||||
}
|
||||
|
||||
private void dumpBookmarks()
|
||||
{
|
||||
if ( !( _doc instanceof HWPFDocument ) )
|
||||
|
@ -379,69 +352,69 @@ public final class HWPFLister
|
|||
|
||||
public void dumpPapx( boolean withProperties ) throws Exception
|
||||
{
|
||||
if ( _doc instanceof HWPFDocument )
|
||||
{
|
||||
System.out.println( "binary PAP pages " );
|
||||
|
||||
HWPFDocument doc = (HWPFDocument) _doc;
|
||||
|
||||
java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
|
||||
.getDeclaredField( "_mainStream" );
|
||||
fMainStream.setAccessible( true );
|
||||
byte[] mainStream = (byte[]) fMainStream.get( _doc );
|
||||
|
||||
PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
|
||||
.getFileInformationBlock().getFcPlcfbtePapx(), doc
|
||||
.getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
|
||||
|
||||
List<PAPX> papxs = new ArrayList<PAPX>();
|
||||
|
||||
int length = binTable.length();
|
||||
for ( int x = 0; x < length; x++ )
|
||||
{
|
||||
GenericPropertyNode node = binTable.getProperty( x );
|
||||
|
||||
int pageNum = LittleEndian.getInt( node.getBytes() );
|
||||
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
|
||||
* pageNum;
|
||||
|
||||
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
|
||||
mainStream, doc.getDataStream(), pageOffset,
|
||||
doc.getTextTable() );
|
||||
|
||||
System.out.println( "* PFKP: " + pfkp );
|
||||
|
||||
for ( PAPX papx : pfkp.getPAPXs() )
|
||||
{
|
||||
System.out.println( "** " + papx );
|
||||
papxs.add( papx );
|
||||
if ( papx != null && true )
|
||||
{
|
||||
SprmIterator sprmIt = new SprmIterator(
|
||||
papx.getGrpprl(), 2 );
|
||||
while ( sprmIt.hasNext() )
|
||||
{
|
||||
SprmOperation sprm = sprmIt.next();
|
||||
System.out.println( "*** " + sprm.toString() );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort( papxs );
|
||||
System.out.println( "* Sorted by END" );
|
||||
for ( PAPX papx : papxs )
|
||||
{
|
||||
System.out.println( "** " + papx );
|
||||
SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
|
||||
while ( sprmIt.hasNext() )
|
||||
{
|
||||
SprmOperation sprm = sprmIt.next();
|
||||
System.out.println( "*** " + sprm.toString() );
|
||||
}
|
||||
}
|
||||
}
|
||||
// if ( _doc instanceof HWPFDocument )
|
||||
// {
|
||||
// System.out.println( "binary PAP pages " );
|
||||
//
|
||||
// HWPFDocument doc = (HWPFDocument) _doc;
|
||||
//
|
||||
// java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
|
||||
// .getDeclaredField( "_mainStream" );
|
||||
// fMainStream.setAccessible( true );
|
||||
// byte[] mainStream = (byte[]) fMainStream.get( _doc );
|
||||
//
|
||||
// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
|
||||
// .getFileInformationBlock().getFcPlcfbtePapx(), doc
|
||||
// .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
|
||||
//
|
||||
// List<PAPX> papxs = new ArrayList<PAPX>();
|
||||
//
|
||||
// int length = binTable.length();
|
||||
// for ( int x = 0; x < length; x++ )
|
||||
// {
|
||||
// GenericPropertyNode node = binTable.getProperty( x );
|
||||
//
|
||||
// int pageNum = LittleEndian.getInt( node.getBytes() );
|
||||
// int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
|
||||
// * pageNum;
|
||||
//
|
||||
// PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
|
||||
// mainStream, doc.getDataStream(), pageOffset,
|
||||
// doc.getTextTable() );
|
||||
//
|
||||
// System.out.println( "* PFKP: " + pfkp );
|
||||
//
|
||||
// for ( PAPX papx : pfkp.getPAPXs() )
|
||||
// {
|
||||
// System.out.println( "** " + papx );
|
||||
// papxs.add( papx );
|
||||
// if ( papx != null && true )
|
||||
// {
|
||||
// SprmIterator sprmIt = new SprmIterator(
|
||||
// papx.getGrpprl(), 2 );
|
||||
// while ( sprmIt.hasNext() )
|
||||
// {
|
||||
// SprmOperation sprm = sprmIt.next();
|
||||
// System.out.println( "*** " + sprm.toString() );
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// Collections.sort( papxs );
|
||||
// System.out.println( "* Sorted by END" );
|
||||
// for ( PAPX papx : papxs )
|
||||
// {
|
||||
// System.out.println( "** " + papx );
|
||||
// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
|
||||
// while ( sprmIt.hasNext() )
|
||||
// {
|
||||
// SprmOperation sprm = sprmIt.next();
|
||||
// System.out.println( "*** " + sprm.toString() );
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
|
||||
// {
|
||||
|
|
|
@ -20,13 +20,11 @@ package org.apache.poi.hwpf.extractor;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.HeaderStories;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
|
@ -218,22 +216,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
|||
* mapping is broken. Fast too.
|
||||
*/
|
||||
public String getTextFromPieces() {
|
||||
StringBuffer textBuf = new StringBuffer();
|
||||
|
||||
for(TextPiece piece : doc.getTextTable().getTextPieces()) {
|
||||
String encoding = "Cp1252";
|
||||
if (piece.isUnicode()) {
|
||||
encoding = "UTF-16LE";
|
||||
}
|
||||
try {
|
||||
String text = new String(piece.getRawBytes(), encoding);
|
||||
textBuf.append(text);
|
||||
} catch(UnsupportedEncodingException e) {
|
||||
throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
|
||||
}
|
||||
}
|
||||
|
||||
String text = textBuf.toString();
|
||||
String text = doc.getDocumentText();
|
||||
|
||||
// Fix line endings (Note - won't get all of them
|
||||
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
|
||||
|
|
|
@ -179,34 +179,6 @@ public class CHPBinTable
|
|||
start = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
// rebuild document paragraphs structure
|
||||
StringBuilder docText = new StringBuilder();
|
||||
for ( TextPiece textPiece : tpt.getTextPieces() )
|
||||
{
|
||||
String toAppend = textPiece.getStringBuffer().toString();
|
||||
int toAppendLength = toAppend.length();
|
||||
|
||||
if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
|
||||
{
|
||||
logger.log(
|
||||
POILogger.WARN,
|
||||
"Text piece has boundaries [",
|
||||
Integer.valueOf( textPiece.getStart() ),
|
||||
"; ",
|
||||
Integer.valueOf( textPiece.getEnd() ),
|
||||
") but length ",
|
||||
Integer.valueOf( textPiece.getEnd()
|
||||
- textPiece.getStart() ) );
|
||||
}
|
||||
|
||||
docText.replace( textPiece.getStart(), textPiece.getStart()
|
||||
+ toAppendLength, toAppend );
|
||||
}
|
||||
logger.log( POILogger.DEBUG, "Document text rebuilded in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( docText.length() ), " chars)" );
|
||||
start = System.currentTimeMillis();
|
||||
|
||||
List<CHPX> oldChpxSortedByStartPos = new ArrayList<CHPX>( _textRuns );
|
||||
Collections.sort( oldChpxSortedByStartPos,
|
||||
PropertyNode.StartComparator.instance );
|
||||
|
|
|
@ -54,9 +54,6 @@ public class PAPBinTable
|
|||
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
|
||||
byte[] _dataStream;
|
||||
|
||||
/** So we can know if things are unicode or not */
|
||||
private TextPieceTable tpt;
|
||||
|
||||
public PAPBinTable()
|
||||
{
|
||||
}
|
||||
|
@ -81,7 +78,6 @@ public class PAPBinTable
|
|||
|
||||
{
|
||||
PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
|
||||
this.tpt = tpt;
|
||||
|
||||
int length = binTable.length();
|
||||
for ( int x = 0; x < length; x++ )
|
||||
|
@ -112,7 +108,8 @@ public class PAPBinTable
|
|||
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
||||
}
|
||||
|
||||
public void rebuild( byte[] dataStream, ComplexFileTable complexFileTable )
|
||||
public void rebuild( final StringBuilder docText, byte[] dataStream,
|
||||
ComplexFileTable complexFileTable )
|
||||
{
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
|
@ -121,7 +118,8 @@ public class PAPBinTable
|
|||
SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
|
||||
|
||||
// adding PAPX from fast-saved SPRMs
|
||||
for ( TextPiece textPiece : tpt.getTextPieces() )
|
||||
for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
|
||||
.getTextPieces() )
|
||||
{
|
||||
PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
|
||||
if ( !prm.isComplex() )
|
||||
|
@ -167,34 +165,6 @@ public class PAPBinTable
|
|||
start = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
// rebuild document paragraphs structure
|
||||
StringBuilder docText = new StringBuilder();
|
||||
for ( TextPiece textPiece : tpt.getTextPieces() )
|
||||
{
|
||||
String toAppend = textPiece.getStringBuffer().toString();
|
||||
int toAppendLength = toAppend.length();
|
||||
|
||||
if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
|
||||
{
|
||||
logger.log(
|
||||
POILogger.WARN,
|
||||
"Text piece has boundaries [",
|
||||
Integer.valueOf( textPiece.getStart() ),
|
||||
"; ",
|
||||
Integer.valueOf( textPiece.getEnd() ),
|
||||
") but length ",
|
||||
Integer.valueOf( textPiece.getEnd()
|
||||
- textPiece.getStart() ) );
|
||||
}
|
||||
|
||||
docText.replace( textPiece.getStart(), textPiece.getStart()
|
||||
+ toAppendLength, toAppend );
|
||||
}
|
||||
logger.log( POILogger.DEBUG, "Document text rebuilded in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( docText.length() ), " chars)" );
|
||||
start = System.currentTimeMillis();
|
||||
|
||||
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
|
||||
Collections.sort( oldPapxSortedByEndPos,
|
||||
PropertyNode.EndComparator.instance );
|
||||
|
@ -274,7 +244,8 @@ public class PAPBinTable
|
|||
{
|
||||
// can we reuse existing?
|
||||
PAPX existing = papxs.get( 0 );
|
||||
if ( existing.getStart() == startInclusive && existing.getEnd() == endExclusive )
|
||||
if ( existing.getStart() == startInclusive
|
||||
&& existing.getEnd() == endExclusive )
|
||||
{
|
||||
newPapxs.add( existing );
|
||||
lastParStart = endExclusive;
|
||||
|
@ -311,7 +282,8 @@ public class PAPBinTable
|
|||
this._paragraphs = new ArrayList<PAPX>( newPapxs );
|
||||
|
||||
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms" );
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
||||
start = System.currentTimeMillis();
|
||||
|
||||
_dataStream = dataStream;
|
||||
|
@ -320,7 +292,7 @@ public class PAPBinTable
|
|||
public void insert(int listIndex, int cpStart, SprmBuffer buf)
|
||||
{
|
||||
|
||||
PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
|
||||
PAPX forInsert = new PAPX(0, 0, buf, _dataStream);
|
||||
|
||||
// Ensure character offsets are really characters
|
||||
forInsert.setStart(cpStart);
|
||||
|
@ -350,7 +322,7 @@ public class PAPBinTable
|
|||
// Original, until insert at point
|
||||
// New one
|
||||
// Clone of original, on to the old end
|
||||
PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
|
||||
PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream);
|
||||
// Again ensure contains character based offsets no matter what
|
||||
clone.setStart(cpStart);
|
||||
clone.setEnd(currentPap.getEnd());
|
||||
|
@ -427,8 +399,7 @@ public class PAPBinTable
|
|||
return _paragraphs;
|
||||
}
|
||||
|
||||
public void writeTo(HWPFFileSystem sys, int fcMin)
|
||||
throws IOException
|
||||
public void writeTo( HWPFFileSystem sys, int fcMin, CharIndexTranslator translator ) throws IOException
|
||||
{
|
||||
|
||||
HWPFOutputStream docStream = sys.getStream("WordDocument");
|
||||
|
@ -463,7 +434,7 @@ public class PAPBinTable
|
|||
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(_dataStream);
|
||||
pfkp.fill(overflow);
|
||||
|
||||
byte[] bufFkp = pfkp.toByteArray(tpt, fcMin);
|
||||
byte[] bufFkp = pfkp.toByteArray(translator, fcMin);
|
||||
docStream.write(bufFkp);
|
||||
overflow = pfkp.getOverflow();
|
||||
|
||||
|
|
|
@ -19,6 +19,9 @@ package org.apache.poi.hwpf.model;
|
|||
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
|
||||
/**
|
||||
* Lightweight representation of a text piece.
|
||||
* Works in the character domain, not the byte domain, so you
|
||||
|
@ -27,7 +30,7 @@ import java.io.UnsupportedEncodingException;
|
|||
*
|
||||
* @author Ryan Ackley
|
||||
*/
|
||||
|
||||
@Internal
|
||||
public final class TextPiece extends PropertyNode<TextPiece>
|
||||
{
|
||||
private boolean _usesUnicode;
|
||||
|
@ -35,11 +38,31 @@ public final class TextPiece extends PropertyNode<TextPiece>
|
|||
private PieceDescriptor _pd;
|
||||
|
||||
/**
|
||||
* @param start Beginning offset in main document stream, in characters.
|
||||
* @param end Ending offset in main document stream, in characters.
|
||||
* @param text The raw bytes of our text
|
||||
* @param start
|
||||
* Beginning offset in main document stream, in characters.
|
||||
* @param end
|
||||
* Ending offset in main document stream, in characters.
|
||||
* @param text
|
||||
* The raw bytes of our text
|
||||
* @deprecated Use {@link #TextPiece(int,int,byte[],PieceDescriptor)}
|
||||
* instead
|
||||
*/
|
||||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
|
||||
public TextPiece( int start, int end, byte[] text, PieceDescriptor pd,
|
||||
int cpStart )
|
||||
{
|
||||
this( start, end, text, pd );
|
||||
}
|
||||
|
||||
/**
|
||||
* @param start
|
||||
* Beginning offset in main document stream, in characters.
|
||||
* @param end
|
||||
* Ending offset in main document stream, in characters.
|
||||
* @param text
|
||||
* The raw bytes of our text
|
||||
*/
|
||||
public TextPiece( int start, int end, byte[] text, PieceDescriptor pd )
|
||||
{
|
||||
super(start, end, buildInitSB(text, pd));
|
||||
_usesUnicode = pd.isUnicode();
|
||||
_pd = pd;
|
||||
|
|
|
@ -24,6 +24,8 @@ import java.util.List;
|
|||
|
||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* The piece table for matching up character positions to bits of text. This
|
||||
|
@ -34,6 +36,9 @@ import org.apache.poi.poifs.common.POIFSConstants;
|
|||
*/
|
||||
public class TextPieceTable implements CharIndexTranslator
|
||||
{
|
||||
private static final POILogger logger = POILogFactory
|
||||
.getLogger( TextPieceTable.class );
|
||||
|
||||
// int _multiple;
|
||||
int _cpMin;
|
||||
protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
|
||||
|
@ -101,7 +106,7 @@ public class TextPieceTable implements CharIndexTranslator
|
|||
|
||||
// And now build the piece
|
||||
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
|
||||
pieces[x], node.getStart() ) );
|
||||
pieces[x] ) );
|
||||
}
|
||||
|
||||
// In the interest of our sanity, now sort the text pieces
|
||||
|
@ -251,6 +256,41 @@ public class TextPieceTable implements CharIndexTranslator
|
|||
return _cpMin;
|
||||
}
|
||||
|
||||
public StringBuilder getText()
|
||||
{
|
||||
final long start = System.currentTimeMillis();
|
||||
|
||||
// rebuild document paragraphs structure
|
||||
StringBuilder docText = new StringBuilder();
|
||||
for ( TextPiece textPiece : _textPieces )
|
||||
{
|
||||
String toAppend = textPiece.getStringBuffer().toString();
|
||||
int toAppendLength = toAppend.length();
|
||||
|
||||
if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
|
||||
{
|
||||
logger.log(
|
||||
POILogger.WARN,
|
||||
"Text piece has boundaries [",
|
||||
Integer.valueOf( textPiece.getStart() ),
|
||||
"; ",
|
||||
Integer.valueOf( textPiece.getEnd() ),
|
||||
") but length ",
|
||||
Integer.valueOf( textPiece.getEnd()
|
||||
- textPiece.getStart() ) );
|
||||
}
|
||||
|
||||
docText.replace( textPiece.getStart(), textPiece.getStart()
|
||||
+ toAppendLength, toAppend );
|
||||
}
|
||||
|
||||
logger.log( POILogger.DEBUG, "Document text were rebuilded in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( docText.length() ), " chars)" );
|
||||
|
||||
return docText;
|
||||
}
|
||||
|
||||
public List<TextPiece> getTextPieces()
|
||||
{
|
||||
return _textPieces;
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.apache.poi.hwpf.model.PropertyNode;
|
|||
import org.apache.poi.hwpf.model.SEPX;
|
||||
import org.apache.poi.hwpf.model.StyleSheet;
|
||||
import org.apache.poi.hwpf.model.SubdocumentType;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
|
||||
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
|
||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
|
@ -108,17 +108,7 @@ public class Range { // TODO -instantiable superclass
|
|||
/** The end index in the characterRuns list for this Range. */
|
||||
protected int _charEnd;
|
||||
|
||||
/** Have we loaded the Text indexes yet */
|
||||
protected boolean _textRangeFound;
|
||||
|
||||
/** All text pieces that belong to the document this Range belongs to. */
|
||||
protected List<TextPiece> _text;
|
||||
|
||||
/** The start index in the text list for this Range. */
|
||||
protected int _textStart;
|
||||
|
||||
/** The end index in the text list for this Range. */
|
||||
protected int _textEnd;
|
||||
protected StringBuilder _text;
|
||||
|
||||
// protected Range()
|
||||
// {
|
||||
|
@ -144,7 +134,7 @@ public class Range { // TODO -instantiable superclass
|
|||
_sections = _doc.getSectionTable().getSections();
|
||||
_paragraphs = _doc.getParagraphTable().getParagraphs();
|
||||
_characters = _doc.getCharacterTable().getTextRuns();
|
||||
_text = _doc.getTextTable().getTextPieces();
|
||||
_text = _doc.getText();
|
||||
_parent = new WeakReference<Range>(null);
|
||||
|
||||
sanityCheckStartEnd();
|
||||
|
@ -171,6 +161,7 @@ public class Range { // TODO -instantiable superclass
|
|||
_parent = new WeakReference<Range>(parent);
|
||||
|
||||
sanityCheckStartEnd();
|
||||
assert sanityCheck();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -213,52 +204,24 @@ public class Range { // TODO -instantiable superclass
|
|||
}
|
||||
|
||||
/**
|
||||
* Does any <code>TextPiece</code> in this Range use unicode?
|
||||
*
|
||||
* @return true if it does and false if it doesn't
|
||||
* @return always return true
|
||||
* @deprecated Range is not linked to any text piece anymore, so to check if
|
||||
* unicode is used please access {@link TextPieceTable} during
|
||||
* document load time
|
||||
*/
|
||||
public boolean usesUnicode() {
|
||||
|
||||
initText();
|
||||
|
||||
for (int i = _textStart; i < _textEnd; i++) {
|
||||
TextPiece piece = _text.get(i);
|
||||
if (piece.isUnicode())
|
||||
@Deprecated
|
||||
public boolean usesUnicode()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the text that this Range contains.
|
||||
*
|
||||
* @return The text for this range.
|
||||
*/
|
||||
public String text() {
|
||||
initText();
|
||||
|
||||
StringBuffer sb = new StringBuffer();
|
||||
|
||||
for (int x = _textStart; x < _textEnd; x++) {
|
||||
TextPiece piece = _text.get(x);
|
||||
|
||||
// Figure out where in this piece the text
|
||||
// we're after lives
|
||||
int rStart = 0;
|
||||
int rEnd = piece.characterLength();
|
||||
if (_start > piece.getStart()) {
|
||||
rStart = _start - piece.getStart();
|
||||
}
|
||||
if (_end < piece.getEnd()) {
|
||||
rEnd -= (piece.getEnd() - _end);
|
||||
}
|
||||
|
||||
// Luckily TextPieces work in characters, so we don't
|
||||
// need to worry about unicode here
|
||||
sb.append(piece.substring(rStart, rEnd));
|
||||
}
|
||||
return sb.toString();
|
||||
return _text.substring( _start, _end );
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -354,27 +317,20 @@ public class Range { // TODO -instantiable superclass
|
|||
* @return The character run that text was inserted into.
|
||||
*/
|
||||
public CharacterRun insertBefore( String text )
|
||||
// throws UnsupportedEncodingException
|
||||
{
|
||||
initAll();
|
||||
|
||||
TextPiece tp = _text.get(_textStart);
|
||||
StringBuffer sb = tp.getStringBuffer();
|
||||
|
||||
// Since this is the first item in our list, it is safe to assume that
|
||||
// _start >= tp.getStart()
|
||||
int insertIndex = _start - tp.getStart();
|
||||
sb.insert(insertIndex, text);
|
||||
|
||||
int adjustedLength = _doc.getTextTable().adjustForInsert(_textStart, text.length());
|
||||
_doc.getCharacterTable().adjustForInsert(_charStart, adjustedLength);
|
||||
_doc.getParagraphTable().adjustForInsert(_parStart, adjustedLength);
|
||||
_doc.getSectionTable().adjustForInsert(_sectionStart, adjustedLength);
|
||||
adjustForInsert(adjustedLength);
|
||||
_text.insert( _start, text );
|
||||
_doc.getCharacterTable().adjustForInsert( _charStart, text.length() );
|
||||
_doc.getParagraphTable().adjustForInsert( _parStart, text.length() );
|
||||
_doc.getSectionTable().adjustForInsert( _sectionStart, text.length() );
|
||||
adjustForInsert( text.length() );
|
||||
|
||||
// update the FIB.CCPText + friends fields
|
||||
adjustFIB( text.length() );
|
||||
|
||||
assert sanityCheck();
|
||||
|
||||
return getCharacterRun( 0 );
|
||||
}
|
||||
|
||||
|
@ -385,27 +341,19 @@ public class Range { // TODO -instantiable superclass
|
|||
* The text to insert
|
||||
* @return The character run the text was inserted into.
|
||||
*/
|
||||
public CharacterRun insertAfter(String text) {
|
||||
public CharacterRun insertAfter( String text )
|
||||
{
|
||||
initAll();
|
||||
|
||||
int listIndex = _textEnd - 1;
|
||||
TextPiece tp = _text.get(listIndex);
|
||||
StringBuffer sb = tp.getStringBuffer();
|
||||
_text.insert( _end, text );
|
||||
|
||||
int insertIndex = _end - tp.getStart();
|
||||
|
||||
if (tp.getStringBuffer().charAt(_end - 1) == '\r' && text.charAt(0) != '\u0007') {
|
||||
insertIndex--;
|
||||
}
|
||||
sb.insert(insertIndex, text);
|
||||
int adjustedLength = _doc.getTextTable().adjustForInsert(listIndex, text.length());
|
||||
_doc.getCharacterTable().adjustForInsert(_charEnd - 1, adjustedLength);
|
||||
_doc.getParagraphTable().adjustForInsert(_parEnd - 1, adjustedLength);
|
||||
_doc.getSectionTable().adjustForInsert(_sectionEnd - 1, adjustedLength);
|
||||
_doc.getCharacterTable().adjustForInsert( _charEnd - 1, text.length() );
|
||||
_doc.getParagraphTable().adjustForInsert( _parEnd - 1, text.length() );
|
||||
_doc.getSectionTable().adjustForInsert( _sectionEnd - 1, text.length() );
|
||||
adjustForInsert( text.length() );
|
||||
|
||||
assert sanityCheck();
|
||||
return getCharacterRun( numCharacterRuns() - 1 );
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -580,7 +528,6 @@ public class Range { // TODO -instantiable superclass
|
|||
int numSections = _sections.size();
|
||||
int numRuns = _characters.size();
|
||||
int numParagraphs = _paragraphs.size();
|
||||
int numTextPieces = _text.size();
|
||||
|
||||
for (int x = _charStart; x < numRuns; x++) {
|
||||
CHPX chpx = _characters.get(x);
|
||||
|
@ -605,9 +552,11 @@ public class Range { // TODO -instantiable superclass
|
|||
// + " -> " + sepx.getEnd());
|
||||
}
|
||||
|
||||
for (int x = _textStart; x < numTextPieces; x++) {
|
||||
TextPiece piece = _text.get(x);
|
||||
piece.adjustForDelete(_start, _end - _start);
|
||||
_text.delete( _start, _end );
|
||||
Range parent = _parent.get();
|
||||
if ( parent != null )
|
||||
{
|
||||
parent.adjustForInsert( -( _end - _start ) );
|
||||
}
|
||||
|
||||
// update the FIB.CCPText + friends field
|
||||
|
@ -623,7 +572,7 @@ public class Range { // TODO -instantiable superclass
|
|||
* @param rows
|
||||
* The number of rows.
|
||||
* @return The empty Table that is now part of the document.
|
||||
* @deprecated Use code shall not work with {@link ParagraphProperties}
|
||||
* @deprecated User code shall not work with {@link TableProperties}
|
||||
*/
|
||||
@Deprecated
|
||||
public Table insertBefore(TableProperties props, int rows) {
|
||||
|
@ -631,18 +580,27 @@ public class Range { // TODO -instantiable superclass
|
|||
parProps.setFInTable(true);
|
||||
parProps.setItap( 1 );
|
||||
|
||||
final int oldEnd = this._end;
|
||||
|
||||
int columns = props.getItcMac();
|
||||
for (int x = 0; x < rows; x++) {
|
||||
for ( int x = 0; x < rows; x++ )
|
||||
{
|
||||
Paragraph cell = this.insertBefore( parProps, StyleSheet.NIL_STYLE );
|
||||
cell.insertAfter( String.valueOf( '\u0007' ) );
|
||||
for (int y = 1; y < columns; y++) {
|
||||
for ( int y = 1; y < columns; y++ )
|
||||
{
|
||||
cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE );
|
||||
cell.insertAfter( String.valueOf( '\u0007' ) );
|
||||
}
|
||||
cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE, String.valueOf('\u0007'));
|
||||
cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE,
|
||||
String.valueOf( '\u0007' ) );
|
||||
cell.setTableRowEnd( props );
|
||||
}
|
||||
return new Table(_start, _start + (rows * (columns + 1)) * 2, this, 1);
|
||||
|
||||
final int newEnd = this._end;
|
||||
final int diff = newEnd - oldEnd;
|
||||
|
||||
return new Table( _start, _start + diff, this, 1 );
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -715,23 +673,14 @@ public class Range { // TODO -instantiable superclass
|
|||
*/
|
||||
public void replaceText(String pPlaceHolder, String pValue, int pOffset) {
|
||||
int absPlaceHolderIndex = getStartOffset() + pOffset;
|
||||
|
||||
Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder
|
||||
.length()), getDocument());
|
||||
|
||||
// this Range isn't a proper parent of the subRange() so we'll have to
|
||||
// keep
|
||||
// track of an updated endOffset on our own
|
||||
int previousEndOffset = subRange.getEndOffset();
|
||||
|
||||
.length()), this);
|
||||
subRange.insertBefore(pValue);
|
||||
|
||||
if (subRange.getEndOffset() != previousEndOffset) {
|
||||
adjustForInsert(subRange.getEndOffset() - previousEndOffset);
|
||||
}
|
||||
|
||||
// re-create the sub-range so we can delete it
|
||||
subRange = new Range((absPlaceHolderIndex + pValue.length()), (absPlaceHolderIndex
|
||||
+ pPlaceHolder.length() + pValue.length()), getDocument());
|
||||
+ pPlaceHolder.length() + pValue.length()), this);
|
||||
|
||||
// deletes are automagically propagated
|
||||
subRange.delete();
|
||||
|
@ -921,7 +870,6 @@ public class Range { // TODO -instantiable superclass
|
|||
* loads all of the list indexes.
|
||||
*/
|
||||
protected void initAll() {
|
||||
initText();
|
||||
initCharacterRuns();
|
||||
initParagraphs();
|
||||
initSections();
|
||||
|
@ -951,18 +899,6 @@ public class Range { // TODO -instantiable superclass
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* inits the text piece list indexes.
|
||||
*/
|
||||
private void initText() {
|
||||
if (!_textRangeFound) {
|
||||
int[] point = findRange(_text, _textStart, _start, _end);
|
||||
_textStart = point[0];
|
||||
_textEnd = point[1];
|
||||
_textRangeFound = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* inits the section list indexes.
|
||||
*/
|
||||
|
@ -1038,7 +974,6 @@ public class Range { // TODO -instantiable superclass
|
|||
* resets the list indexes.
|
||||
*/
|
||||
protected void reset() {
|
||||
_textRangeFound = false;
|
||||
_charRangeFound = false;
|
||||
_parRangeFound = false;
|
||||
_sectionRangeFound = false;
|
||||
|
@ -1153,8 +1088,19 @@ public class Range { // TODO -instantiable superclass
|
|||
* Method for debug purposes. Checks that all resolved elements are inside
|
||||
* of current range.
|
||||
*/
|
||||
public void sanityCheck()
|
||||
public boolean sanityCheck()
|
||||
{
|
||||
if ( _start < 0 )
|
||||
throw new AssertionError();
|
||||
if ( _start >= _text.length() )
|
||||
throw new AssertionError();
|
||||
if ( _end < 0 )
|
||||
throw new AssertionError();
|
||||
if ( _end > _text.length() )
|
||||
throw new AssertionError();
|
||||
if ( _start > _end )
|
||||
throw new AssertionError();
|
||||
|
||||
if ( _charRangeFound )
|
||||
{
|
||||
for ( int c = _charStart; c < _charEnd; c++ )
|
||||
|
@ -1181,5 +1127,7 @@ public class Range { // TODO -instantiable superclass
|
|||
throw new AssertionError();
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,16 +17,13 @@
|
|||
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.util.Iterator;
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Test the different routes to extracting text
|
||||
*
|
||||
|
@ -78,24 +75,10 @@ public final class TestDifferentRoutes extends TestCase {
|
|||
* Test textPieces based extraction
|
||||
*/
|
||||
public void testExtractFromTextPieces() throws Exception {
|
||||
StringBuffer textBuf = new StringBuffer();
|
||||
|
||||
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
||||
while (textPieces.hasNext()) {
|
||||
TextPiece piece = (TextPiece) textPieces.next();
|
||||
|
||||
String encoding = "Cp1252";
|
||||
if (piece.isUnicode()) {
|
||||
encoding = "UTF-16LE";
|
||||
}
|
||||
String text = new String(piece.getRawBytes(), encoding);
|
||||
textBuf.append(text);
|
||||
}
|
||||
|
||||
StringBuffer exp = new StringBuffer();
|
||||
for (int i = 0; i < p_text.length; i++) {
|
||||
exp.append(p_text[i]);
|
||||
}
|
||||
assertEquals(exp.toString(), textBuf.toString());
|
||||
assertEquals(exp.toString(), doc.getDocumentText());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,7 +53,7 @@ public final class TestPAPBinTable extends TestCase
|
|||
|
||||
HWPFFileSystem fileSys = new HWPFFileSystem();
|
||||
|
||||
_pAPBinTable.writeTo( fileSys, 0 );
|
||||
_pAPBinTable.writeTo( fileSys, 0, fakeTPT );
|
||||
ByteArrayOutputStream tableOut = fileSys.getStream( "1Table" );
|
||||
ByteArrayOutputStream mainOut = fileSys.getStream( "WordDocument" );
|
||||
|
||||
|
|
|
@ -169,6 +169,7 @@ public final class TestTextPieceTable extends TestCase {
|
|||
throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.TRUE.toString() );
|
||||
|
||||
_hWPFDocFixture = new HWPFDocFixture(this, HWPFDocFixture.DEFAULT_TEST_FILE);
|
||||
_hWPFDocFixture.setUp();
|
||||
|
@ -178,8 +179,9 @@ public final class TestTextPieceTable extends TestCase {
|
|||
throws Exception
|
||||
{
|
||||
_hWPFDocFixture.tearDown();
|
||||
|
||||
_hWPFDocFixture = null;
|
||||
|
||||
System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.FALSE.toString() );
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
|
|
|
@ -103,10 +103,6 @@ public final class TestProblems extends HWPFTestCase {
|
|||
assertEquals("One paragraph is ok\7", r.getParagraph(3).text());
|
||||
assertEquals("\7", r.getParagraph(4).text());
|
||||
assertEquals("\r", r.getParagraph(5).text());
|
||||
for(int i=0; i<=5; i++) {
|
||||
assertFalse(r.getParagraph(i).usesUnicode());
|
||||
}
|
||||
|
||||
|
||||
// Get the table
|
||||
Table t = r.getTable(p);
|
||||
|
@ -304,9 +300,6 @@ public final class TestProblems extends HWPFTestCase {
|
|||
assertEquals("Row 3/Cell 3\u0007", r.getParagraph(10).text());
|
||||
assertEquals("\u0007", r.getParagraph(11).text());
|
||||
assertEquals("\r", r.getParagraph(12).text());
|
||||
for(int i=0; i<=12; i++) {
|
||||
assertFalse(r.getParagraph(i).usesUnicode());
|
||||
}
|
||||
|
||||
Paragraph p;
|
||||
|
||||
|
@ -791,7 +784,9 @@ public final class TestProblems extends HWPFTestCase {
|
|||
Paragraph actParagraph = actual.getParagraph( p );
|
||||
|
||||
assertEquals( expParagraph.text(), actParagraph.text() );
|
||||
assertEquals( expParagraph.isInTable(), actParagraph.isInTable() );
|
||||
assertEquals( "Diffent isInTable flags for paragraphs #" + p
|
||||
+ " -- " + expParagraph + " -- " + actParagraph + ".",
|
||||
expParagraph.isInTable(), actParagraph.isInTable() );
|
||||
assertEquals( expParagraph.isTableRowEnd(),
|
||||
actParagraph.isTableRowEnd() );
|
||||
|
||||
|
|
|
@ -150,6 +150,8 @@ public final class TestRangeDelete extends TestCase {
|
|||
assertEquals(searchText, subRange.text());
|
||||
|
||||
subRange.delete();
|
||||
daDoc.getOverallRange().sanityCheck();
|
||||
daDoc.getRange().sanityCheck();
|
||||
|
||||
// we need to let the model re-calculate the Range before we evaluate it
|
||||
range = daDoc.getRange();
|
||||
|
@ -166,6 +168,7 @@ public final class TestRangeDelete extends TestCase {
|
|||
// this can lead to a StringIndexOutOfBoundsException, so we will add it
|
||||
// even though we don't have an assertion for it
|
||||
Range daRange = daDoc.getRange();
|
||||
daRange.sanityCheck();
|
||||
daRange.text();
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue