mirror of https://github.com/apache/poi.git
Big big unicode rationalisation in text piece code
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684319 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
69a90eb5e9
commit
c71c0851d5
|
@ -22,20 +22,22 @@ import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents a lightweight node in the Trees used to store content
|
* Represents a lightweight node in the Trees used to store content
|
||||||
* properties.
|
* properties. Works only in characters.
|
||||||
*
|
*
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public abstract class PropertyNode implements Comparable, Cloneable
|
public abstract class PropertyNode implements Comparable, Cloneable
|
||||||
{
|
{
|
||||||
protected Object _buf;
|
protected Object _buf;
|
||||||
|
/** The start, in characters */
|
||||||
private int _cpStart;
|
private int _cpStart;
|
||||||
|
/** The end, in characters */
|
||||||
private int _cpEnd;
|
private int _cpEnd;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param fcStart The start of the text for this property.
|
* @param fcStart The start of the text for this property, in characters.
|
||||||
* @param fcEnd The end of the text for this property.
|
* @param fcEnd The end of the text for this property, in characters.
|
||||||
* @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
|
* @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
|
||||||
*/
|
*/
|
||||||
protected PropertyNode(int fcStart, int fcEnd, Object buf)
|
protected PropertyNode(int fcStart, int fcEnd, Object buf)
|
||||||
|
@ -43,11 +45,10 @@ public abstract class PropertyNode implements Comparable, Cloneable
|
||||||
_cpStart = fcStart;
|
_cpStart = fcStart;
|
||||||
_cpEnd = fcEnd;
|
_cpEnd = fcEnd;
|
||||||
_buf = buf;
|
_buf = buf;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return The offset of this property's text.
|
* @return The start offset of this property's text.
|
||||||
*/
|
*/
|
||||||
public int getStart()
|
public int getStart()
|
||||||
{
|
{
|
||||||
|
@ -142,9 +143,4 @@ public abstract class PropertyNode implements Comparable, Cloneable
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,9 @@ package org.apache.poi.hwpf.model;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
/**
|
/**
|
||||||
* Lightweight representation of a text piece.
|
* Lightweight representation of a text piece.
|
||||||
|
* Works in the character domain, not the byte domain, so you
|
||||||
|
* need to have turned byte references into character
|
||||||
|
* references before getting here.
|
||||||
*
|
*
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
|
@ -32,21 +35,43 @@ public class TextPiece extends PropertyNode implements Comparable
|
||||||
|
|
||||||
private PieceDescriptor _pd;
|
private PieceDescriptor _pd;
|
||||||
|
|
||||||
private int _cpStart;
|
/**
|
||||||
|
* @param start Beginning offset in main document stream, in characters.
|
||||||
|
* @param end Ending offset in main document stream, in characters.
|
||||||
|
* @param text The raw bytes of our text
|
||||||
|
*/
|
||||||
|
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
|
||||||
|
super(start, end, buildInitSB(text, pd));
|
||||||
|
_usesUnicode = pd.isUnicode();
|
||||||
|
_pd = pd;
|
||||||
|
|
||||||
|
// Validate
|
||||||
|
int textLength = ((StringBuffer)_buf).length();
|
||||||
|
if(end-start != textLength) {
|
||||||
|
throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
|
||||||
|
}
|
||||||
|
if(end < start) {
|
||||||
|
throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param start Offset in main document stream.
|
* Create the StringBuffer from the text and unicode flag
|
||||||
*/
|
*/
|
||||||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart)
|
private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) {
|
||||||
throws UnsupportedEncodingException
|
String str;
|
||||||
{
|
try {
|
||||||
/** start - end is length on file. This is double the expected when its
|
if(pd.isUnicode()) {
|
||||||
* unicode.*/
|
str = new String(text, "UTF-16LE");
|
||||||
super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? "UTF-16LE" : "Cp1252")));
|
} else {
|
||||||
_usesUnicode = pd.isUnicode();
|
str = new String(text, "Cp1252");
|
||||||
_pd = pd;
|
}
|
||||||
_cpStart = cpStart;
|
} catch(UnsupportedEncodingException e) {
|
||||||
|
throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
|
||||||
|
}
|
||||||
|
return new StringBuffer(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return If this text piece uses unicode
|
* @return If this text piece uses unicode
|
||||||
*/
|
*/
|
||||||
|
@ -67,38 +92,43 @@ public class TextPiece extends PropertyNode implements Comparable
|
||||||
|
|
||||||
public byte[] getRawBytes()
|
public byte[] getRawBytes()
|
||||||
{
|
{
|
||||||
try
|
try {
|
||||||
{
|
|
||||||
return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
|
return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
|
||||||
"UTF-16LE" : "Cp1252");
|
"UTF-16LE" : "Cp1252");
|
||||||
|
} catch (UnsupportedEncodingException ignore) {
|
||||||
|
throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
|
||||||
}
|
}
|
||||||
catch (UnsupportedEncodingException ignore)
|
|
||||||
{
|
|
||||||
// shouldn't ever happen considering we wouldn't have been able to
|
|
||||||
// create the StringBuffer w/o getting this exception
|
|
||||||
return ((StringBuffer)_buf).toString().getBytes();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns part of the string.
|
||||||
|
* Works only in characters, not in bytes!
|
||||||
|
* @param start Local start position, in characters
|
||||||
|
* @param end Local end position, in characters
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
public String substring(int start, int end)
|
public String substring(int start, int end)
|
||||||
{
|
{
|
||||||
int denominator = _usesUnicode ? 2 : 1;
|
StringBuffer buf = (StringBuffer)_buf;
|
||||||
|
|
||||||
return ((StringBuffer)_buf).substring(start/denominator, end/denominator);
|
// Validate
|
||||||
|
if(start < 0) {
|
||||||
|
throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start);
|
||||||
|
}
|
||||||
|
if(end > buf.length()) {
|
||||||
|
throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length());
|
||||||
|
}
|
||||||
|
return buf.substring(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void adjustForDelete(int start, int length)
|
/**
|
||||||
{
|
* Adjusts the internal string for deleting
|
||||||
|
* some characters within this.
|
||||||
// length is expected to be the number of code-points,
|
* @param start The start position for the delete, in characters
|
||||||
// not the number of characters
|
* @param length The number of characters to delete
|
||||||
|
*/
|
||||||
|
public void adjustForDelete(int start, int length) {
|
||||||
int numChars = length;
|
int numChars = length;
|
||||||
if (usesUnicode()) {
|
|
||||||
|
|
||||||
start /= 2;
|
|
||||||
numChars = (length / 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
int myStart = getStart();
|
int myStart = getStart();
|
||||||
int myEnd = getEnd();
|
int myEnd = getEnd();
|
||||||
|
@ -121,9 +151,18 @@ public class TextPiece extends PropertyNode implements Comparable
|
||||||
super.adjustForDelete(start, length);
|
super.adjustForDelete(start, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the length, in characters
|
||||||
|
*/
|
||||||
public int characterLength()
|
public int characterLength()
|
||||||
{
|
{
|
||||||
return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1);
|
return (getEnd() - getStart());
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Returns the length, in bytes
|
||||||
|
*/
|
||||||
|
public int bytesLength() {
|
||||||
|
return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean equals(Object o)
|
public boolean equals(Object o)
|
||||||
|
@ -138,9 +177,11 @@ public class TextPiece extends PropertyNode implements Comparable
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the character position we start at.
|
||||||
|
*/
|
||||||
public int getCP()
|
public int getCP()
|
||||||
{
|
{
|
||||||
return _cpStart;
|
return getStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,11 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* The piece table for matching up character positions
|
||||||
|
* to bits of text.
|
||||||
|
* This mostly works in bytes, but the TextPieces
|
||||||
|
* themselves work in characters. This does the icky
|
||||||
|
* conversion.
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public class TextPieceTable
|
public class TextPieceTable
|
||||||
|
@ -36,8 +41,7 @@ public class TextPieceTable
|
||||||
//int _multiple;
|
//int _multiple;
|
||||||
int _cpMin;
|
int _cpMin;
|
||||||
|
|
||||||
public TextPieceTable()
|
public TextPieceTable() {
|
||||||
{
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
|
public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
|
||||||
|
@ -47,7 +51,6 @@ public class TextPieceTable
|
||||||
// get our plex of PieceDescriptors
|
// get our plex of PieceDescriptors
|
||||||
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
|
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
|
||||||
|
|
||||||
//_multiple = 2;
|
|
||||||
int length = pieceTable.length();
|
int length = pieceTable.length();
|
||||||
PieceDescriptor[] pieces = new PieceDescriptor[length];
|
PieceDescriptor[] pieces = new PieceDescriptor[length];
|
||||||
|
|
||||||
|
@ -57,11 +60,6 @@ public class TextPieceTable
|
||||||
{
|
{
|
||||||
GenericPropertyNode node = pieceTable.getProperty(x);
|
GenericPropertyNode node = pieceTable.getProperty(x);
|
||||||
pieces[x] = new PieceDescriptor(node.getBytes(), 0);
|
pieces[x] = new PieceDescriptor(node.getBytes(), 0);
|
||||||
|
|
||||||
// if (!pieces[x].isUnicode())
|
|
||||||
// {
|
|
||||||
// _multiple = 1;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int firstPieceFilePosition = pieces[0].getFilePosition();
|
int firstPieceFilePosition = pieces[0].getFilePosition();
|
||||||
|
@ -72,26 +70,28 @@ public class TextPieceTable
|
||||||
{
|
{
|
||||||
int start = pieces[x].getFilePosition();
|
int start = pieces[x].getFilePosition();
|
||||||
PropertyNode node = pieceTable.getProperty(x);
|
PropertyNode node = pieceTable.getProperty(x);
|
||||||
int nodeStart = node.getStart();
|
|
||||||
|
|
||||||
// multiple will be 2 if there is only one piece and its unicode. Some
|
// Grab the start and end, which are in characters
|
||||||
// type of optimization.
|
int nodeStartChars = node.getStart();
|
||||||
|
int nodeEndChars = node.getEnd();
|
||||||
|
|
||||||
|
// What's the relationship between bytes and characters?
|
||||||
boolean unicode = pieces[x].isUnicode();
|
boolean unicode = pieces[x].isUnicode();
|
||||||
|
|
||||||
int multiple = 1;
|
int multiple = 1;
|
||||||
if (unicode)
|
if (unicode) {
|
||||||
{
|
|
||||||
multiple = 2;
|
multiple = 2;
|
||||||
}
|
}
|
||||||
int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart;
|
|
||||||
int textSize = nodeEnd - nodeStart;
|
|
||||||
|
|
||||||
|
// Figure out the length, in bytes and chars
|
||||||
|
int textSizeChars = (nodeEndChars - nodeStartChars);
|
||||||
|
int textSizeBytes = textSizeChars * multiple;
|
||||||
|
|
||||||
byte[] buf = new byte[textSize];
|
// Grab the data that makes up the piece
|
||||||
System.arraycopy(documentStream, start, buf, 0, textSize);
|
byte[] buf = new byte[textSizeBytes];
|
||||||
|
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
||||||
|
|
||||||
int startFilePosition = start - firstPieceFilePosition;
|
// And now build the piece
|
||||||
_textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart()));
|
_textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,7 +113,6 @@ public class TextPieceTable
|
||||||
//int fcMin = docStream.getOffset();
|
//int fcMin = docStream.getOffset();
|
||||||
|
|
||||||
int size = _textPieces.size();
|
int size = _textPieces.size();
|
||||||
int bumpDown = 0;
|
|
||||||
for (int x = 0; x < size; x++)
|
for (int x = 0; x < size; x++)
|
||||||
{
|
{
|
||||||
TextPiece next = (TextPiece)_textPieces.get(x);
|
TextPiece next = (TextPiece)_textPieces.get(x);
|
||||||
|
@ -134,47 +133,43 @@ public class TextPieceTable
|
||||||
|
|
||||||
// write the text to the docstream and save the piece descriptor to the
|
// write the text to the docstream and save the piece descriptor to the
|
||||||
// plex which will be written later to the tableStream.
|
// plex which will be written later to the tableStream.
|
||||||
//if (_multiple == 1 && pd.isUnicode() &&
|
|
||||||
docStream.write(next.getRawBytes());
|
docStream.write(next.getRawBytes());
|
||||||
|
|
||||||
|
// The TextPiece is already in characters, which
|
||||||
|
// makes our life much easier
|
||||||
int nodeStart = next.getStart();
|
int nodeStart = next.getStart();
|
||||||
int multiple = 1;
|
int nodeEnd = next.getEnd();
|
||||||
if (pd.isUnicode())
|
textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
|
||||||
{
|
|
||||||
multiple = 2;
|
|
||||||
}
|
|
||||||
textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown,
|
|
||||||
((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown,
|
|
||||||
pd.toByteArray()));
|
pd.toByteArray()));
|
||||||
|
|
||||||
if (pd.isUnicode())
|
|
||||||
{
|
|
||||||
bumpDown += ((next.getEnd() - nodeStart)/multiple);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return textPlex.toByteArray();
|
return textPlex.toByteArray();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
public int adjustForInsert(int listIndex, int length)
|
* Adjust all the text pieces after inserting
|
||||||
{
|
* some text into one of them
|
||||||
|
* @param listIndex The TextPiece that had characters inserted into
|
||||||
|
* @param length The number of characters inserted
|
||||||
|
*/
|
||||||
|
public int adjustForInsert(int listIndex, int length) {
|
||||||
int size = _textPieces.size();
|
int size = _textPieces.size();
|
||||||
|
|
||||||
TextPiece tp = (TextPiece)_textPieces.get(listIndex);
|
TextPiece tp = (TextPiece)_textPieces.get(listIndex);
|
||||||
|
|
||||||
//The text piece stores the length on file.
|
// Update with the new end
|
||||||
length = length * (tp.usesUnicode() ? 2 : 1);
|
|
||||||
tp.setEnd(tp.getEnd() + length);
|
tp.setEnd(tp.getEnd() + length);
|
||||||
|
|
||||||
|
// Now change all subsequent ones
|
||||||
for (int x = listIndex + 1; x < size; x++)
|
for (int x = listIndex + 1; x < size; x++)
|
||||||
{
|
{
|
||||||
tp = (TextPiece)_textPieces.get(x);
|
tp = (TextPiece)_textPieces.get(x);
|
||||||
tp.setStart(tp.getStart() + length);
|
tp.setStart(tp.getStart() + length);
|
||||||
tp.setEnd(tp.getEnd() + length);
|
tp.setEnd(tp.getEnd() + length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// All done
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -137,7 +137,8 @@ public class Range
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used to construct a Range from a document. This is generally used to
|
* Used to construct a Range from a document. This is generally used to
|
||||||
* create a Range that spans the whole document.
|
* create a Range that spans the whole document, or at least one
|
||||||
|
* whole part of the document (eg main text, header, comment)
|
||||||
*
|
*
|
||||||
* @param start Starting character offset of the range.
|
* @param start Starting character offset of the range.
|
||||||
* @param end Ending character offset of the range.
|
* @param end Ending character offset of the range.
|
||||||
|
@ -259,15 +260,21 @@ public class Range
|
||||||
for (int x = _textStart; x < _textEnd; x++)
|
for (int x = _textStart; x < _textEnd; x++)
|
||||||
{
|
{
|
||||||
TextPiece piece = (TextPiece)_text.get(x);
|
TextPiece piece = (TextPiece)_text.get(x);
|
||||||
int start = _start > piece.getStart() ? _start - piece.getStart() : 0;
|
|
||||||
int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart();
|
|
||||||
|
|
||||||
if(piece.usesUnicode()) // convert the byte pointers to char pointers
|
// Figure out where in this piece the text
|
||||||
{
|
// we're after lives
|
||||||
start/=2;
|
int rStart = 0;
|
||||||
end/=2;
|
int rEnd = piece.characterLength();
|
||||||
|
if(_start > piece.getStart()) {
|
||||||
|
rStart = _start - piece.getStart();
|
||||||
}
|
}
|
||||||
sb.append(piece.getStringBuffer().substring(start, end));
|
if(_end < piece.getEnd()) {
|
||||||
|
rEnd -= (piece.getEnd() - _end);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Luckily TextPieces work in characters, so we don't
|
||||||
|
// need to worry about unicode here
|
||||||
|
sb.append(piece.substring(rStart, rEnd));
|
||||||
}
|
}
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
@ -929,9 +936,11 @@ public class Range
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
|
* Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
|
||||||
*
|
*
|
||||||
* @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code>
|
* TODO - handle other kinds of text, eg Headers
|
||||||
|
*
|
||||||
|
* @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code>
|
||||||
*/
|
*/
|
||||||
protected void adjustFIB(int adjustment) {
|
protected void adjustFIB(int adjustment) {
|
||||||
|
|
||||||
|
|
|
@ -78,10 +78,13 @@ public class TestHWPFRangeParts extends TestCase {
|
||||||
;
|
;
|
||||||
|
|
||||||
private static final String u_header =
|
private static final String u_header =
|
||||||
|
"\r\r" +
|
||||||
"This is a simple header, with a \u20ac euro symbol in it.\r"
|
"This is a simple header, with a \u20ac euro symbol in it.\r"
|
||||||
;
|
;
|
||||||
private static final String u_footer =
|
private static final String u_footer =
|
||||||
"The footer, with Moli\u00e8re, has Unicode in it.\r"
|
"\r\r\r" +
|
||||||
|
"The footer, with Moli\u00e8re, has Unicode in it.\r" +
|
||||||
|
"\r\r\r\r"
|
||||||
;
|
;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -18,19 +18,21 @@
|
||||||
|
|
||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
import junit.framework.*;
|
import java.io.ByteArrayInputStream;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.util.ArrayList;
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.*;
|
import junit.framework.TestCase;
|
||||||
import org.apache.poi.hwpf.model.io.*;
|
|
||||||
|
import org.apache.poi.hwpf.HWPFDocFixture;
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
|
||||||
|
|
||||||
|
|
||||||
public class TestTextPieceTable
|
public class TestTextPieceTable extends TestCase {
|
||||||
extends TestCase
|
|
||||||
{
|
|
||||||
private HWPFDocFixture _hWPFDocFixture;
|
private HWPFDocFixture _hWPFDocFixture;
|
||||||
|
private String dirname;
|
||||||
|
|
||||||
public TestTextPieceTable(String name)
|
public TestTextPieceTable(String name)
|
||||||
{
|
{
|
||||||
|
@ -63,9 +65,117 @@ public class TestTextPieceTable
|
||||||
TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
|
TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
|
||||||
|
|
||||||
assertEquals(oldTextPieceTable, newTextPieceTable);
|
assertEquals(oldTextPieceTable, newTextPieceTable);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check that we do the positions correctly when
|
||||||
|
* working with pure-ascii
|
||||||
|
*/
|
||||||
|
public void testAsciiParts() throws Exception {
|
||||||
|
HWPFDocument doc = new HWPFDocument(
|
||||||
|
new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc"))
|
||||||
|
);
|
||||||
|
TextPieceTable tbl = doc.getTextTable();
|
||||||
|
|
||||||
|
// All ascii, so stored in one big lump
|
||||||
|
assertEquals(1, tbl.getTextPieces().size());
|
||||||
|
TextPiece tp = (TextPiece)tbl.getTextPieces().get(0);
|
||||||
|
|
||||||
|
assertEquals(0, tp.getStart());
|
||||||
|
assertEquals(339, tp.getEnd());
|
||||||
|
assertEquals(339, tp.characterLength());
|
||||||
|
assertEquals(339, tp.bytesLength());
|
||||||
|
assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
|
||||||
|
|
||||||
|
|
||||||
|
// Save and re-load
|
||||||
|
HWPFDocument docB = saveAndReload(doc);
|
||||||
|
tbl = docB.getTextTable();
|
||||||
|
|
||||||
|
assertEquals(1, tbl.getTextPieces().size());
|
||||||
|
tp = (TextPiece)tbl.getTextPieces().get(0);
|
||||||
|
|
||||||
|
assertEquals(0, tp.getStart());
|
||||||
|
assertEquals(339, tp.getEnd());
|
||||||
|
assertEquals(339, tp.characterLength());
|
||||||
|
assertEquals(339, tp.bytesLength());
|
||||||
|
assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check that we do the positions correctly when
|
||||||
|
* working with a mixed ASCII/Unicode file
|
||||||
|
*/
|
||||||
|
public void testUnicodeParts() throws Exception {
|
||||||
|
HWPFDocument doc = new HWPFDocument(
|
||||||
|
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
|
||||||
|
);
|
||||||
|
TextPieceTable tbl = doc.getTextTable();
|
||||||
|
|
||||||
|
// In three bits, split every 512 bytes
|
||||||
|
assertEquals(3, tbl.getTextPieces().size());
|
||||||
|
TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0);
|
||||||
|
TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1);
|
||||||
|
TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2);
|
||||||
|
|
||||||
|
assertTrue(tpA.usesUnicode());
|
||||||
|
assertTrue(tpB.usesUnicode());
|
||||||
|
assertTrue(tpC.usesUnicode());
|
||||||
|
|
||||||
|
assertEquals(256, tpA.characterLength());
|
||||||
|
assertEquals(256, tpB.characterLength());
|
||||||
|
assertEquals(19, tpC.characterLength());
|
||||||
|
|
||||||
|
assertEquals(512, tpA.bytesLength());
|
||||||
|
assertEquals(512, tpB.bytesLength());
|
||||||
|
assertEquals(38, tpC.bytesLength());
|
||||||
|
|
||||||
|
assertEquals(0, tpA.getStart());
|
||||||
|
assertEquals(256, tpA.getEnd());
|
||||||
|
assertEquals(256, tpB.getStart());
|
||||||
|
assertEquals(512, tpB.getEnd());
|
||||||
|
assertEquals(512, tpC.getStart());
|
||||||
|
assertEquals(531, tpC.getEnd());
|
||||||
|
|
||||||
|
|
||||||
|
// Save and re-load
|
||||||
|
HWPFDocument docB = saveAndReload(doc);
|
||||||
|
tbl = docB.getTextTable();
|
||||||
|
|
||||||
|
assertEquals(3, tbl.getTextPieces().size());
|
||||||
|
tpA = (TextPiece)tbl.getTextPieces().get(0);
|
||||||
|
tpB = (TextPiece)tbl.getTextPieces().get(1);
|
||||||
|
tpC = (TextPiece)tbl.getTextPieces().get(2);
|
||||||
|
|
||||||
|
assertTrue(tpA.usesUnicode());
|
||||||
|
assertTrue(tpB.usesUnicode());
|
||||||
|
assertTrue(tpC.usesUnicode());
|
||||||
|
|
||||||
|
assertEquals(256, tpA.characterLength());
|
||||||
|
assertEquals(256, tpB.characterLength());
|
||||||
|
assertEquals(19, tpC.characterLength());
|
||||||
|
|
||||||
|
assertEquals(512, tpA.bytesLength());
|
||||||
|
assertEquals(512, tpB.bytesLength());
|
||||||
|
assertEquals(38, tpC.bytesLength());
|
||||||
|
|
||||||
|
assertEquals(0, tpA.getStart());
|
||||||
|
assertEquals(256, tpA.getEnd());
|
||||||
|
assertEquals(256, tpB.getStart());
|
||||||
|
assertEquals(512, tpB.getEnd());
|
||||||
|
assertEquals(512, tpC.getStart());
|
||||||
|
assertEquals(531, tpC.getEnd());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception {
|
||||||
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
|
doc.write(baos);
|
||||||
|
|
||||||
|
return new HWPFDocument(
|
||||||
|
new ByteArrayInputStream(baos.toByteArray())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
protected void setUp()
|
protected void setUp()
|
||||||
throws Exception
|
throws Exception
|
||||||
{
|
{
|
||||||
|
@ -73,6 +183,8 @@ public class TestTextPieceTable
|
||||||
|
|
||||||
_hWPFDocFixture = new HWPFDocFixture(this);
|
_hWPFDocFixture = new HWPFDocFixture(this);
|
||||||
_hWPFDocFixture.setUp();
|
_hWPFDocFixture.setUp();
|
||||||
|
|
||||||
|
dirname = System.getProperty("HWPF.testdata.path");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void tearDown()
|
protected void tearDown()
|
||||||
|
|
Loading…
Reference in New Issue