Big big unicode rationalisation in text piece code

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684319 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-09 19:34:38 +00:00
parent 69a90eb5e9
commit c71c0851d5
6 changed files with 269 additions and 113 deletions

View File

@ -22,20 +22,22 @@ import java.util.Arrays;
/** /**
* Represents a lightweight node in the Trees used to store content * Represents a lightweight node in the Trees used to store content
* properties. * properties. Works only in characters.
* *
* @author Ryan Ackley * @author Ryan Ackley
*/ */
public abstract class PropertyNode implements Comparable, Cloneable public abstract class PropertyNode implements Comparable, Cloneable
{ {
protected Object _buf; protected Object _buf;
/** The start, in characters */
private int _cpStart; private int _cpStart;
/** The end, in characters */
private int _cpEnd; private int _cpEnd;
/** /**
* @param fcStart The start of the text for this property. * @param fcStart The start of the text for this property, in characters.
* @param fcEnd The end of the text for this property. * @param fcEnd The end of the text for this property, in characters.
* @param buf FIXME: Old documentation is: "grpprl The property description in compressed form." * @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
*/ */
protected PropertyNode(int fcStart, int fcEnd, Object buf) protected PropertyNode(int fcStart, int fcEnd, Object buf)
@ -43,11 +45,10 @@ public abstract class PropertyNode implements Comparable, Cloneable
_cpStart = fcStart; _cpStart = fcStart;
_cpEnd = fcEnd; _cpEnd = fcEnd;
_buf = buf; _buf = buf;
} }
/** /**
* @return The offset of this property's text. * @return The start offset of this property's text.
*/ */
public int getStart() public int getStart()
{ {
@ -142,9 +143,4 @@ public abstract class PropertyNode implements Comparable, Cloneable
return 1; return 1;
} }
} }
} }

View File

@ -22,6 +22,9 @@ package org.apache.poi.hwpf.model;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
/** /**
* Lightweight representation of a text piece. * Lightweight representation of a text piece.
* Works in the character domain, not the byte domain, so you
* need to have turned byte references into character
* references before getting here.
* *
* @author Ryan Ackley * @author Ryan Ackley
*/ */
@ -32,21 +35,43 @@ public class TextPiece extends PropertyNode implements Comparable
private PieceDescriptor _pd; private PieceDescriptor _pd;
private int _cpStart; /**
* @param start Beginning offset in main document stream, in characters.
* @param end Ending offset in main document stream, in characters.
* @param text The raw bytes of our text
*/
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
    // cpStart is not read by this constructor; the parameter is kept so
    // that existing callers continue to compile.
    super(start, end, buildInitSB(text, pd));
    _usesUnicode = pd.isUnicode();
    _pd = pd;

    // Validate. The negative-size check has to come first: a negative
    // (end-start) can never equal a buffer length, so if the length check
    // ran first it would always fire and hide this more specific message.
    if(end < start) {
        throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end);
    }
    // The decoded text must cover exactly the character range we were given
    int textLength = ((StringBuffer)_buf).length();
    if(end-start != textLength) {
        throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
    }
}
/** /**
* @param start Offset in main document stream. * Create the StringBuffer from the text and unicode flag
*/ */
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) {
throws UnsupportedEncodingException String str;
{ try {
/** start - end is length on file. This is double the expected when its if(pd.isUnicode()) {
* unicode.*/ str = new String(text, "UTF-16LE");
super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? "UTF-16LE" : "Cp1252"))); } else {
_usesUnicode = pd.isUnicode(); str = new String(text, "Cp1252");
_pd = pd; }
_cpStart = cpStart; } catch(UnsupportedEncodingException e) {
throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
}
return new StringBuffer(str);
} }
/** /**
* @return If this text piece uses unicode * @return If this text piece uses unicode
*/ */
@ -67,38 +92,43 @@ public class TextPiece extends PropertyNode implements Comparable
public byte[] getRawBytes() public byte[] getRawBytes()
{ {
try try {
{
return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ? return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
"UTF-16LE" : "Cp1252"); "UTF-16LE" : "Cp1252");
} catch (UnsupportedEncodingException ignore) {
throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
} }
catch (UnsupportedEncodingException ignore)
{
// shouldn't ever happen considering we wouldn't have been able to
// create the StringBuffer w/o getting this exception
return ((StringBuffer)_buf).toString().getBytes();
}
} }
/**
* Returns part of the string.
* Works only in characters, not in bytes!
* @param start Local start position, in characters
* @param end Local end position, in characters
* @return
*/
public String substring(int start, int end) public String substring(int start, int end)
{ {
int denominator = _usesUnicode ? 2 : 1; StringBuffer buf = (StringBuffer)_buf;
return ((StringBuffer)_buf).substring(start/denominator, end/denominator); // Validate
if(start < 0) {
throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start);
}
if(end > buf.length()) {
throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length());
}
return buf.substring(start, end);
} }
public void adjustForDelete(int start, int length) /**
{ * Adjusts the internal string for deleting
* some characters within this.
// length is expected to be the number of code-points, * @param start The start position for the delete, in characters
// not the number of characters * @param length The number of characters to delete
*/
public void adjustForDelete(int start, int length) {
int numChars = length; int numChars = length;
if (usesUnicode()) {
start /= 2;
numChars = (length / 2);
}
int myStart = getStart(); int myStart = getStart();
int myEnd = getEnd(); int myEnd = getEnd();
@ -121,9 +151,18 @@ public class TextPiece extends PropertyNode implements Comparable
super.adjustForDelete(start, length); super.adjustForDelete(start, length);
} }
/**
* Returns the length, in characters
*/
public int characterLength() public int characterLength()
{ {
return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1); return (getEnd() - getStart());
}
/**
* Returns the length, in bytes
*/
public int bytesLength() {
return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
} }
public boolean equals(Object o) public boolean equals(Object o)
@ -138,9 +177,11 @@ public class TextPiece extends PropertyNode implements Comparable
} }
/**
* Returns the character position we start at.
*/
public int getCP() public int getCP()
{ {
return _cpStart; return getStart();
} }
} }

View File

@ -28,6 +28,11 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* The piece table for matching up character positions
* to bits of text.
* This mostly works in bytes, but the TextPieces
* themselves work in characters. This does the icky
* conversion.
* @author Ryan Ackley * @author Ryan Ackley
*/ */
public class TextPieceTable public class TextPieceTable
@ -36,8 +41,7 @@ public class TextPieceTable
//int _multiple; //int _multiple;
int _cpMin; int _cpMin;
public TextPieceTable() public TextPieceTable() {
{
} }
public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset, public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
@ -47,7 +51,6 @@ public class TextPieceTable
// get our plex of PieceDescriptors // get our plex of PieceDescriptors
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes()); PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
//_multiple = 2;
int length = pieceTable.length(); int length = pieceTable.length();
PieceDescriptor[] pieces = new PieceDescriptor[length]; PieceDescriptor[] pieces = new PieceDescriptor[length];
@ -57,11 +60,6 @@ public class TextPieceTable
{ {
GenericPropertyNode node = pieceTable.getProperty(x); GenericPropertyNode node = pieceTable.getProperty(x);
pieces[x] = new PieceDescriptor(node.getBytes(), 0); pieces[x] = new PieceDescriptor(node.getBytes(), 0);
// if (!pieces[x].isUnicode())
// {
// _multiple = 1;
// }
} }
int firstPieceFilePosition = pieces[0].getFilePosition(); int firstPieceFilePosition = pieces[0].getFilePosition();
@ -72,26 +70,28 @@ public class TextPieceTable
{ {
int start = pieces[x].getFilePosition(); int start = pieces[x].getFilePosition();
PropertyNode node = pieceTable.getProperty(x); PropertyNode node = pieceTable.getProperty(x);
int nodeStart = node.getStart();
// multiple will be 2 if there is only one piece and its unicode. Some // Grab the start and end, which are in characters
// type of optimization. int nodeStartChars = node.getStart();
int nodeEndChars = node.getEnd();
// What's the relationship between bytes and characters?
boolean unicode = pieces[x].isUnicode(); boolean unicode = pieces[x].isUnicode();
int multiple = 1; int multiple = 1;
if (unicode) if (unicode) {
{
multiple = 2; multiple = 2;
} }
int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart;
int textSize = nodeEnd - nodeStart;
// Figure out the length, in bytes and chars
int textSizeChars = (nodeEndChars - nodeStartChars);
int textSizeBytes = textSizeChars * multiple;
byte[] buf = new byte[textSize]; // Grab the data that makes up the piece
System.arraycopy(documentStream, start, buf, 0, textSize); byte[] buf = new byte[textSizeBytes];
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
int startFilePosition = start - firstPieceFilePosition; // And now build the piece
_textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart())); _textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
} }
} }
@ -113,7 +113,6 @@ public class TextPieceTable
//int fcMin = docStream.getOffset(); //int fcMin = docStream.getOffset();
int size = _textPieces.size(); int size = _textPieces.size();
int bumpDown = 0;
for (int x = 0; x < size; x++) for (int x = 0; x < size; x++)
{ {
TextPiece next = (TextPiece)_textPieces.get(x); TextPiece next = (TextPiece)_textPieces.get(x);
@ -134,47 +133,43 @@ public class TextPieceTable
// write the text to the docstream and save the piece descriptor to the // write the text to the docstream and save the piece descriptor to the
// plex which will be written later to the tableStream. // plex which will be written later to the tableStream.
//if (_multiple == 1 && pd.isUnicode() &&
docStream.write(next.getRawBytes()); docStream.write(next.getRawBytes());
// The TextPiece is already in characters, which
// makes our life much easier
int nodeStart = next.getStart(); int nodeStart = next.getStart();
int multiple = 1; int nodeEnd = next.getEnd();
if (pd.isUnicode()) textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
{
multiple = 2;
}
textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown,
((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown,
pd.toByteArray())); pd.toByteArray()));
if (pd.isUnicode())
{
bumpDown += ((next.getEnd() - nodeStart)/multiple);
}
} }
return textPlex.toByteArray(); return textPlex.toByteArray();
} }
/**
public int adjustForInsert(int listIndex, int length) * Adjust all the text piece after inserting
{ * some text into one of them
* @param listIndex The TextPiece that had characters inserted into
* @param length The number of characters inserted
*/
public int adjustForInsert(int listIndex, int length) {
int size = _textPieces.size(); int size = _textPieces.size();
TextPiece tp = (TextPiece)_textPieces.get(listIndex); TextPiece tp = (TextPiece)_textPieces.get(listIndex);
//The text piece stores the length on file. // Update with the new end
length = length * (tp.usesUnicode() ? 2 : 1);
tp.setEnd(tp.getEnd() + length); tp.setEnd(tp.getEnd() + length);
// Now change all subsequent ones
for (int x = listIndex + 1; x < size; x++) for (int x = listIndex + 1; x < size; x++)
{ {
tp = (TextPiece)_textPieces.get(x); tp = (TextPiece)_textPieces.get(x);
tp.setStart(tp.getStart() + length); tp.setStart(tp.getStart() + length);
tp.setEnd(tp.getEnd() + length); tp.setEnd(tp.getEnd() + length);
} }
// All done
return length; return length;
} }

View File

@ -137,7 +137,8 @@ public class Range
/** /**
* Used to construct a Range from a document. This is generally used to * Used to construct a Range from a document. This is generally used to
* create a Range that spans the whole document. * create a Range that spans the whole document, or at least one
* whole part of the document (eg main text, header, comment)
* *
* @param start Starting character offset of the range. * @param start Starting character offset of the range.
* @param end Ending character offset of the range. * @param end Ending character offset of the range.
@ -259,15 +260,21 @@ public class Range
for (int x = _textStart; x < _textEnd; x++) for (int x = _textStart; x < _textEnd; x++)
{ {
TextPiece piece = (TextPiece)_text.get(x); TextPiece piece = (TextPiece)_text.get(x);
int start = _start > piece.getStart() ? _start - piece.getStart() : 0;
int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart();
if(piece.usesUnicode()) // convert the byte pointers to char pointers // Figure out where in this piece the text
{ // we're after lives
start/=2; int rStart = 0;
end/=2; int rEnd = piece.characterLength();
if(_start > piece.getStart()) {
rStart = _start - piece.getStart();
} }
sb.append(piece.getStringBuffer().substring(start, end)); if(_end < piece.getEnd()) {
rEnd -= (piece.getEnd() - _end);
}
// Luckily TextPieces work in characters, so we don't
// need to worry about unicode here
sb.append(piece.substring(rStart, rEnd));
} }
return sb.toString(); return sb.toString();
} }
@ -929,9 +936,11 @@ public class Range
} }
/** /**
* Adjust the value of <code>FIB.CCPText</code> after an insert or a delete... * Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
* *
* @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code> * TODO - handle other kinds of text, eg Headers
*
* @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code>
*/ */
protected void adjustFIB(int adjustment) { protected void adjustFIB(int adjustment) {

View File

@ -78,10 +78,13 @@ public class TestHWPFRangeParts extends TestCase {
; ;
private static final String u_header = private static final String u_header =
"\r\r" +
"This is a simple header, with a \u20ac euro symbol in it.\r" "This is a simple header, with a \u20ac euro symbol in it.\r"
; ;
private static final String u_footer = private static final String u_footer =
"The footer, with Moli\u00e8re, has Unicode in it.\r" "\r\r\r" +
"The footer, with Moli\u00e8re, has Unicode in it.\r" +
"\r\r\r\r"
; ;
/** /**

View File

@ -18,19 +18,21 @@
package org.apache.poi.hwpf.model; package org.apache.poi.hwpf.model;
import junit.framework.*; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.util.ArrayList; import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hwpf.*; import junit.framework.TestCase;
import org.apache.poi.hwpf.model.io.*;
import org.apache.poi.hwpf.HWPFDocFixture;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
public class TestTextPieceTable public class TestTextPieceTable extends TestCase {
extends TestCase
{
private HWPFDocFixture _hWPFDocFixture; private HWPFDocFixture _hWPFDocFixture;
private String dirname;
public TestTextPieceTable(String name) public TestTextPieceTable(String name)
{ {
@ -63,9 +65,117 @@ public class TestTextPieceTable
TextPieceTable newTextPieceTable = newCft.getTextPieceTable(); TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
assertEquals(oldTextPieceTable, newTextPieceTable); assertEquals(oldTextPieceTable, newTextPieceTable);
} }
/**
 * Check that we do the positions correctly when
 * working with pure-ascii, both on the initial load
 * and after a save and re-load.
 */
public void testAsciiParts() throws Exception {
    // Always release the file handle, even if the document fails to load
    FileInputStream in = new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc"));
    HWPFDocument doc;
    try {
        doc = new HWPFDocument(in);
    } finally {
        in.close();
    }

    assertSingleAsciiPiece(doc.getTextTable());

    // Save and re-load, the piece table should be unchanged
    HWPFDocument docB = saveAndReload(doc);
    assertSingleAsciiPiece(docB.getTextTable());
}

/**
 * Asserts that the table holds exactly one text piece, covering
 * characters 0 to 339, with equal character and byte lengths, and
 * starting with the expected sample text.
 *
 * @param tbl The text piece table to check
 */
private void assertSingleAsciiPiece(TextPieceTable tbl) {
    // All ascii, so stored in one big lump
    assertEquals(1, tbl.getTextPieces().size());
    TextPiece tp = (TextPiece)tbl.getTextPieces().get(0);

    assertEquals(0, tp.getStart());
    assertEquals(339, tp.getEnd());
    assertEquals(339, tp.characterLength());
    assertEquals(339, tp.bytesLength());
    assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
}
/**
 * Check that we do the positions correctly when
 * working with a file that mixes ascii and unicode text,
 * both on the initial load and after a save and re-load.
 */
public void testUnicodeParts() throws Exception {
    // Always release the file handle, even if the document fails to load
    FileInputStream in = new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"));
    HWPFDocument doc;
    try {
        doc = new HWPFDocument(in);
    } finally {
        in.close();
    }

    assertThreeUnicodePieces(doc.getTextTable());

    // Save and re-load, the piece table should be unchanged
    HWPFDocument docB = saveAndReload(doc);
    assertThreeUnicodePieces(docB.getTextTable());
}

/**
 * Asserts that the table holds three unicode text pieces of
 * 256, 256 and 19 characters (512, 512 and 38 bytes), laid
 * out back to back from character 0 to character 531.
 *
 * @param tbl The text piece table to check
 */
private void assertThreeUnicodePieces(TextPieceTable tbl) {
    // In three bits, split every 512 bytes
    assertEquals(3, tbl.getTextPieces().size());
    TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0);
    TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1);
    TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2);

    assertTrue(tpA.usesUnicode());
    assertTrue(tpB.usesUnicode());
    assertTrue(tpC.usesUnicode());

    // Lengths in characters
    assertEquals(256, tpA.characterLength());
    assertEquals(256, tpB.characterLength());
    assertEquals(19, tpC.characterLength());

    // Lengths in bytes, two bytes per unicode character
    assertEquals(512, tpA.bytesLength());
    assertEquals(512, tpB.bytesLength());
    assertEquals(38, tpC.bytesLength());

    // Positions are in characters, not bytes
    assertEquals(0, tpA.getStart());
    assertEquals(256, tpA.getEnd());
    assertEquals(256, tpB.getStart());
    assertEquals(512, tpB.getEnd());
    assertEquals(512, tpC.getStart());
    assertEquals(531, tpC.getEnd());
}
/**
 * Writes the document to an in-memory buffer, then builds a
 * new document from those bytes.
 *
 * @param doc The document to write out
 * @return A new document read back from the written bytes
 */
protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception {
    ByteArrayOutputStream written = new ByteArrayOutputStream();
    doc.write(written);

    byte[] savedBytes = written.toByteArray();
    ByteArrayInputStream reread = new ByteArrayInputStream(savedBytes);
    return new HWPFDocument(reread);
}
protected void setUp() protected void setUp()
throws Exception throws Exception
{ {
@ -73,6 +183,8 @@ public class TestTextPieceTable
_hWPFDocFixture = new HWPFDocFixture(this); _hWPFDocFixture = new HWPFDocFixture(this);
_hWPFDocFixture.setUp(); _hWPFDocFixture.setUp();
dirname = System.getProperty("HWPF.testdata.path");
} }
protected void tearDown() protected void tearDown()