Implement the ExtRst part of a UnicodeString

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@900746 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-19 12:04:14 +00:00
parent d7470746e1
commit 054f1bc289
5 changed files with 459 additions and 67 deletions

View File

@ -26,9 +26,9 @@ import org.apache.poi.hssf.record.RecordInputStream;
import org.apache.poi.hssf.record.cont.ContinuableRecordOutput;
import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.LittleEndianInput;
import org.apache.poi.util.LittleEndianOutput;
import org.apache.poi.util.StringUtil;
/**
* Title: Unicode String<p/>
@ -42,8 +42,8 @@ public final class UnicodeString implements Comparable<UnicodeString> {
private short field_1_charCount;
private byte field_2_optionflags;
private String field_3_string;
private List<FormatRun> field_4_format_runs;
private byte[] field_5_ext_rst;
private List<FormatRun> field_4_format_runs;
private ExtRst field_5_ext_rst;
private static final BitField highByte = BitFieldFactory.getInstance(0x1);
// 0x2 is reserved
private static final BitField extBit = BitFieldFactory.getInstance(0x4);
@ -98,6 +98,225 @@ public final class UnicodeString implements Comparable<UnicodeString> {
out.writeShort(_fontIndex);
}
}
// See page 681
/**
 * Decoded form of the ExtRst data attached to a UnicodeString.
 * It holds a formatting font index and options, a phonetic text
 *  string, an array of {@link PhRun} entries, and any trailing
 *  bytes that were present after the runs.
 */
public static class ExtRst implements Comparable<ExtRst> {
    private short reserved;

    // This is a Phs (see page 881)
    private short formattingFontIndex;
    private short formattingOptions;

    // This is a RPHSSub (see page 894)
    private int numberOfRuns;
    private String phoneticText;

    // This is an array of PhRuns (see page 881)
    private PhRun[] phRuns;
    // Sometimes there's some cruft at the end
    private byte[] extraData;

    /**
     * Resets the text, runs and extra data to empty values and
     *  sets the reserved marker to 1. The other numeric fields
     *  are left untouched.
     */
    private void populateEmpty() {
        reserved = 1;
        phoneticText = "";
        phRuns = new PhRun[0];
        extraData = new byte[0];
    }

    /**
     * Creates an empty ExtRst (no phonetic text, no runs).
     */
    protected ExtRst() {
        populateEmpty();
    }

    /**
     * Reads an ExtRst from the given input.
     *
     * @param in the stream to read from, positioned at the reserved field
     * @param expectedLength the number of bytes the caller expects the
     *  whole ExtRst to occupy; only used to skip over the remaining
     *  data when the magic marker is wrong
     * @throws IllegalStateException if the two phonetic text length
     *  fields disagree
     */
    protected ExtRst(LittleEndianInput in, int expectedLength) {
        reserved = in.readShort();

        // Old style detection (Reserved = 0xFF)
        if(reserved == -1) {
            populateEmpty();
            return;
        }

        // Spot corrupt records
        if(reserved != 1) {
            System.err.println("Warning - ExtRst was has wrong magic marker, expecting 1 but found " + reserved + " - ignoring");
            // Grab all the remaining data, and ignore it
            // (2 bytes have already been consumed by the reserved field)
            for(int i=0; i<expectedLength-2; i++) {
                in.readByte();
            }
            // And make us be empty
            populateEmpty();
            return;
        }

        // Carry on reading in as normal
        short stringDataSize = in.readShort();

        formattingFontIndex = in.readShort();
        formattingOptions   = in.readShort();

        // RPHSSub
        numberOfRuns = in.readUShort();
        short length1 = in.readShort();
        // No really. Someone clearly forgot to read
        //  the docs on their datastructure...
        short length2 = in.readShort();
        // And sometimes they write out garbage :(
        if(length1 == 0 && length2 > 0) {
            length2 = 0;
        }
        if(length1 != length2) {
            throw new IllegalStateException(
                    "The two length fields of the Phonetic Text don't agree! " +
                    length1 + " vs " + length2
            );
        }
        phoneticText = StringUtil.readUnicodeLE(in, length1);

        // Whatever the declared size leaves after the 4 byte Phs, the
        //  6 byte RPHSSub header and the text is runs (6 bytes each),
        //  followed by any left over bytes
        int runData = stringDataSize - 4 - 6 - (2*phoneticText.length());
        int numRuns = (runData / 6);
        if(numRuns < 0) {
            // The declared size is smaller than what has already been
            //  read. Don't blow up allocating a negative sized array,
            //  fall through to the overrun warning below instead
            numRuns = 0;
        }
        phRuns = new PhRun[numRuns];
        for(int i=0; i<phRuns.length; i++) {
            phRuns[i] = new PhRun(in);
        }

        int extraDataLength = runData - (numRuns*6);
        if(extraDataLength < 0) {
            System.err.println("Warning - ExtRst overran by " + (0-extraDataLength) + " bytes");
            extraDataLength = 0;
        }
        extraData = new byte[extraDataLength];
        for(int i=0; i<extraData.length; i++) {
            extraData[i] = in.readByte();
        }
    }

    /**
     * Returns our size, excluding our
     *  4 byte header
     */
    protected int getDataSize() {
        return 4 + 6 + (2*phoneticText.length()) +
            (6*phRuns.length) + extraData.length;
    }

    /**
     * Writes this ExtRst out, asking the output to start a
     *  continue record where required before each group of fields.
     *
     * @param out the output to write to
     */
    protected void serialize(ContinuableRecordOutput out) {
        int dataSize = getDataSize();

        out.writeContinueIfRequired(8);
        out.writeShort(reserved);
        out.writeShort(dataSize);
        out.writeShort(formattingFontIndex);
        out.writeShort(formattingOptions);

        out.writeContinueIfRequired(6);
        out.writeShort(numberOfRuns);
        // The text length is stored twice, see the reading code
        out.writeShort(phoneticText.length());
        out.writeShort(phoneticText.length());

        out.writeContinueIfRequired(phoneticText.length()*2);
        StringUtil.putUnicodeLE(phoneticText, out);

        for(int i=0; i<phRuns.length; i++) {
            phRuns[i].serialize(out);
        }

        out.write(extraData);
    }

    /**
     * Two ExtRst objects are equal when {@link #compareTo(ExtRst)}
     *  reports no difference between them.
     */
    public boolean equals(Object obj) {
        if(! (obj instanceof ExtRst)) {
            return false;
        }
        ExtRst other = (ExtRst)obj;
        return (compareTo(other) == 0);
    }

    /**
     * Hash built from exactly the values that {@link #compareTo(ExtRst)}
     *  looks at, so that equal objects always share a hash code.
     */
    public int hashCode() {
        int hash = reserved;
        hash = 31*hash + formattingFontIndex;
        hash = 31*hash + formattingOptions;
        hash = 31*hash + numberOfRuns;
        hash = 31*hash + phoneticText.hashCode();
        for(PhRun run : phRuns) {
            hash = 31*hash + run.phoneticTextFirstCharacterOffset;
            hash = 31*hash + run.realTextFirstCharacterOffset;
            hash = 31*hash + run.realTextLength;
        }
        // Only the amount of extra data takes part in comparisons
        hash = 31*hash + extraData.length;
        return hash;
    }

    /**
     * Compares field by field: reserved, font index, options, number
     *  of runs, phonetic text, then each run, then the length (not the
     *  contents) of the extra data.
     *
     * @return 0 if no difference was found, otherwise the difference
     *  of the first field that did not match
     */
    public int compareTo(ExtRst o) {
        int result;

        result = reserved - o.reserved;
        if(result != 0) {
            return result;
        }
        result = formattingFontIndex - o.formattingFontIndex;
        if(result != 0) {
            return result;
        }
        result = formattingOptions - o.formattingOptions;
        if(result != 0) {
            return result;
        }
        result = numberOfRuns - o.numberOfRuns;
        if(result != 0) {
            return result;
        }

        result = phoneticText.compareTo(o.phoneticText);
        if(result != 0) {
            return result;
        }

        result = phRuns.length - o.phRuns.length;
        if(result != 0) {
            return result;
        }
        for(int i=0; i<phRuns.length; i++) {
            result = phRuns[i].phoneticTextFirstCharacterOffset - o.phRuns[i].phoneticTextFirstCharacterOffset;
            if(result != 0) {
                return result;
            }
            result = phRuns[i].realTextFirstCharacterOffset - o.phRuns[i].realTextFirstCharacterOffset;
            if(result != 0) {
                return result;
            }
            // Compare like with like: length against length
            result = phRuns[i].realTextLength - o.phRuns[i].realTextLength;
            if(result != 0) {
                return result;
            }
        }

        result = extraData.length - o.extraData.length;
        if(result != 0) {
            return result;
        }

        // If we get here, it's the same
        return 0;
    }

    /**
     * Returns an independent copy, with its own run objects and its
     *  own copy of the extra data, so that the copy reports the same
     *  data size and compares equal to this object.
     */
    protected ExtRst clone() {
        ExtRst ext = new ExtRst();
        ext.reserved = reserved;
        ext.formattingFontIndex = formattingFontIndex;
        ext.formattingOptions = formattingOptions;
        ext.numberOfRuns = numberOfRuns;
        ext.phoneticText = new String(phoneticText);
        ext.phRuns = new PhRun[phRuns.length];
        for(int i=0; i<ext.phRuns.length; i++) {
            ext.phRuns[i] = new PhRun(
                    phRuns[i].phoneticTextFirstCharacterOffset,
                    phRuns[i].realTextFirstCharacterOffset,
                    phRuns[i].realTextLength
            );
        }
        // Carry the trailing bytes across too, they count towards the size
        ext.extraData = extraData.clone();
        return ext;
    }

    /** @return the formatting font index read from, or to be written to, the Phs part */
    public short getFormattingFontIndex() {
        return formattingFontIndex;
    }
    /** @return the formatting options read from, or to be written to, the Phs part */
    public short getFormattingOptions() {
        return formattingOptions;
    }
    /** @return the run count field as stored, which is kept separately from the length of {@link #getPhRuns()} */
    public int getNumberOfRuns() {
        return numberOfRuns;
    }
    /** @return the phonetic text, never null */
    public String getPhoneticText() {
        return phoneticText;
    }
    /** @return the internal array of runs (not a copy) */
    public PhRun[] getPhRuns() {
        return phRuns;
    }
}
/**
 * A single phonetic run, made up of three values: an offset into
 *  the phonetic text, an offset into the real text and a length
 *  of real text. Each value is read and written as a 2 byte field.
 */
public static class PhRun {
    private int phoneticTextFirstCharacterOffset;
    private int realTextFirstCharacterOffset;
    private int realTextLength;

    /**
     * Builds a run from its three values.
     */
    public PhRun(int phoneticTextFirstCharacterOffset,
            int realTextFirstCharacterOffset, int realTextLength) {
        this.phoneticTextFirstCharacterOffset = phoneticTextFirstCharacterOffset;
        this.realTextFirstCharacterOffset = realTextFirstCharacterOffset;
        this.realTextLength = realTextLength;
    }

    /**
     * Reads a run as three consecutive unsigned shorts.
     * Arguments are evaluated left to right, so the stream is
     *  consumed in field order.
     */
    private PhRun(LittleEndianInput in) {
        this(in.readUShort(), in.readUShort(), in.readUShort());
    }

    /**
     * Writes the three values as shorts, first asking the output
     *  to start a continue record if the 6 bytes would not fit.
     */
    private void serialize(ContinuableRecordOutput out) {
        out.writeContinueIfRequired(6);
        out.writeShort(this.phoneticTextFirstCharacterOffset);
        out.writeShort(this.realTextFirstCharacterOffset);
        out.writeShort(this.realTextLength);
    }
}
private UnicodeString() {
//Used for clone method.
@ -160,22 +379,20 @@ public final class UnicodeString implements Comparable<UnicodeString> {
return false;
}
//Well the format runs are equal as well!, better check the ExtRst data
//Which by the way we dont know how to decode!
if ((field_5_ext_rst == null) && (other.field_5_ext_rst == null))
return true;
if (((field_5_ext_rst == null) && (other.field_5_ext_rst != null)) ||
((field_5_ext_rst != null) && (other.field_5_ext_rst == null)))
return false;
size = field_5_ext_rst.length;
if (size != field_5_ext_rst.length)
return false;
//Check individual bytes!
for (int i=0;i<size;i++) {
if (field_5_ext_rst[i] != other.field_5_ext_rst[i])
return false;
// Well the format runs are equal as well!, better check the ExtRst data
if(field_5_ext_rst == null && other.field_5_ext_rst == null) {
// Good
} else if(field_5_ext_rst != null && other.field_5_ext_rst != null) {
int extCmp = field_5_ext_rst.compareTo(other.field_5_ext_rst);
if(extCmp == 0) {
// Good
} else {
return false;
}
} else {
return false;
}
//Phew!! After all of that we have finally worked out that the strings
//are identical.
return true;
@ -218,10 +435,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
}
if (isExtendedText() && (extensionLength > 0)) {
field_5_ext_rst = new byte[extensionLength];
for (int i=0;i<extensionLength;i++) {
field_5_ext_rst[i] = in.readByte();
}
field_5_ext_rst = new ExtRst(in, extensionLength);
if(field_5_ext_rst.getDataSize()+4 != extensionLength) {
System.err.println("ExtRst was supposed to be " + extensionLength + " bytes long, but seems to actually be " + (field_5_ext_rst.getDataSize()+4));
}
}
}
@ -395,10 +612,15 @@ public final class UnicodeString implements Comparable<UnicodeString> {
}
void setExtendedRst(byte[] ext_rst) {
if (ext_rst != null)
field_2_optionflags = extBit.setByte(field_2_optionflags);
else field_2_optionflags = extBit.clearByte(field_2_optionflags);
/**
 * @return the ExtRst currently stored on this string, which is
 *  null when none has been set
 */
public ExtRst getExtendedRst() {
    return field_5_ext_rst;
}
/**
 * Stores the given ExtRst, and sets or clears the extended
 *  text bit of the option flags to match.
 *
 * @param ext_rst the new ExtRst, or null to remove it
 */
void setExtendedRst(ExtRst ext_rst) {
    // The option bit must say whether ExtRst data is present
    field_2_optionflags = (ext_rst == null)
            ? extBit.clearByte(field_2_optionflags)
            : extBit.setByte(field_2_optionflags);
    field_5_ext_rst = ext_rst;
}
@ -452,12 +674,18 @@ public final class UnicodeString implements Comparable<UnicodeString> {
}
}
if (field_5_ext_rst != null) {
buffer.append(" .field_5_ext_rst = ").append("\n").append(HexDump.toHex(field_5_ext_rst)).append("\n");
buffer.append(" .field_5_ext_rst = ").append("\n");
buffer.append( field_5_ext_rst.toString() ).append("\n");
}
buffer.append("[/UNICODESTRING]\n");
return buffer.toString();
}
/**
* Serialises out the String. There are special rules
* about where we can and can't split onto
* Continue records.
*/
public void serialize(ContinuableRecordOutput out) {
int numberOfRichTextRuns = 0;
int extendedDataSize = 0;
@ -465,9 +693,11 @@ public final class UnicodeString implements Comparable<UnicodeString> {
numberOfRichTextRuns = field_4_format_runs.size();
}
if (isExtendedText() && field_5_ext_rst != null) {
extendedDataSize = field_5_ext_rst.length;
extendedDataSize = 4 + field_5_ext_rst.getDataSize();
}
// Serialise the bulk of the String
// The writeString handles tricky continue stuff for us
out.writeString(field_3_string, numberOfRichTextRuns, extendedDataSize);
if (numberOfRichTextRuns > 0) {
@ -477,25 +707,13 @@ public final class UnicodeString implements Comparable<UnicodeString> {
if (out.getAvailableSpace() < 4) {
out.writeContinue();
}
FormatRun r = field_4_format_runs.get(i);
r.serialize(out);
FormatRun r = field_4_format_runs.get(i);
r.serialize(out);
}
}
if (extendedDataSize > 0) {
// OK ExtRst is actually not documented, so i am going to hope
// that we can actually continue on byte boundaries
int extPos = 0;
while (true) {
int nBytesToWrite = Math.min(extendedDataSize - extPos, out.getAvailableSpace());
out.write(field_5_ext_rst, extPos, nBytesToWrite);
extPos += nBytesToWrite;
if (extPos >= extendedDataSize) {
break;
}
out.writeContinue();
}
field_5_ext_rst.serialize(out);
}
}
@ -534,7 +752,6 @@ public final class UnicodeString implements Comparable<UnicodeString> {
}
//Well the format runs are equal as well!, better check the ExtRst data
//Which by the way we don't know how to decode!
if ((field_5_ext_rst == null) && (str.field_5_ext_rst == null))
return 0;
if ((field_5_ext_rst == null) && (str.field_5_ext_rst != null))
@ -542,15 +759,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
if ((field_5_ext_rst != null) && (str.field_5_ext_rst == null))
return -1;
size = field_5_ext_rst.length;
if (size != field_5_ext_rst.length)
return size - field_5_ext_rst.length;
result = field_5_ext_rst.compareTo(str.field_5_ext_rst);
if (result != 0)
return result;
//Check individual bytes!
for (int i=0;i<size;i++) {
if (field_5_ext_rst[i] != str.field_5_ext_rst[i])
return field_5_ext_rst[i] - str.field_5_ext_rst[i];
}
//Phew!! After all of that we have finally worked out that the strings
//are identical.
return 0;
@ -575,12 +787,10 @@ public final class UnicodeString implements Comparable<UnicodeString> {
str.field_4_format_runs = new ArrayList<FormatRun>();
for (FormatRun r : field_4_format_runs) {
str.field_4_format_runs.add(new FormatRun(r._character, r._fontIndex));
}
}
}
if (field_5_ext_rst != null) {
str.field_5_ext_rst = new byte[field_5_ext_rst.length];
System.arraycopy(field_5_ext_rst, 0, str.field_5_ext_rst, 0,
field_5_ext_rst.length);
str.field_5_ext_rst = field_5_ext_rst.clone();
}
return str;

View File

@ -33,9 +33,8 @@ public final class TestSSTRecordSizeCalculator extends TestCase {
private static final int COMPRESSED_PLAIN_STRING_OVERHEAD = 3;
private static final int OPTION_FIELD_SIZE = 1;
private final IntMapper strings = new IntMapper();
private final IntMapper<UnicodeString> strings = new IntMapper<UnicodeString>();
private void confirmSize(int expectedSize) {
ContinuableRecordOutput cro = ContinuableRecordOutput.createForCountingOnly();
SSTSerializer ss = new SSTSerializer(strings, 0, 0);

View File

@ -17,12 +17,19 @@
package org.apache.poi.hssf.record.common;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import junit.framework.TestCase;
import org.apache.poi.hssf.record.ContinueRecord;
import org.apache.poi.hssf.record.RecordInputStream;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.common.UnicodeString.ExtRst;
import org.apache.poi.hssf.record.common.UnicodeString.FormatRun;
import org.apache.poi.hssf.record.cont.ContinuableRecordOutput;
import org.apache.poi.util.LittleEndianInputStream;
import org.apache.poi.util.LittleEndianOutputStream;
/**
* Tests that {@link UnicodeString} record size calculates correctly. The record size
@ -85,13 +92,23 @@ public final class TestUnicodeString extends TestCase {
//Test a compressed small string that has rich text and extended text
s.setString("Test");
s.setOptionFlags((byte)0xC);
s.setExtendedRst(new byte[]{(byte)0x1,(byte)0x2,(byte)0x3,(byte)0x4,(byte)0x5});
confirmSize(26, s);
confirmSize(17, s);
// Extended phonetics data
// Minimum size is 14
// Also adds 4 bytes to hold the length
s.setExtendedRst(
new ExtRst()
);
confirmSize(35, s);
//Test a uncompressed small string that has rich text and extended text
s.setString(STR_16_BIT);
s.setOptionFlags((byte)0xD);
confirmSize(30, s);
confirmSize(39, s);
s.setExtendedRst(null);
confirmSize(21, s);
}
public void testPerfectStringSize() {
@ -144,6 +161,146 @@ public final class TestUnicodeString extends TestCase {
UnicodeString s = makeUnicodeString(strSize);
confirmSize(MAX_DATA_SIZE*2, s);
}
/**
 * Checks that a FormatRun reports the values it was built with,
 *  serialises to the expected 4 bytes, and reads back unchanged.
 */
public void testFormatRun() throws Exception {
    FormatRun original = new FormatRun((short)4, (short)0x15c);
    assertEquals(4, original.getCharacterPos());
    assertEquals(0x15c, original.getFontIndex());

    // Write it out
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    LittleEndianOutputStream leOut = new LittleEndianOutputStream(sink);
    original.serialize(leOut);

    // Two little endian shorts: character position, then font index
    byte[] expected = new byte[] { 4, 0, 0x5c, 0x01 };
    byte[] serialised = sink.toByteArray();
    assertEquals(4, serialised.length);
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], serialised[i]);
    }

    // And read it back in again
    LittleEndianInputStream leIn = new LittleEndianInputStream(
            new ByteArrayInputStream(serialised)
    );
    FormatRun reloaded = new FormatRun(leIn);
    assertEquals(4, reloaded.getCharacterPos());
    assertEquals(0x15c, reloaded.getFontIndex());
}
/**
 * Checks an empty ExtRst: its getters, its data size, the exact
 *  bytes it serialises to, and that those bytes load back in as
 *  an empty ExtRst again.
 */
public void testExtRstFromEmpty() throws Exception {
    ExtRst empty = new ExtRst();

    assertEquals(0, empty.getNumberOfRuns());
    assertEquals(0, empty.getFormattingFontIndex());
    assertEquals(0, empty.getFormattingOptions());
    assertEquals("", empty.getPhoneticText());
    assertEquals(0, empty.getPhRuns().length);
    assertEquals(10, empty.getDataSize()); // Excludes 4 byte header

    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    LittleEndianOutputStream leOut = new LittleEndianOutputStream(sink);
    ContinuableRecordOutput recordOut = new ContinuableRecordOutput(leOut, 0xffff);

    empty.serialize(recordOut);
    recordOut.writeContinue();

    byte[] written = sink.toByteArray();
    assertEquals(20, written.length);

    byte[] expectedStart = new byte[] {
            // First 4 bytes from the outputstream
            -1, -1, 14, 0,
            // Reserved
            1, 0,
            // Data size
            10, 0,
            // Font*2
            0, 0, 0, 0,
            // 0 Runs
            0, 0,
            // Size=0, *2
            0, 0, 0, 0,
    };
    for (int i = 0; i < expectedStart.length; i++) {
        assertEquals(expectedStart[i], written[i]);
    }

    // Last 2 bytes from the outputstream
    assertEquals(ContinueRecord.sid, written[18]);
    assertEquals(0, written[19]);

    // Load in again and re-test
    byte[] data = new byte[14];
    System.arraycopy(written, 4, data, 0, data.length);
    LittleEndianInputStream leIn = new LittleEndianInputStream(
            new ByteArrayInputStream(data)
    );
    ExtRst reloaded = new ExtRst(leIn, data.length);

    assertEquals(0, reloaded.getNumberOfRuns());
    assertEquals(0, reloaded.getFormattingFontIndex());
    assertEquals(0, reloaded.getFormattingOptions());
    assertEquals("", reloaded.getPhoneticText());
    assertEquals(0, reloaded.getPhRuns().length);
}
/**
 * Reads an ExtRst from raw bytes that declare a data size of 0x0C
 *  and end with two extra bytes, and checks the decoded values.
 */
public void testExtRstFromData() throws Exception {
    byte[] raw = new byte[] {
            01, 00, 0x0C, 00,
            00, 00, 0x37, 00,
            00, 00,
            00, 00, 00, 00,
            00, 00 // Cruft at the end, as found from real files
    };
    assertEquals(16, raw.length);

    ByteArrayInputStream source = new ByteArrayInputStream(raw);
    LittleEndianInputStream leIn = new LittleEndianInputStream(source);
    ExtRst parsed = new ExtRst(leIn, raw.length);

    assertEquals(0x0c, parsed.getDataSize()); // Excludes 4 byte header

    assertEquals(0, parsed.getNumberOfRuns());
    assertEquals(0x37, parsed.getFormattingOptions());
    assertEquals(0, parsed.getFormattingFontIndex());
    assertEquals("", parsed.getPhoneticText());
    assertEquals(0, parsed.getPhRuns().length);
}
/**
 * Feeds in bytes whose leading marker is not 1, and checks that
 *  the result is indistinguishable from an empty ExtRst.
 */
public void testCorruptExtRstDetection() throws Exception {
    byte[] junk = new byte[] {
            0x79, 0x79, 0x11, 0x11,
            0x22, 0x22, 0x33, 0x33,
    };
    assertEquals(8, junk.length);

    ByteArrayInputStream source = new ByteArrayInputStream(junk);
    LittleEndianInputStream leIn = new LittleEndianInputStream(source);
    ExtRst parsed = new ExtRst(leIn, junk.length);

    // Will be empty
    assertEquals(parsed, new ExtRst());

    // If written, will be the usual size
    assertEquals(10, parsed.getDataSize()); // Excludes 4 byte header

    // Is empty
    assertEquals(0, parsed.getNumberOfRuns());
    assertEquals(0, parsed.getFormattingOptions());
    assertEquals(0, parsed.getFormattingFontIndex());
    assertEquals("", parsed.getPhoneticText());
    assertEquals(0, parsed.getPhRuns().length);
}
private static UnicodeString makeUnicodeString(String s) {

View File

@ -36,6 +36,7 @@ import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.EmbeddedObjectRefSubRecord;
import org.apache.poi.hssf.record.NameRecord;
import org.apache.poi.hssf.record.aggregates.FormulaRecordAggregate;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.hssf.record.formula.DeletedArea3DPtg;
import org.apache.poi.hssf.record.formula.Ptg;
import org.apache.poi.ss.usermodel.*;
@ -1538,12 +1539,37 @@ public final class TestBugs extends BaseTestBugzillaIssues {
}
/**
* Round trip a file with an unusual ExtRst record
* Round trip a file with an unusual UnicodeString/ExtRst record parts
*/
public void test47847() {
HSSFWorkbook wb = openSample("47251.xls");
assertEquals(1, wb.getNumberOfSheets());
public void test47847() throws Exception {
HSSFWorkbook wb = openSample("47847.xls");
assertEquals(3, wb.getNumberOfSheets());
// Find the SST record
UnicodeString withExt = wb.getWorkbook().getSSTString(0);
UnicodeString withoutExt = wb.getWorkbook().getSSTString(31);
assertEquals("O:Alloc:Qty", withExt.getString());
assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004);
assertEquals("RT", withoutExt.getString());
assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000);
// Something about continues...
// Write out and re-read
wb = writeOutAndReadBack(wb);
assertEquals(1, wb.getNumberOfSheets());
assertEquals(3, wb.getNumberOfSheets());
// Check it's the same now
withExt = wb.getWorkbook().getSSTString(0);
withoutExt = wb.getWorkbook().getSSTString(31);
assertEquals("O:Alloc:Qty", withExt.getString());
assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004);
assertEquals("RT", withoutExt.getString());
assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000);
}
}

Binary file not shown.