Two more differences between the LZW variants used in HDGF and HMEF:

* Little Endian (HDGF) vs Big Endian (HMEF) storage of the code position
* The initial dictionary position is the end of the pre-fill, if there is one, rather than always being position 0


git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1078300 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-03-05 15:25:39 +00:00
parent 5805f3b660
commit f05c5454fc
4 changed files with 61 additions and 32 deletions

View File

@ -41,23 +41,36 @@ public abstract class LZWDecompresser {
* to get the real code length? Normally 2 or 3
*/
private final int codeLengthIncrease;
/**
* Does the 12 bits of the position get stored in
* Little Endian or Big Endian form?
* This controls whether a pos+length of 0x12 0x34
* becomes a position of 0x123 or 0x312
*/
private final boolean positionIsBigEndian;
protected LZWDecompresser(boolean maskMeansCompressed, int codeLengthIncrease) {
protected LZWDecompresser(boolean maskMeansCompressed,
int codeLengthIncrease, boolean positionIsBigEndian) {
this.maskMeansCompressed = maskMeansCompressed;
this.codeLengthIncrease = codeLengthIncrease;
this.positionIsBigEndian = positionIsBigEndian;
}
/**
* Populates the dictionary. May not need
* to do anything if all zeros is fine.
* Populates the dictionary, and returns where in it
* to begin writing new codes.
* Generally, if the dictionary is pre-populated, then new
* codes should be placed at the end of that block.
* Equally, if the dictionary is left with all zeros, then
* usually the new codes can go in at the start.
*/
protected abstract void populateDictionary(byte[] dict);
protected abstract int populateDictionary(byte[] dict);
/**
* Adjusts the position offset if needed when looking
* something up in the dictionary.
*/
protected abstract int adjustDictionaryOffset(int offset);
protected abstract int adjustDictionaryOffset(int offset);
/**
* Decompresses the given input stream, returning the array of bytes
@ -83,17 +96,10 @@ public abstract class LZWDecompresser {
* flag byte
*/
public void decompress(InputStream src, OutputStream res) throws IOException {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] buffer = new byte[4096];
populateDictionary(buffer);
// How far through the output we've got
// (This is normally used &4095, so it nicely wraps)
int pos = 0;
// The initial value is set when populating the dictionary
int pos;
// The flag byte is treated as its 8 individual
// bits, which tell us if the following 8 codes
// are compressed or un-compressed
@ -102,10 +108,18 @@ public abstract class LZWDecompresser {
// processing each bit of the flag byte in turn
int mask;
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] buffer = new byte[4096];
pos = populateDictionary(buffer);
// These are bytes as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte[] dataB = new byte[19];
byte[] dataB = new byte[16+codeLengthIncrease];
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
@ -121,7 +135,7 @@ public abstract class LZWDecompresser {
// Is this a new code (un-compressed), or
// the use of existing codes (compressed)?
boolean isMaskSet = (flag & mask) > 0;
if( isMaskSet && !maskMeansCompressed ) {
if( isMaskSet ^ maskMeansCompressed ) {
// Retrieve the un-compressed code
if( (dataI = src.read()) != -1) {
// Save the byte into the dictionary
@ -139,11 +153,15 @@ public abstract class LZWDecompresser {
// Build up how long the code sequence is, and
// what position of the code to start at
// (The position is the first 12 bits, the
// length is the last 4 bits)
// (The position is usually the first 12 bits,
// and the length is usually the last 4 bits)
len = (dataIPt2 & 15) + codeLengthIncrease;
pntr = (dataIPt2 & 240)*16 + dataIPt1;
if(positionIsBigEndian) {
pntr = (dataIPt1<<4) + (dataIPt2>>4);
} else {
pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
}
// Adjust the pointer as needed
pntr = adjustDictionaryOffset(pntr);

View File

@ -37,8 +37,10 @@ import org.apache.poi.util.LZWDecompresser;
*/
public class HDGFLZW extends LZWDecompresser {
public HDGFLZW() {
// We're the wrong way round!
super(false, 3);
// Our flag is the wrong way round!
// Length wise, we're 3 longer than we say, so the max len is 19
// Endian wise, we're little endian, so 0x1234 is pos 0x312
super(false, 3, false);
}
/**
@ -63,12 +65,13 @@ public class HDGFLZW extends LZWDecompresser {
}
return pntr;
}
/**
* We want an empty dictionary, so do nothing
*/
@Override
protected void populateDictionary(byte[] dict) {
protected int populateDictionary(byte[] dict) {
return 0;
}
/**

View File

@ -54,7 +54,10 @@ public final class CompressedRTF extends LZWDecompresser {
"{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
public CompressedRTF() {
super(true, 2);
// Our flag has the normal meaning
// Length wise, we're 2 longer than we say, so the max len is 18
// Endian wise, we're big endian, so 0x1234 is pos 0x123
super(true, 2, true);
}
public void decompress(InputStream src, OutputStream res) throws IOException {
@ -80,17 +83,24 @@ public final class CompressedRTF extends LZWDecompresser {
super.decompress(src, res);
}
/**
* We use regular dictionary offsets, so no
* need to change anything
*/
@Override
protected int adjustDictionaryOffset(int offset) {
// TODO Do we need to change anything?
return 0;
return offset;
}
@Override
protected void populateDictionary(byte[] dict) {
protected int populateDictionary(byte[] dict) {
try {
// Copy in the RTF constants
byte[] preload = LZW_RTF_PRELOAD.getBytes("US-ASCII");
System.arraycopy(preload, 0, dict, 0, preload.length);
// Start adding new codes after the constants
return preload.length;
} catch(UnsupportedEncodingException e) {
throw new RuntimeException("Your JVM is broken as it doesn't support US ASCII");
}

View File

@ -93,7 +93,7 @@ public final class TestCompressedRTF extends TestCase {
* Check that we can decode the first 8 codes
* (1 flag byte + 8 codes)
*/
public void DISABLEDtestFirstBlock() throws Exception {
public void testFirstBlock() throws Exception {
HMEFMessage msg = new HMEFMessage(
_samples.openResourceAsStream("quick-winmail.dat")
);
@ -112,7 +112,6 @@ public final class TestCompressedRTF extends TestCase {
String decompStr = new String(decomp, "ASCII");
// Test
System.err.println(decompStr);
assertEquals(block1.length(), decomp.length);
assertEquals(block1, decompStr);
}
@ -121,7 +120,7 @@ System.err.println(decompStr);
* Check that we can decode the first 16 codes
* (flag + 8 codes, flag + 8 codes)
*/
public void DISABLEDtestFirstTwoBlocks() throws Exception {
public void testFirstTwoBlocks() throws Exception {
HMEFMessage msg = new HMEFMessage(
_samples.openResourceAsStream("quick-winmail.dat")
);
@ -140,7 +139,6 @@ System.err.println(decompStr);
String decompStr = new String(decomp, "ASCII");
// Test
System.err.println(decompStr);
assertEquals(block2.length(), decomp.length);
assertEquals(block2, decompStr);
}