Fix Visio compression

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1872223 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2020-01-01 22:44:42 +00:00
parent 07b5bc667c
commit adb8424bc1
4 changed files with 525 additions and 616 deletions

View File

@ -26,13 +26,18 @@ import java.io.OutputStream;
* various LZW implementations in the different file
* formats.
* It's currently used by HDGF and HMEF.
*
* <p>
* Two good resources on LZW are:
* http://en.wikipedia.org/wiki/LZW
* http://marknelson.us/1989/10/01/lzw-data-compression/
*/
public abstract class LZWDecompresser {
/** the size of our dictionary */
public static final int DICT_SIZE = 0x1000;
/** the mask for calculating / wrapping dictionary offsets */
public static final int DICT_MASK = 0xFFF;
//arbitrarily selected; may need to increase
private static final int MAX_RECORD_LENGTH = 1_000_000;
@ -82,7 +87,7 @@ public abstract class LZWDecompresser {
*/
public byte[] decompress(InputStream src) throws IOException {
ByteArrayOutputStream res = new ByteArrayOutputStream();
decompress(src,res);
decompress(src, res);
return res.toByteArray();
}
@ -117,13 +122,13 @@ public abstract class LZWDecompresser {
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] buffer = new byte[4096];
final byte[] buffer = new byte[DICT_SIZE];
pos = populateDictionary(buffer);
// These are bytes as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte[] dataB = IOUtils.safelyAllocate(16+codeLengthIncrease, MAX_RECORD_LENGTH);
final byte[] dataB = IOUtils.safelyAllocate(16 + codeLengthIncrease, MAX_RECORD_LENGTH);
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
@ -133,74 +138,52 @@ public abstract class LZWDecompresser {
// dictionary to start at
int len, pntr;
while( (flag = src.read()) != -1 ) {
while ((flag = src.read()) != -1) {
// Compare each bit in our flag byte in turn:
for(mask = 1; mask < 256 ; mask <<= 1) {
for (mask = 1; mask < 0x100; mask <<= 1) {
// Is this a new code (un-compressed), or
// the use of existing codes (compressed)?
boolean isMaskSet = (flag & mask) > 0;
if( isMaskSet ^ maskMeansCompressed ) {
if (isMaskSet ^ maskMeansCompressed) {
// Retrieve the un-compressed code
if( (dataI = src.read()) != -1) {
if ((dataI = src.read()) != -1) {
// Save the byte into the dictionary
buffer[(pos&4095)] = fromInt(dataI);
pos++;
buffer[pos++ & DICT_MASK] = (byte) dataI;
// And output the byte
res.write( new byte[] {fromInt(dataI)} );
res.write(dataI);
}
} else {
// We have a compressed sequence
// Grab the next 16 bits of data
dataIPt1 = src.read();
dataIPt2 = src.read();
if(dataIPt1 == -1 || dataIPt2 == -1) break;
if (dataIPt1 == -1 || dataIPt2 == -1) break;
// Build up how long the code sequence is, and
// what position of the code to start at
// (The position is usually the first 12 bits,
// and the length is usually the last 4 bits)
len = (dataIPt2 & 15) + codeLengthIncrease;
if(positionIsBigEndian) {
pntr = (dataIPt1<<4) + (dataIPt2>>4);
len = (dataIPt2 & 0x0F) + codeLengthIncrease;
if (positionIsBigEndian) {
pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
} else {
pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
}
// Adjust the pointer as needed
pntr = adjustDictionaryOffset(pntr);
// Loop over the codes, outputting what they correspond to
for(int i=0; i<len; i++) {
dataB[i] = buffer[(pntr + i) & 4095];
buffer[ (pos + i) & 4095 ] = dataB[i];
for (int i = 0; i < len; i++) {
dataB[i] = buffer[(pntr + i) & DICT_MASK];
buffer[(pos + i) & DICT_MASK] = dataB[i];
}
res.write(dataB, 0, len);
// Record how far along the stream we have moved
pos = pos + len;
pos += len;
}
}
}
}
/**
* Given an integer, turn it into a java byte, handling
* the wrapping.
* This is a convenience method
*/
public static byte fromInt(int b) {
if(b < 128) return (byte)b;
return (byte)(b - 256);
}
/**
* Given a java byte, turn it into an integer between 0
* and 255 (i.e. handle the unwrapping).
* This is a convenience method
*/
public static int fromByte(byte b) {
if(b >= 0) {
return b;
}
return b + 256;
}
}

View File

@ -89,7 +89,7 @@ public class HDGFLZW extends LZWDecompresser {
* or the OutputStream can't be written to
*/
public void compress(InputStream src, OutputStream res) throws IOException {
HDGFLZWCompressor c = new HDGFLZWCompressor();
c.compress(src, res);
HDGFLZWCompressor c = new HDGFLZWCompressor(res);
c.compress(src);
}
}

View File

@ -17,18 +17,18 @@
package org.apache.poi.hdgf;
import static org.apache.poi.util.LZWDecompresser.DICT_MASK;
import static org.apache.poi.util.LZWDecompresser.DICT_SIZE;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
* Helper class to handle the Visio compatible
* streaming LZW compression.
* Need our own class to handle keeping track of the
* code buffer, pending bytes to write out etc.
*
* TODO Fix this, as it starts to go wrong on
* large streams
* Helper class to handle the Visio compatible streaming LZW compression.
* Need our own class to handle keeping track of the code buffer, pending bytes to write out etc.
* <p>
* TODO Fix this, as it starts to go wrong on large streams
*/
/* package */ final class HDGFLZWCompressor {
// We use 12 bit codes:
@ -36,18 +36,16 @@ import java.io.OutputStream;
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
private byte[] dict = new byte[4096];
private final byte[] dict = new byte[DICT_SIZE];
// The next block of data to be written out, minus
// its mask byte
private byte[] buffer = new byte[16];
// The next block of data to be written out, minus its mask byte
private final byte[] buffer = new byte[16];
// And how long it is
// (Un-compressed codes are 1 byte each, compressed codes
// are two)
// (Un-compressed codes are 1 byte each, compressed codes are two)
private int bufferLen;
// The raw length of a code is limited to 4 bits + 2
private byte[] rawCode = new byte[18];
private final byte[] rawCode = new byte[18];
// And how much we're using
private int rawCodeLen;
@ -60,199 +58,186 @@ import java.io.OutputStream;
// And how many bits we've already set
private int maskBitsSet;
public HDGFLZWCompressor() {}
private final OutputStream res;
/**
public HDGFLZWCompressor(OutputStream res) {
this.res = res;
}
/**
* Returns the last place that the bytes from rawCode are found
* at in the buffer, or -1 if they can't be found
*/
private int findRawCodeInBuffer() {
private int findRawCodeInBuffer() {
// Work our way through all the codes until we
// find the right one. Visio starts from the end
for(int i=4096-rawCodeLen; i>0; i--) {
for (int i = rawCodeLen+1; i < DICT_SIZE; i++) {
int pos = (posInp - i) & DICT_MASK;
// In the example data, it seems that the compressor doesn't like to wrap beyond DICT_SIZE
// if (pos + rawCodeLen > DICT_SIZE) continue;
boolean matches = true;
for(int j=0; matches && j<rawCodeLen; j++) {
if(dict[i+j] == rawCode[j]) {
// Fits
} else {
for (int j = 0; j < rawCodeLen; j++) {
if (dict[(pos + j) & DICT_MASK] != rawCode[j]) {
// Doesn't fit, can't be a match
matches = false;
break;
}
}
// Was this position a match?
if(matches) {
return i;
if (matches) {
return pos;
}
}
// Not found
return -1;
}
}
/**
/**
* Output the compressed representation for the bytes
* found in rawCode
*/
private void outputCompressed(OutputStream res) throws IOException {
// It's not worth compressing only 1 or two bytes,
// due to the overheads
private void outputCompressed() throws IOException {
// It's not worth compressing only 1 or two bytes, due to the overheads
// So if asked, just output uncompressed
if(rawCodeLen < 3) {
for(int i=0; i<rawCodeLen; i++) {
outputUncompressed(rawCode[i], res);
if (rawCodeLen < 3) {
final int rcl = rawCodeLen;
for (int i = 0; i < rcl; i++) {
outputUncompressed(rawCode[i]);
}
return;
}
// Grab where the data lives
int codesAt = findRawCodeInBuffer();
codesAt -= 18;
if(codesAt < 0) {
codesAt += 4096;
}
codesAt = (codesAt-18) & DICT_MASK;
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Add the length+code to the buffer
// (The position is the first 12 bits, the
// length is the last 4 bits)
int bp1 = (codesAt & 255);
int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
buffer[bufferLen] = HDGFLZW.fromInt(bp1);
bufferLen++;
buffer[bufferLen] = HDGFLZW.fromInt(bp2);
bufferLen++;
// (The position is the first 12 bits, the length is the last 4 bits)
int bp1 = (codesAt & 0xFF);
int bp2 = (rawCodeLen - 3) + ((codesAt - bp1) >>> 4);
buffer[bufferLen++] = (byte) bp1;
buffer[bufferLen++] = (byte) bp2;
// Copy the data to the dictionary in the new place
for(int i=0; i<rawCodeLen; i++) {
dict[(posOut&4095)] = rawCode[i];
posOut++;
}
assert(maskBitsSet <= 8);
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
if (maskBitsSet == 8) {
output8Codes();
}
}
/**
rawCodeLen = 0;
}
/**
* Output the un-compressed byte
*/
private void outputUncompressed(byte b, OutputStream res) throws IOException {
private void outputUncompressed(byte b) throws IOException {
// Set the mask bit for us
nextMask += (1<<maskBitsSet);
nextMask += (1 << maskBitsSet);
maskBitsSet++;
// And add us to the buffer + dictionary
buffer[bufferLen] = b;
bufferLen++;
dict[(posOut&4095)] = b;
posOut++;
buffer[bufferLen++] = b;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
if (maskBitsSet == 8) {
output8Codes();
}
}
/**
rawCodeLen = 0;
}
/**
* We've got 8 codes' worth to write out, so
* output them along with the header
*/
private void output8Codes(OutputStream res) throws IOException {
private void output8Codes() throws IOException {
// Output the mask and the data
res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
res.write(nextMask);
res.write(buffer, 0, bufferLen);
posOut += 1 + bufferLen;
// Reset things
nextMask = 0;
maskBitsSet = 0;
bufferLen = 0;
}
}
/**
/**
* Does the compression
*/
public void compress(InputStream src, OutputStream res) throws IOException {
// Have we hit the end of the file yet?
boolean going = true;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte dataB;
public void compress(InputStream src) throws IOException {
int dataI = -1;
while (true) {
if (dataI > -1) {
// Copy the last read byte into the dictionary.
// The compressor that produced the example data used self-references, so we don't wait to fill
// the dictionary until we know whether it's a compressed or an un-compressed token.
dict[(posInp++) & DICT_MASK] = (byte)dataI;
}
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
dataB = HDGFLZW.fromInt(dataI);
// If we've run out of data, output anything that's
// pending then finish
if(!going) {
if(rawCodeLen > 0) {
outputCompressed(res);
if(maskBitsSet > 0) {
output8Codes(res);
// If we've run out of data, output anything that's pending then finish
if (dataI == -1) {
if (rawCodeLen > 0) {
outputCompressed();
if (maskBitsSet > 0) {
output8Codes();
}
}
break;
}
// Try adding this new byte onto rawCode, and
// see if all of that is still found in the
// buffer dictionary or not
rawCode[rawCodeLen] = dataB;
rawCodeLen++;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to the output stream
byte dataB = (byte) dataI;
// Try adding this new byte onto rawCode, and see if all of that is still found
// in the buffer dictionary or not
rawCode[rawCodeLen++] = dataB;
int rawAt = findRawCodeInBuffer();
// If we found it and are now at 18 bytes,
// we need to output our pending code block
if(rawCodeLen == 18 && rawAt > -1) {
outputCompressed(res);
rawCodeLen = 0;
if (rawAt > -1) {
// If we found it and are now at 18 bytes, we need to output our pending code block
if (rawCodeLen == 18) {
outputCompressed();
}
// If we did find all of rawCode with our new byte added on,
// we can wait to see what happens with the next byte
continue;
}
// If we did find all of rawCode with our new
// byte added on, we can wait to see what happens
// with the next byte
if(rawAt > -1) {
continue;
}
// If we get here, then the rawCode + this byte weren't
// found in the dictionary
// If we get here, then the rawCode + this byte weren't found in the dictionary
// If there was something in rawCode before, then that was
// found in the dictionary, so output that compressed
rawCodeLen--;
if(rawCodeLen > 0) {
if (rawCodeLen > 0) {
// Output the old rawCode
outputCompressed(res);
outputCompressed();
// Can this byte start a new rawCode, or does
// it need outputting itself?
// Can this byte start a new rawCode, or does it need outputting itself?
rawCode[0] = dataB;
rawCodeLen = 1;
if(findRawCodeInBuffer() > -1) {
if (findRawCodeInBuffer() > -1) {
// Fits in, wait for next byte
continue;
}
// Doesn't fit, output
outputUncompressed(dataB,res);
rawCodeLen = 0;
outputUncompressed(dataB);
} else {
// Nothing in rawCode before, so this byte
// isn't in the buffer dictionary
// Nothing in rawCode before, so this byte isn't in the buffer dictionary
// Output it un-compressed
outputUncompressed(dataB,res);
outputUncompressed(dataB);
}
}
}
}
}

View File

@ -17,16 +17,17 @@
package org.apache.poi.hdgf;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayInputStream;
import java.util.Arrays;
import org.junit.Ignore;
import org.junit.Test;
public final class TestHDGFLZW {
public static final byte[] testTrailerComp = {
123, // *mask bit*
123, // *mask bit* 1,2,4-7
-60, 2,
-21, -16, // 3 @ 4093
1, 0, 0, -72,
@ -74,7 +75,7 @@ public final class TestHDGFLZW {
-16, -92, 66, 127, 85, 1, 98, 119, 0, 0, -48, 79, 18, -3, 50, -17,
1, 67, 85, 1, 81, -127, 0, -41, 0, 14, 6, 4, 17, 63, -63, 17, 68,
85, -65, 1, 30, -120, 0, 0, 42, 79, 18, 68, 126, -21, -16, -76, 69,
85, 1, 102, -119, 72, 37, 0, 97, 33 };
85, 1, 102, -119, 72, 37, 0, 97, 33};
public static final byte[] testTrailerDecomp = {
-60, 2, 0, 0, 0, 1, 0, 0, -72, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0,
@ -111,34 +112,6 @@ public final class TestHDGFLZW {
0, 0, 42, 1, 0, 0, 84, 0, 0, 0, 0, 0
};
@Test
public void testFromToInt() {
byte b255 = -1;
assertEquals(255, HDGFLZW.fromByte(b255));
assertEquals(-1, HDGFLZW.fromInt( HDGFLZW.fromByte(b255) ));
assertEquals(-1, HDGFLZW.fromInt( 255 ));
byte b11 = 11;
assertEquals(11, HDGFLZW.fromByte(b11));
assertEquals(11, HDGFLZW.fromInt( HDGFLZW.fromByte(b11) ));
assertEquals(11, HDGFLZW.fromInt( 11 ));
byte b0 = 0;
assertEquals(0, HDGFLZW.fromByte(b0));
assertEquals(0, HDGFLZW.fromInt( HDGFLZW.fromByte(b0) ));
assertEquals(0, HDGFLZW.fromInt( 0 ));
byte b127 = 127;
assertEquals(127, HDGFLZW.fromByte(b127));
assertEquals(127, HDGFLZW.fromInt( HDGFLZW.fromByte(b127) ));
assertEquals(127, HDGFLZW.fromInt( 127 ));
byte b128 = -128;
assertEquals(128, HDGFLZW.fromByte(b128));
assertEquals(-128, HDGFLZW.fromInt( HDGFLZW.fromByte(b128) ));
assertEquals(-128, HDGFLZW.fromInt( 128 ));
}
@Test
public void testCounts() throws Exception {
assertEquals(339, testTrailerComp.length);
@ -170,11 +143,7 @@ public final class TestHDGFLZW {
byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
// Now check it's the right data
assertEquals(632, dec.length);
for(int i=0; i<dec.length; i++) {
if(dec[i] != testTrailerDecomp[i])
System.err.println(i + "\t" + dec[i] + "\t" + testTrailerDecomp[i]);
}
assertArrayEquals(testTrailerDecomp, dec);
}
/**
@ -185,10 +154,7 @@ public final class TestHDGFLZW {
@Test
public void testCompressMini() throws Exception {
// first 11 bytes compressed = 12 bytes uncompressed
byte[] sourceComp = new byte[11];
byte[] sourceDecomp = new byte[12];
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
byte[] sourceDecomp = Arrays.copyOf(testTrailerDecomp, 12);
// Compress it using our engine
HDGFLZW lzw = new HDGFLZW();
@ -199,15 +165,11 @@ public final class TestHDGFLZW {
// First up, check the round tripping
assertEquals(12, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
}
assertArrayEquals(Arrays.copyOfRange(testTrailerDecomp, 0, decomp.length), decomp);
// Now check the compressed intermediate version
assertEquals(11, comp.length);
for(int i=0; i<comp.length; i++) {
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
}
assertArrayEquals(Arrays.copyOfRange(testTrailerComp, 0, comp.length), comp);
}
/**
@ -217,62 +179,41 @@ public final class TestHDGFLZW {
public void testCompressMidi() throws Exception {
// First 12 -> 11
// Next 32 -> 13
byte[] sourceComp = new byte[24];
byte[] sourceDecomp = new byte[44];
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
byte[] sourceDecomp = Arrays.copyOf(testTrailerDecomp, 44);
// Compress it using our engine
HDGFLZW lzw = new HDGFLZW();
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
// We should be 3 characters bigger, as
// we split one compressed bit into two
assertEquals(27, comp.length);
assertEquals(24, comp.length);
// Now decompress it again
byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
// We can only check the round-tripping, as for now
// visio cheats on re-using a block
assertEquals(44, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], sourceDecomp[i]);
}
assertArrayEquals(sourceDecomp, decomp);
}
/**
* Gets 160 bytes through then starts going wrong...
* TODO Fix this
*/
@Test
@Ignore
public void testCompressFull() throws Exception {
assertEquals(339, testTrailerComp.length);
assertEquals(632, testTrailerDecomp.length);
// Compress it using our engine
HDGFLZW lzw = new HDGFLZW();
byte[] decomp2 = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
assertArrayEquals(testTrailerDecomp, decomp2);
// Compress it using our engine
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
// the compressed binary differs, as the run length searching finds different results
// but the decompressed data is the same
// Now decompress it again
byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
// for(int i=0; i<comp.length; i++) {
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
// }
// First up, check the round tripping
// assertEquals(632, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
}
// Now check the compressed intermediate version
assertEquals(339, comp.length);
for(int i=0; i<comp.length; i++) {
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
}
assertArrayEquals(testTrailerDecomp, decomp);
}
}