Improve the HDGF LZW code.

Some tweaks to the decompression, and more tests, but mostly work on the compression side. We can now compress small streams properly, and these round-trip fine. However, some longer streams don't compress correctly, and more work on that is still needed (see the disabled unit test)


git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049805 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-12-16 07:41:41 +00:00
parent 4c8a39924b
commit 0b4b029e2a
5 changed files with 482 additions and 368 deletions

View File

@ -72,7 +72,8 @@
HDGFLZW, which will be much better documented, and also
under the ASL. <strong>Completed October 2007</strong></li>
<li>Add compression support to HDGFLZW.
<strong>In progress</strong></li>
<strong>In progress - works for small streams but encoding
goes wrong on larger ones</strong></li>
<li>Have HDGF just write back the raw bytes it read in, and
have a test to ensure the file is un-changed.</li>
<li>Have HDGF generate the bytes to write out from the

View File

@ -34,6 +34,8 @@
<changes>
<release version="3.8-beta1" date="2010-??-??">
<action dev="poi-developers" type="add">Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them</action>
<action dev="poi-developers" type="add">Partial HDGF LZW compression support</action>
<action dev="poi-developers" type="add">50244 - Support for continued NameRecords</action>
<action dev="POI-DEVELOPERS" type="fix">50416 - Correct shifting of the first or last row in a sheet by multiple rows</action>
<action dev="POI-DEVELOPERS" type="fix">50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not)</action>

View File

@ -75,6 +75,7 @@ public byte[] decode(InputStream src) throws IOException {
decode(src,res);
return res.toByteArray();
}
/**
* Perform a streaming decompression of the input.
* Works by:
@ -107,10 +108,10 @@ public void decode(InputStream src, OutputStream res) throws IOException {
// processing each bit of the flag byte in turn
int mask;
// This is a byte as looked up in the dictionary
// These are bytes as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte dataB;
byte[] dataB = new byte[19];
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
@ -158,10 +159,10 @@ public void decode(InputStream src, OutputStream res) throws IOException {
// Loop over the codes, outputting what they correspond to
for(int i=0; i<len; i++) {
buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
dataB = buffer[(pntr + i) & 4095];
res.write(new byte[] {dataB});
dataB[i] = buffer[(pntr + i) & 4095];
buffer[ (pos + i) & 4095 ] = dataB[i];
}
res.write(dataB, 0, len);
// Record how far along the stream we have moved
pos = pos + len;
@ -172,221 +173,9 @@ public void decode(InputStream src, OutputStream res) throws IOException {
/**
* Performs the Visio compatible streaming LZW compression.
* TODO - Finish
*/
public void compress(InputStream src, OutputStream res) throws IOException {
Compressor c = new Compressor();
HDGFLZWCompressor c = new HDGFLZWCompressor();
c.compress(src, res);
}
/**
* Helper class to handle the Visio compatible
* streaming LZW compression.
* Need our own class to handle keeping track of the
* code buffer, pending bytes to write out etc.
*/
private static final class Compressor {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] dict = new byte[4096];
// The next block of data to be written out, minus
// its mask byte
byte[] buffer = new byte[16];
// And how long it is
// (Un-compressed codes are 1 byte each, compressed codes
// are two)
int bufferLen = 0;
// The raw length of a code is limited to 4 bits
byte[] rawCode = new byte[16];
// And how much we're using
int rawCodeLen = 0;
// How far through the input and output streams we are
int posInp = 0;
int posOut = 0;
// What the next mask byte to output will be
int nextMask = 0;
// And how many bits we've already set
int maskBitsSet = 0;
public Compressor() {
//
}
/**
* Returns the last place that the bytes from rawCode are found
* at in the buffer, or -1 if they can't be found
*/
private int findRawCodeInBuffer() {
// Work our way back from the end
// (Visio always seems to use the last possible code)
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
boolean matches = true;
for(int j=0; matches && j<rawCodeLen; j++) {
if(buffer[i] == rawCode[j]) {
// Fits
} else {
// Doesn't fit, can't be a match
matches = false;
}
}
// Was this position a match?
if(matches) {
return i;
}
}
// Not found
return -1;
}
/**
* Output the compressed representation for the bytes
* found in rawCode
*/
private void outputCompressed(OutputStream res) throws IOException {
// It's not worth compressing only 1 or two bytes,
// due to the overheads
// So if asked, just output uncompressed
if(rawCodeLen < 3) {
for(int i=0; i<rawCodeLen; i++) {
outputUncompressed(rawCode[i], res);
}
return;
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Add the length+code to the buffer
// (The position is the first 12 bits, the
// length is the last 4 bits)
// TODO
posOut += 2;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* Output the un-compressed byte
*/
private void outputUncompressed(byte b, OutputStream res) throws IOException {
// Set the mask bit for us
nextMask += (1<<maskBitsSet);
// And add us to the buffer + dictionary
buffer[bufferLen] = fromInt(b);
bufferLen++;
dict[(posOut&4095)] = fromInt(b);
posOut++;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* We've got 8 code worth to write out, so
* output along with the header
*/
private void output8Codes(OutputStream res) throws IOException {
// Output the mask and the data
res.write(new byte[] { fromInt(nextMask) } );
res.write(buffer, 0, bufferLen);
// Reset things
nextMask = 0;
maskBitsSet = 0;
bufferLen = 0;
}
/**
* Does the compression
*/
public void compress(InputStream src, OutputStream res) throws IOException {
// Have we hit the end of the file yet?
boolean going = true;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte dataB;
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
dataB = fromInt(dataI);
// If we've run out of data, output anything that's
// pending then finish
if(!going && rawCodeLen > 0) {
outputCompressed(res);
break;
}
// Try adding this new byte onto rawCode, and
// see if all of that is still found in the
// buffer dictionary or not
rawCode[rawCodeLen] = dataB;
rawCodeLen++;
int rawAt = findRawCodeInBuffer();
// If we found it and are now at 16 bytes,
// we need to output our pending code block
if(rawCodeLen == 16 && rawAt > -1) {
outputCompressed(res);
rawCodeLen = 0;
continue;
}
// If we did find all of rawCode with our new
// byte added on, we can wait to see what happens
// with the next byte
if(rawAt > -1) {
continue;
}
// If we get here, then the rawCode + this byte weren't
// found in the dictionary
// If there was something in rawCode before, then that was
// found in the dictionary, so output that compressed
rawCodeLen--;
if(rawCodeLen > 0) {
// Output the old rawCode
outputCompressed(res);
// Can this byte start a new rawCode, or does
// it need outputting itself?
rawCode[0] = dataB;
rawCodeLen = 1;
if(findRawCodeInBuffer() > -1) {
// Fits in, wait for next byte
continue;
}
// Doesn't fit, output
outputUncompressed(dataB,res);
rawCodeLen = 0;
} else {
// Nothing in rawCode before, so this byte
// isn't in the buffer dictionary
// Output it un-compressed
outputUncompressed(dataB,res);
}
}
}
}
}

View File

@ -0,0 +1,241 @@
package org.apache.poi.hdgf;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
* Helper class to handle the Visio compatible
* streaming LZW compression.
* Need our own class to handle keeping track of the
* code buffer, pending bytes to write out etc.
*
* TODO Fix this, as it starts to go wrong on
* large streams
*/
final class HDGFLZWCompressor {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] dict = new byte[4096];
// The next block of data to be written out, minus
// its mask byte
byte[] buffer = new byte[16];
// And how long it is
// (Un-compressed codes are 1 byte each, compressed codes
// are two)
int bufferLen = 0;
// The raw length of a code is limited to 4 bits + 2
byte[] rawCode = new byte[18];
// And how much we're using
int rawCodeLen = 0;
// How far through the input and output streams we are
int posInp = 0;
int posOut = 0;
// What the next mask byte to output will be
int nextMask = 0;
// And how many bits we've already set
int maskBitsSet = 0;
public HDGFLZWCompressor() {}
/**
* Returns the last place that the bytes from rawCode are found
* at in the buffer, or -1 if they can't be found
*/
private int findRawCodeInBuffer() {
// Work our way through all the codes until we
// find the right one. Visio starts from the end
for(int i=4096-rawCodeLen; i>0; i--) {
boolean matches = true;
for(int j=0; matches && j<rawCodeLen; j++) {
if(dict[i+j] == rawCode[j]) {
// Fits
} else {
// Doesn't fit, can't be a match
matches = false;
}
}
// Was this position a match?
if(matches) {
return i;
}
}
// Not found
return -1;
}
/**
* Output the compressed representation for the bytes
* found in rawCode
*/
private void outputCompressed(OutputStream res) throws IOException {
// It's not worth compressing only 1 or two bytes,
// due to the overheads
// So if asked, just output uncompressed
if(rawCodeLen < 3) {
for(int i=0; i<rawCodeLen; i++) {
outputUncompressed(rawCode[i], res);
}
return;
}
// Grab where the data lives
int codesAt = findRawCodeInBuffer();
codesAt -= 18;
if(codesAt < 0) {
codesAt += 4096;
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Add the length+code to the buffer
// (The position is the first 12 bits, the
// length is the last 4 bits)
int bp1 = (codesAt & 255);
int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
buffer[bufferLen] = HDGFLZW.fromInt(bp1);
bufferLen++;
buffer[bufferLen] = HDGFLZW.fromInt(bp2);
bufferLen++;
// Copy the data to the dictionary in the new place
for(int i=0; i<rawCodeLen; i++) {
dict[(posOut&4095)] = rawCode[i];
posOut++;
}
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* Output the un-compressed byte
*/
private void outputUncompressed(byte b, OutputStream res) throws IOException {
// Set the mask bit for us
nextMask += (1<<maskBitsSet);
maskBitsSet++;
// And add us to the buffer + dictionary
buffer[bufferLen] = b;
bufferLen++;
dict[(posOut&4095)] = b;
posOut++;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* We've got 8 code worth to write out, so
* output along with the header
*/
private void output8Codes(OutputStream res) throws IOException {
// Output the mask and the data
res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
res.write(buffer, 0, bufferLen);
// Reset things
nextMask = 0;
maskBitsSet = 0;
bufferLen = 0;
}
/**
* Does the compression
*/
public void compress(InputStream src, OutputStream res) throws IOException {
// Have we hit the end of the file yet?
boolean going = true;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte dataB;
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
dataB = HDGFLZW.fromInt(dataI);
// If we've run out of data, output anything that's
// pending then finish
if(!going) {
if(rawCodeLen > 0) {
outputCompressed(res);
if(maskBitsSet > 0) {
output8Codes(res);
}
}
break;
}
// Try adding this new byte onto rawCode, and
// see if all of that is still found in the
// buffer dictionary or not
rawCode[rawCodeLen] = dataB;
rawCodeLen++;
int rawAt = findRawCodeInBuffer();
// If we found it and are now at 18 bytes,
// we need to output our pending code block
if(rawCodeLen == 18 && rawAt > -1) {
outputCompressed(res);
rawCodeLen = 0;
continue;
}
// If we did find all of rawCode with our new
// byte added on, we can wait to see what happens
// with the next byte
if(rawAt > -1) {
continue;
}
// If we get here, then the rawCode + this byte weren't
// found in the dictionary
// If there was something in rawCode before, then that was
// found in the dictionary, so output that compressed
rawCodeLen--;
if(rawCodeLen > 0) {
// Output the old rawCode
outputCompressed(res);
// Can this byte start a new rawCode, or does
// it need outputting itself?
rawCode[0] = dataB;
rawCodeLen = 1;
if(findRawCodeInBuffer() > -1) {
// Fits in, wait for next byte
continue;
}
// Doesn't fit, output
outputUncompressed(dataB,res);
rawCodeLen = 0;
} else {
// Nothing in rawCode before, so this byte
// isn't in the buffer dictionary
// Output it un-compressed
outputUncompressed(dataB,res);
}
}
}
}

View File

@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase {
-21, -16, // 3 @ 4093
1, 0, 0, -72,
-13, -16, // 3 @ 5
78, // *mask bit*
78, // *mask bit* 2,3,4,7
-32, -5, // 14 @ 4082
1, 0, 3,
-21, -16, // 3 @ 4093
10, 5, // 8 @ 28
4,
-21, -16, // 3 @ 4093
21, // *mask bit*
21, // *mask bit* 1,3,5
9,
-21, -16, // 3 @ 4093
103, -21, -16, 34,
103,
-21, -16, // 3 @ 4093
34,
-36, -1, // 18 @ 4078
52, 15, // 18 @ 70
70, 15, // 18 @ 88
@ -169,7 +171,73 @@ public final class TestHDGFLZW extends TestCase {
}
}
public void DISABLEDtestCompress() throws Exception {
/**
* Test that we can round-trip a little bit.
* Uses a part short enough that we agree with visio
* on the best way to compress it
*/
public void testCompressMini() throws Exception {
// first 11 bytes compressed = 12 bytes uncompressed
byte[] sourceComp = new byte[11];
byte[] sourceDecomp = new byte[12];
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
// Compress it using our engine
HDGFLZW lzw = new HDGFLZW();
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
// Now decompress it again
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
// First up, check the round tripping
assertEquals(12, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
}
// Now check the compressed intermediate version
assertEquals(11, comp.length);
for(int i=0; i<comp.length; i++) {
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
}
}
/**
* Tests that we can do several mask pages
*/
public void testCompressMidi() throws Exception {
// First 12 -> 11
// Next 32 -> 13
byte[] sourceComp = new byte[24];
byte[] sourceDecomp = new byte[44];
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
// Compress it using our engine
HDGFLZW lzw = new HDGFLZW();
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
// We should be 3 characters bigger, as
// we split one compressed bit into two
assertEquals(27, comp.length);
// Now decompress it again
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
// We can only check the round-tripping, as for now
// visio cheats on re-using a block
assertEquals(44, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], sourceDecomp[i]);
}
}
/**
* Gets 160 bytes through then starts going wrong...
* TODO Fix this
*/
public void DISABLEDtestCompressFull() throws Exception {
assertEquals(339, testTrailerComp.length);
assertEquals(632, testTrailerDecomp.length);
@ -177,11 +245,24 @@ public final class TestHDGFLZW extends TestCase {
HDGFLZW lzw = new HDGFLZW();
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
// Now check it's the right data
// Now decompress it again
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
// for(int i=0; i<comp.length; i++) {
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
// }
// First up, check the round tripping
// assertEquals(632, decomp.length);
for(int i=0; i<decomp.length; i++) {
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
}
// Now check the compressed intermediate version
assertEquals(339, comp.length);
for(int i=0; i<comp.length; i++) {
if(comp[i] != testTrailerComp[i])
System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
}
}
}