mirror of https://github.com/apache/poi.git
Improve the HDGF LZW code.
Some tweaks to the decompression, and more tests, but mostly work on the compression side. We can now compress small streams properly, and these round-trip fine. However, some longer streams don't compress correctly, and more work on that is still needed (see the disabled unit test). git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049805 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4c8a39924b
commit
0b4b029e2a
|
@ -72,7 +72,8 @@
|
||||||
HDGFLZW, which will be much better documented, and also
|
HDGFLZW, which will be much better documented, and also
|
||||||
under the ASL. <strong>Completed October 2007</strong></li>
|
under the ASL. <strong>Completed October 2007</strong></li>
|
||||||
<li>Add compression support to HDGFLZW.
|
<li>Add compression support to HDGFLZW.
|
||||||
<strong>In progress</strong></li>
|
<strong>In progress - works for small streams but encoding
|
||||||
|
goes wrong on larger ones</strong></li>
|
||||||
<li>Have HDGF just write back the raw bytes it read in, and
|
<li>Have HDGF just write back the raw bytes it read in, and
|
||||||
have a test to ensure the file is un-changed.</li>
|
have a test to ensure the file is un-changed.</li>
|
||||||
<li>Have HDGF generate the bytes to write out from the
|
<li>Have HDGF generate the bytes to write out from the
|
||||||
|
|
|
@ -34,6 +34,8 @@
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.8-beta1" date="2010-??-??">
|
<release version="3.8-beta1" date="2010-??-??">
|
||||||
|
<action dev="poi-developers" type="add">Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them</action>
|
||||||
|
<action dev="poi-developers" type="add">Partial HDGF LZW compression support</action>
|
||||||
<action dev="poi-developers" type="add">50244 - Support for continued NameRecords</action>
|
<action dev="poi-developers" type="add">50244 - Support for continued NameRecords</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">50416 - Correct shifting of the first or last row in a sheet by multiple rows</action>
|
<action dev="POI-DEVELOPERS" type="fix">50416 - Correct shifting of the first or last row in a sheet by multiple rows</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not)</action>
|
<action dev="POI-DEVELOPERS" type="fix">50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not)</action>
|
||||||
|
|
|
@ -35,47 +35,48 @@ import java.io.OutputStream;
|
||||||
*/
|
*/
|
||||||
public class HDGFLZW {
|
public class HDGFLZW {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given an integer, turn it into a java byte, handling
|
* Given an integer, turn it into a java byte, handling
|
||||||
* the wrapping.
|
* the wrapping.
|
||||||
* This is a convenience method
|
* This is a convenience method
|
||||||
*/
|
*/
|
||||||
public static byte fromInt(int b) {
|
public static byte fromInt(int b) {
|
||||||
if(b < 128) return (byte)b;
|
if(b < 128) return (byte)b;
|
||||||
return (byte)(b - 256);
|
return (byte)(b - 256);
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Given a java byte, turn it into an integer between 0
|
* Given a java byte, turn it into an integer between 0
|
||||||
* and 255 (i.e. handle the unwrapping).
|
* and 255 (i.e. handle the unwrapping).
|
||||||
* This is a convenience method
|
* This is a convenience method
|
||||||
*/
|
*/
|
||||||
public static int fromByte(byte b) {
|
public static int fromByte(byte b) {
|
||||||
if(b >= 0) {
|
if(b >= 0) {
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
return b + 256;
|
return b + 256;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compress the given input stream, returning the array of bytes
|
* Compress the given input stream, returning the array of bytes
|
||||||
* of the compressed input
|
* of the compressed input
|
||||||
*/
|
*/
|
||||||
public byte[] compress(InputStream src) throws IOException {
|
public byte[] compress(InputStream src) throws IOException {
|
||||||
ByteArrayOutputStream res = new ByteArrayOutputStream();
|
ByteArrayOutputStream res = new ByteArrayOutputStream();
|
||||||
compress(src,res);
|
compress(src,res);
|
||||||
return res.toByteArray();
|
return res.toByteArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decompresses the given input stream, returning the array of bytes
|
* Decompresses the given input stream, returning the array of bytes
|
||||||
* of the decompressed input.
|
* of the decompressed input.
|
||||||
*/
|
*/
|
||||||
public byte[] decode(InputStream src) throws IOException {
|
public byte[] decode(InputStream src) throws IOException {
|
||||||
ByteArrayOutputStream res = new ByteArrayOutputStream();
|
ByteArrayOutputStream res = new ByteArrayOutputStream();
|
||||||
decode(src,res);
|
decode(src,res);
|
||||||
return res.toByteArray();
|
return res.toByteArray();
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
|
/**
|
||||||
* Perform a streaming decompression of the input.
|
* Perform a streaming decompression of the input.
|
||||||
* Works by:
|
* Works by:
|
||||||
* 1) Reading a flag byte, the 8 bits of which tell you if the
|
* 1) Reading a flag byte, the 8 bits of which tell you if the
|
||||||
|
@ -88,7 +89,7 @@ public byte[] decode(InputStream src) throws IOException {
|
||||||
* 5) Loop until we've done all 8 bits, then read in the next
|
* 5) Loop until we've done all 8 bits, then read in the next
|
||||||
* flag byte
|
* flag byte
|
||||||
*/
|
*/
|
||||||
public void decode(InputStream src, OutputStream res) throws IOException {
|
public void decode(InputStream src, OutputStream res) throws IOException {
|
||||||
// We use 12 bit codes:
|
// We use 12 bit codes:
|
||||||
// * 0-255 are real bytes
|
// * 0-255 are real bytes
|
||||||
// * 256-4095 are the substring codes
|
// * 256-4095 are the substring codes
|
||||||
|
@ -107,10 +108,10 @@ public void decode(InputStream src, OutputStream res) throws IOException {
|
||||||
// processing each bit of the flag byte in turn
|
// processing each bit of the flag byte in turn
|
||||||
int mask;
|
int mask;
|
||||||
|
|
||||||
// This is a byte as looked up in the dictionary
|
// These are bytes as looked up in the dictionary
|
||||||
// It needs to be signed, as it'll get passed on to
|
// It needs to be signed, as it'll get passed on to
|
||||||
// the output stream
|
// the output stream
|
||||||
byte dataB;
|
byte[] dataB = new byte[19];
|
||||||
// This is an unsigned byte read from the stream
|
// This is an unsigned byte read from the stream
|
||||||
// It needs to be unsigned, so that bit stuff works
|
// It needs to be unsigned, so that bit stuff works
|
||||||
int dataI;
|
int dataI;
|
||||||
|
@ -158,235 +159,23 @@ public void decode(InputStream src, OutputStream res) throws IOException {
|
||||||
|
|
||||||
// Loop over the codes, outputting what they correspond to
|
// Loop over the codes, outputting what they correspond to
|
||||||
for(int i=0; i<len; i++) {
|
for(int i=0; i<len; i++) {
|
||||||
buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
|
dataB[i] = buffer[(pntr + i) & 4095];
|
||||||
dataB = buffer[(pntr + i) & 4095];
|
buffer[ (pos + i) & 4095 ] = dataB[i];
|
||||||
res.write(new byte[] {dataB});
|
|
||||||
}
|
}
|
||||||
|
res.write(dataB, 0, len);
|
||||||
|
|
||||||
// Record how far along the stream we have moved
|
// Record how far along the stream we have moved
|
||||||
pos = pos + len;
|
pos = pos + len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs the Visio compatible streaming LZW compression.
|
* Performs the Visio compatible streaming LZW compression.
|
||||||
* TODO - Finish
|
|
||||||
*/
|
*/
|
||||||
public void compress(InputStream src, OutputStream res) throws IOException {
|
public void compress(InputStream src, OutputStream res) throws IOException {
|
||||||
Compressor c = new Compressor();
|
HDGFLZWCompressor c = new HDGFLZWCompressor();
|
||||||
c.compress(src, res);
|
c.compress(src, res);
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper class to handle the Visio compatible
|
|
||||||
* streaming LZW compression.
|
|
||||||
* Need our own class to handle keeping track of the
|
|
||||||
* code buffer, pending bytes to write out etc.
|
|
||||||
*/
|
|
||||||
private static final class Compressor {
|
|
||||||
// We use 12 bit codes:
|
|
||||||
// * 0-255 are real bytes
|
|
||||||
// * 256-4095 are the substring codes
|
|
||||||
// Java handily initialises our buffer / dictionary
|
|
||||||
// to all zeros
|
|
||||||
byte[] dict = new byte[4096];
|
|
||||||
|
|
||||||
// The next block of data to be written out, minus
|
|
||||||
// its mask byte
|
|
||||||
byte[] buffer = new byte[16];
|
|
||||||
// And how long it is
|
|
||||||
// (Un-compressed codes are 1 byte each, compressed codes
|
|
||||||
// are two)
|
|
||||||
int bufferLen = 0;
|
|
||||||
|
|
||||||
// The raw length of a code is limited to 4 bits
|
|
||||||
byte[] rawCode = new byte[16];
|
|
||||||
// And how much we're using
|
|
||||||
int rawCodeLen = 0;
|
|
||||||
|
|
||||||
// How far through the input and output streams we are
|
|
||||||
int posInp = 0;
|
|
||||||
int posOut = 0;
|
|
||||||
|
|
||||||
// What the next mask byte to output will be
|
|
||||||
int nextMask = 0;
|
|
||||||
// And how many bits we've already set
|
|
||||||
int maskBitsSet = 0;
|
|
||||||
|
|
||||||
public Compressor() {
|
|
||||||
//
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Returns the last place that the bytes from rawCode are found
|
|
||||||
* at in the buffer, or -1 if they can't be found
|
|
||||||
*/
|
|
||||||
private int findRawCodeInBuffer() {
|
|
||||||
// Work our way back from the end
|
|
||||||
// (Visio always seems to use the last possible code)
|
|
||||||
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
|
|
||||||
boolean matches = true;
|
|
||||||
for(int j=0; matches && j<rawCodeLen; j++) {
|
|
||||||
if(buffer[i] == rawCode[j]) {
|
|
||||||
// Fits
|
|
||||||
} else {
|
|
||||||
// Doesn't fit, can't be a match
|
|
||||||
matches = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Was this position a match?
|
|
||||||
if(matches) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Not found
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Output the compressed representation for the bytes
|
|
||||||
* found in rawCode
|
|
||||||
*/
|
|
||||||
private void outputCompressed(OutputStream res) throws IOException {
|
|
||||||
// It's not worth compressing only 1 or two bytes,
|
|
||||||
// due to the overheads
|
|
||||||
// So if asked, just output uncompressed
|
|
||||||
if(rawCodeLen < 3) {
|
|
||||||
for(int i=0; i<rawCodeLen; i++) {
|
|
||||||
outputUncompressed(rawCode[i], res);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Increment the mask bit count, we've done another code
|
|
||||||
maskBitsSet++;
|
|
||||||
// Add the length+code to the buffer
|
|
||||||
// (The position is the first 12 bits, the
|
|
||||||
// length is the last 4 bits)
|
|
||||||
// TODO
|
|
||||||
posOut += 2;
|
|
||||||
|
|
||||||
// If we're now at 8 codes, output
|
|
||||||
if(maskBitsSet == 8) {
|
|
||||||
output8Codes(res);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* Output the un-compressed byte
|
|
||||||
*/
|
|
||||||
private void outputUncompressed(byte b, OutputStream res) throws IOException {
|
|
||||||
// Set the mask bit for us
|
|
||||||
nextMask += (1<<maskBitsSet);
|
|
||||||
|
|
||||||
// And add us to the buffer + dictionary
|
|
||||||
buffer[bufferLen] = fromInt(b);
|
|
||||||
bufferLen++;
|
|
||||||
dict[(posOut&4095)] = fromInt(b);
|
|
||||||
posOut++;
|
|
||||||
|
|
||||||
// If we're now at 8 codes, output
|
|
||||||
if(maskBitsSet == 8) {
|
|
||||||
output8Codes(res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* We've got 8 code worth to write out, so
|
|
||||||
* output along with the header
|
|
||||||
*/
|
|
||||||
private void output8Codes(OutputStream res) throws IOException {
|
|
||||||
// Output the mask and the data
|
|
||||||
res.write(new byte[] { fromInt(nextMask) } );
|
|
||||||
res.write(buffer, 0, bufferLen);
|
|
||||||
|
|
||||||
// Reset things
|
|
||||||
nextMask = 0;
|
|
||||||
maskBitsSet = 0;
|
|
||||||
bufferLen = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Does the compression
|
|
||||||
*/
|
|
||||||
public void compress(InputStream src, OutputStream res) throws IOException {
|
|
||||||
// Have we hit the end of the file yet?
|
|
||||||
boolean going = true;
|
|
||||||
|
|
||||||
// This is a byte as looked up in the dictionary
|
|
||||||
// It needs to be signed, as it'll get passed on to
|
|
||||||
// the output stream
|
|
||||||
byte dataB;
|
|
||||||
// This is an unsigned byte read from the stream
|
|
||||||
// It needs to be unsigned, so that bit stuff works
|
|
||||||
int dataI;
|
|
||||||
|
|
||||||
while( going ) {
|
|
||||||
dataI = src.read();
|
|
||||||
posInp++;
|
|
||||||
if(dataI == -1) { going = false; }
|
|
||||||
dataB = fromInt(dataI);
|
|
||||||
|
|
||||||
// If we've run out of data, output anything that's
|
|
||||||
// pending then finish
|
|
||||||
if(!going && rawCodeLen > 0) {
|
|
||||||
outputCompressed(res);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try adding this new byte onto rawCode, and
|
|
||||||
// see if all of that is still found in the
|
|
||||||
// buffer dictionary or not
|
|
||||||
rawCode[rawCodeLen] = dataB;
|
|
||||||
rawCodeLen++;
|
|
||||||
int rawAt = findRawCodeInBuffer();
|
|
||||||
|
|
||||||
// If we found it and are now at 16 bytes,
|
|
||||||
// we need to output our pending code block
|
|
||||||
if(rawCodeLen == 16 && rawAt > -1) {
|
|
||||||
outputCompressed(res);
|
|
||||||
rawCodeLen = 0;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we did find all of rawCode with our new
|
|
||||||
// byte added on, we can wait to see what happens
|
|
||||||
// with the next byte
|
|
||||||
if(rawAt > -1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we get here, then the rawCode + this byte weren't
|
|
||||||
// found in the dictionary
|
|
||||||
|
|
||||||
// If there was something in rawCode before, then that was
|
|
||||||
// found in the dictionary, so output that compressed
|
|
||||||
rawCodeLen--;
|
|
||||||
if(rawCodeLen > 0) {
|
|
||||||
// Output the old rawCode
|
|
||||||
outputCompressed(res);
|
|
||||||
|
|
||||||
// Can this byte start a new rawCode, or does
|
|
||||||
// it need outputting itself?
|
|
||||||
rawCode[0] = dataB;
|
|
||||||
rawCodeLen = 1;
|
|
||||||
if(findRawCodeInBuffer() > -1) {
|
|
||||||
// Fits in, wait for next byte
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Doesn't fit, output
|
|
||||||
outputUncompressed(dataB,res);
|
|
||||||
rawCodeLen = 0;
|
|
||||||
} else {
|
|
||||||
// Nothing in rawCode before, so this byte
|
|
||||||
// isn't in the buffer dictionary
|
|
||||||
// Output it un-compressed
|
|
||||||
outputUncompressed(dataB,res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
|
@ -0,0 +1,241 @@
|
||||||
|
package org.apache.poi.hdgf;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
|
||||||
|
/**
 * Helper class to handle the Visio compatible
 *  streaming LZW compression.
 * Need our own class to handle keeping track of the
 *  code buffer, pending bytes to write out etc.
 *
 * <p>Output format: groups of up to 8 codes, each group preceded
 *  by a mask byte. A set mask bit means the code is one literal
 *  byte; a clear bit means a two byte (position, length) reference
 *  into a 4096 byte sliding dictionary, covering 3 to 18 bytes.</p>
 *
 * <p>An instance carries the dictionary and pending state of a
 *  single stream, so use a fresh instance for each stream. It is
 *  not safe for concurrent use.</p>
 *
 * <p>NOTE(review): the references chosen are ones the byte-by-byte
 *  decoder can reproduce, but byte-for-byte agreement with Visio's
 *  own choice of references on long streams has not been
 *  established - confirm against real files.</p>
 */
final class HDGFLZWCompressor {
    /** Size of the sliding dictionary (12 bit positions) */
    private static final int DICT_SIZE = 4096;
    /** Mask used to wrap a position into the dictionary */
    private static final int DICT_MASK = DICT_SIZE - 1;
    /** Shortest run worth a 2 byte reference */
    private static final int MIN_RUN = 3;
    /** Longest run a reference can hold (4 bit length + 3) */
    private static final int MAX_RUN = 18;
    /** Stored positions are offset by this much from the real one */
    private static final int POSITION_OFFSET = 18;
    /** Number of codes covered by one mask byte */
    private static final int CODES_PER_MASK = 8;

    // Java handily initialises our buffer / dictionary
    //  to all zeros
    byte[] dict = new byte[DICT_SIZE];

    // The next block of data to be written out, minus
    //  its mask byte
    byte[] buffer = new byte[2 * CODES_PER_MASK];
    // And how long it is
    // (Un-compressed codes are 1 byte each, compressed codes
    //   are two)
    int bufferLen = 0;

    // The raw length of a code is limited to 4 bits + 2
    byte[] rawCode = new byte[MAX_RUN];
    // And how much we're using
    int rawCodeLen = 0;

    // How far through the input and output streams we are
    int posInp = 0;
    int posOut = 0;

    // What the next mask byte to output will be
    int nextMask = 0;
    // And how many bits we've already set
    int maskBitsSet = 0;

    public HDGFLZWCompressor() {}

    /**
     * Returns the last place that the bytes from rawCode are found
     *  at in the dictionary, or -1 if they can't be found.
     * Every start position from the top of the dictionary down to
     *  (and including) 0 is tried.
     *
     * <p>Positions are skipped if the run would start before the
     *  current write position and extend into it. The decoder copies
     *  one byte at a time, so for such a run it would read back bytes
     *  it had only just written, instead of the old dictionary
     *  contents which were matched here.</p>
     */
    private int findRawCodeInBuffer() {
        int writeAt = posOut & DICT_MASK;

        // Work our way through all the codes until we
        //  find the right one. Visio starts from the end
        for (int start = DICT_SIZE - rawCodeLen; start >= 0; start--) {
            // How far behind the write position this candidate starts
            int gap = (writeAt - start) & DICT_MASK;
            if (gap >= 1 && gap < rawCodeLen) {
                // Decoder would overwrite part of this before reading it
                continue;
            }

            boolean matches = true;
            for (int j = 0; matches && j < rawCodeLen; j++) {
                matches = (dict[start + j] == rawCode[j]);
            }
            if (matches) {
                return start;
            }
        }

        // Not found
        return -1;
    }

    /**
     * Output the compressed representation for the bytes
     *  found in rawCode.
     * Runs of under 3 bytes, and runs which (unexpectedly) can't be
     *  located in the dictionary, are written as literals instead.
     */
    private void outputCompressed(OutputStream res) throws IOException {
        // It's not worth compressing only 1 or two bytes,
        //  due to the overheads
        // So if asked, just output uncompressed
        int foundAt = (rawCodeLen < MIN_RUN) ? -1 : findRawCodeInBuffer();
        if (foundAt < 0) {
            for (int i = 0; i < rawCodeLen; i++) {
                outputUncompressed(rawCode[i], res);
            }
            return;
        }

        // The stored position is 18 behind the real one, wrapping
        int codesAt = foundAt - POSITION_OFFSET;
        if (codesAt < 0) {
            codesAt += DICT_SIZE;
        }

        // Increment the mask bit count, we've done another code
        // (the mask bit itself stays clear for a compressed code)
        maskBitsSet++;

        // Add the length+code to the buffer
        // (The low 8 bits of the position go in the first byte, the
        //  top 4 bits of the position share the second byte with
        //  the 4 bit length)
        int bp1 = (codesAt & 255);
        int bp2 = (rawCodeLen - MIN_RUN) + ((codesAt - bp1) >> 4);
        buffer[bufferLen++] = (byte) bp1;
        buffer[bufferLen++] = (byte) bp2;

        // Copy the data to the dictionary in the new place
        for (int i = 0; i < rawCodeLen; i++) {
            dict[(posOut & DICT_MASK)] = rawCode[i];
            posOut++;
        }

        // If we're now at 8 codes, output
        if (maskBitsSet == CODES_PER_MASK) {
            output8Codes(res);
        }
    }

    /**
     * Output the un-compressed byte
     */
    private void outputUncompressed(byte b, OutputStream res) throws IOException {
        // Set the mask bit for us
        nextMask += (1 << maskBitsSet);
        maskBitsSet++;

        // And add us to the buffer + dictionary
        buffer[bufferLen++] = b;
        dict[(posOut & DICT_MASK)] = b;
        posOut++;

        // If we're now at 8 codes, output
        if (maskBitsSet == CODES_PER_MASK) {
            output8Codes(res);
        }
    }

    /**
     * Write out the mask byte followed by the pending codes
     *  (a full group of 8, or fewer at the end of the stream),
     *  then reset ready for the next group.
     */
    private void output8Codes(OutputStream res) throws IOException {
        // Output the mask and the data
        res.write(new byte[] { (byte) nextMask });
        res.write(buffer, 0, bufferLen);

        // Reset things
        nextMask = 0;
        maskBitsSet = 0;
        bufferLen = 0;
    }

    /**
     * Does the compression, reading src until it is exhausted.
     * Anything still pending at the end of the input - a part built
     *  run and/or a part filled group of codes - is always written
     *  out before returning.
     *
     * @param src the un-compressed data
     * @param res where to write the compressed data
     * @throws IOException if reading or writing fails
     */
    public void compress(InputStream src, OutputStream res) throws IOException {
        while (true) {
            // This is an unsigned byte read from the stream
            int dataI = src.read();
            posInp++;
            if (dataI == -1) {
                break;
            }
            // Signed version, as held in the dictionary and as
            //  passed on to the output stream
            byte dataB = (byte) dataI;

            // Try adding this new byte onto rawCode, and
            //  see if all of that is still found in the
            //  buffer dictionary or not
            rawCode[rawCodeLen++] = dataB;
            if (findRawCodeInBuffer() > -1) {
                // If we found it and are now at 18 bytes,
                //  we need to output our pending code block.
                // Otherwise wait and see what happens with
                //  the next byte
                if (rawCodeLen == MAX_RUN) {
                    outputCompressed(res);
                    rawCodeLen = 0;
                }
                continue;
            }

            // If we get here, then the rawCode + this byte weren't
            //  found in the dictionary
            rawCodeLen--;
            if (rawCodeLen == 0) {
                // Nothing in rawCode before, so this byte
                //  isn't in the buffer dictionary
                // Output it un-compressed
                outputUncompressed(dataB, res);
                continue;
            }

            // What was in rawCode before was found in the
            //  dictionary, so output that compressed
            outputCompressed(res);

            // Can this byte start a new rawCode, or does
            //  it need outputting itself?
            rawCode[0] = dataB;
            rawCodeLen = 1;
            if (findRawCodeInBuffer() == -1) {
                // Doesn't fit, output
                outputUncompressed(dataB, res);
                rawCodeLen = 0;
            }
        }

        // We've run out of data, so output anything that's pending
        if (rawCodeLen > 0) {
            outputCompressed(res);
            rawCodeLen = 0;
        }
        // Flush a part filled group, even when the last byte
        //  read had already gone out as a literal
        if (maskBitsSet > 0) {
            output8Codes(res);
        }
    }
}
|
|
@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase {
|
||||||
-21, -16, // 3 @ 4093
|
-21, -16, // 3 @ 4093
|
||||||
1, 0, 0, -72,
|
1, 0, 0, -72,
|
||||||
-13, -16, // 3 @ 5
|
-13, -16, // 3 @ 5
|
||||||
78, // *mask bit*
|
78, // *mask bit* 2,3,4,7
|
||||||
-32, -5, // 14 @ 4082
|
-32, -5, // 14 @ 4082
|
||||||
1, 0, 3,
|
1, 0, 3,
|
||||||
-21, -16, // 3 @ 4093
|
-21, -16, // 3 @ 4093
|
||||||
10, 5, // 8 @ 28
|
10, 5, // 8 @ 28
|
||||||
4,
|
4,
|
||||||
-21, -16, // 3 @ 4093
|
-21, -16, // 3 @ 4093
|
||||||
21, // *mask bit*
|
21, // *mask bit* 1,3,5
|
||||||
9,
|
9,
|
||||||
-21, -16, // 3 @ 4093
|
-21, -16, // 3 @ 4093
|
||||||
103, -21, -16, 34,
|
103,
|
||||||
|
-21, -16, // 3 @ 4093
|
||||||
|
34,
|
||||||
-36, -1, // 18 @ 4078
|
-36, -1, // 18 @ 4078
|
||||||
52, 15, // 18 @ 70
|
52, 15, // 18 @ 70
|
||||||
70, 15, // 18 @ 88
|
70, 15, // 18 @ 88
|
||||||
|
@ -169,7 +171,73 @@ public final class TestHDGFLZW extends TestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void DISABLEDtestCompress() throws Exception {
|
/**
|
||||||
|
* Test that we can round-trip a little bit.
|
||||||
|
* Uses a part short enough that we agree with visio
|
||||||
|
* on the best way to compress it
|
||||||
|
*/
|
||||||
|
public void testCompressMini() throws Exception {
|
||||||
|
// first 11 bytes compressed = 12 bytes uncompressed
|
||||||
|
byte[] sourceComp = new byte[11];
|
||||||
|
byte[] sourceDecomp = new byte[12];
|
||||||
|
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
|
||||||
|
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
|
||||||
|
|
||||||
|
// Compress it using our engine
|
||||||
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
|
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
|
||||||
|
|
||||||
|
// Now decompress it again
|
||||||
|
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
|
// First up, check the round tripping
|
||||||
|
assertEquals(12, decomp.length);
|
||||||
|
for(int i=0; i<decomp.length; i++) {
|
||||||
|
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now check the compressed intermediate version
|
||||||
|
assertEquals(11, comp.length);
|
||||||
|
for(int i=0; i<comp.length; i++) {
|
||||||
|
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests that we can do several mask pages
|
||||||
|
*/
|
||||||
|
public void testCompressMidi() throws Exception {
|
||||||
|
// First 12 -> 11
|
||||||
|
// Next 32 -> 13
|
||||||
|
byte[] sourceComp = new byte[24];
|
||||||
|
byte[] sourceDecomp = new byte[44];
|
||||||
|
System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
|
||||||
|
System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
|
||||||
|
|
||||||
|
// Compress it using our engine
|
||||||
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
|
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
|
||||||
|
|
||||||
|
// We should be 3 characters bigger, as
|
||||||
|
// we split one compressed bit into two
|
||||||
|
assertEquals(27, comp.length);
|
||||||
|
|
||||||
|
// Now decompress it again
|
||||||
|
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
|
// We can only check the round-tripping, as for now
|
||||||
|
// visio cheats on re-using a block
|
||||||
|
assertEquals(44, decomp.length);
|
||||||
|
for(int i=0; i<decomp.length; i++) {
|
||||||
|
assertEquals("Wrong at " + i, decomp[i], sourceDecomp[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets 160 bytes through then starts going wrong...
|
||||||
|
* TODO Fix this
|
||||||
|
*/
|
||||||
|
public void DISABLEDtestCompressFull() throws Exception {
|
||||||
assertEquals(339, testTrailerComp.length);
|
assertEquals(339, testTrailerComp.length);
|
||||||
assertEquals(632, testTrailerDecomp.length);
|
assertEquals(632, testTrailerDecomp.length);
|
||||||
|
|
||||||
|
@ -177,11 +245,24 @@ public final class TestHDGFLZW extends TestCase {
|
||||||
HDGFLZW lzw = new HDGFLZW();
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
|
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
|
||||||
|
|
||||||
// Now check it's the right data
|
// Now decompress it again
|
||||||
|
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
|
// for(int i=0; i<comp.length; i++) {
|
||||||
|
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
|
||||||
|
// }
|
||||||
|
|
||||||
|
// First up, check the round tripping
|
||||||
|
// assertEquals(632, decomp.length);
|
||||||
|
for(int i=0; i<decomp.length; i++) {
|
||||||
|
assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Now check the compressed intermediate version
|
||||||
assertEquals(339, comp.length);
|
assertEquals(339, comp.length);
|
||||||
for(int i=0; i<comp.length; i++) {
|
for(int i=0; i<comp.length; i++) {
|
||||||
if(comp[i] != testTrailerComp[i])
|
assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
|
||||||
System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue