Improve the HDGF LZW code.

Some tweaks to the decompression, and more tests, but mostly work on the compression side. We can now compress small streams properly, and these round-trip fine. However, some longer streams don't compress correctly, and more work on that is still needed (see the disabled unit test). git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049805 13f79535-47bb-0310-9956-ffa450edef68
2010-12-16 07:41:41 +00:00 · 2010-12-16 07:41:41 +00:00 · 0b4b029e2a
parent 4c8a39924b
commit 0b4b029e2a
5 changed files with 482 additions and 368 deletions
--- a/src/documentation/content/xdocs/hdgf/index.xml
+++ b/src/documentation/content/xdocs/hdgf/index.xml
@ -72,7 +72,8 @@
 				  HDGFLZW, which will be much better documented, and also 
 				  under the ASL. <strong>Completed October 2007</strong></li>
 				 <li>Add compression support to HDGFLZW. 
-				  <strong>In progress</strong></li>
+				  <strong>In progress - works for small streams but encoding
+               goes wrong on larger ones</strong></li>
 				 <li>Have HDGF just write back the raw bytes it read in, and
 				  have a test to ensure the file is un-changed.</li>
 				 <li>Have HDGF generate the bytes to write out from the
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,8 @@

    <changes>
        <release version="3.8-beta1" date="2010-??-??">
+           <action dev="poi-developers" type="add">Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them</action>
+           <action dev="poi-developers" type="add">Partial HDGF LZW compression support</action>
           <action dev="poi-developers" type="add">50244 - Support for continued NameRecords</action>
           <action dev="POI-DEVELOPERS" type="fix">50416 - Correct shifting of the first or last row in a sheet by multiple rows</action>
           <action dev="POI-DEVELOPERS" type="fix">50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not)</action>
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -75,6 +75,7 @@ public byte[] decode(InputStream src) throws IOException {
      decode(src,res);
      return res.toByteArray();
   }
+   
   /**
    * Perform a streaming decompression of the input.
    * Works by:
@ -107,10 +108,10 @@ public void decode(InputStream src, OutputStream res) throws IOException {
      //  processing each bit of the flag byte in turn
      int mask;

-	// This is a byte as looked up in the dictionary
+      // These are bytes as looked up in the dictionary
      // It needs to be signed, as it'll get passed on to
      //  the output stream
-	byte dataB;
+      byte[] dataB = new byte[19];
      // This is an unsigned byte read from the stream
      // It needs to be unsigned, so that bit stuff works
      int dataI;
@ -158,10 +159,10 @@ public void decode(InputStream src, OutputStream res) throws IOException {

               // Loop over the codes, outputting what they correspond to
               for(int i=0; i<len; i++) {
-					buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
-					dataB = buffer[(pntr + i) & 4095];
-					res.write(new byte[] {dataB});
+                  dataB[i] = buffer[(pntr + i) & 4095];
+                  buffer[ (pos + i) & 4095 ] = dataB[i];
               }
+               res.write(dataB, 0, len);

               // Record how far along the stream we have moved
               pos = pos + len;
@ -172,221 +173,9 @@ public void decode(InputStream src, OutputStream res) throws IOException {

   /**
    * Performs the Visio compatible streaming LZW compression.
- * TODO - Finish
    */
   public void compress(InputStream src, OutputStream res) throws IOException {
-	Compressor c = new Compressor();
+      HDGFLZWCompressor c = new HDGFLZWCompressor();
      c.compress(src, res);
   }
-
-/**
- * Helper class to handle the Visio compatible
- *  streaming LZW compression.
- * Need our own class to handle keeping track of the
- *  code buffer, pending bytes to write out etc.
- */
-private static final class Compressor {
-	// We use 12 bit codes:
-	// * 0-255 are real bytes
-	// * 256-4095 are the substring codes
-	// Java handily initialises our buffer / dictionary
-	//  to all zeros
-	byte[] dict = new byte[4096];
-
-	// The next block of data to be written out, minus
-	//  its mask byte
-	byte[] buffer = new byte[16];
-	// And how long it is
-	// (Un-compressed codes are 1 byte each, compressed codes
-	//   are two)
-	int bufferLen = 0;
-
-	// The raw length of a code is limited to 4 bits
-	byte[] rawCode = new byte[16];
-	// And how much we're using
-	int rawCodeLen = 0;
-
-	// How far through the input and output streams we are
-	int posInp = 0;
-	int posOut = 0;
-
-	// What the next mask byte to output will be
-	int nextMask = 0;
-	// And how many bits we've already set
-	int maskBitsSet = 0;
-
-	public Compressor() {
-		//
-	}
-/**
- * Returns the last place that the bytes from rawCode are found
- *  at in the buffer, or -1 if they can't be found
- */
-private int findRawCodeInBuffer() {
-	// Work our way back from the end
-	// (Visio always seems to use the last possible code)
-	for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
-		boolean matches = true;
-		for(int j=0; matches && j<rawCodeLen; j++) {
-			if(buffer[i] == rawCode[j]) {
-				// Fits
-			} else {
-				// Doesn't fit, can't be a match
-				matches = false;
-			}
-		}
-
-		// Was this position a match?
-		if(matches) {
-			return i;
-		}
-	}
-
-	// Not found
-	return -1;
-}
-
-/**
- * Output the compressed representation for the bytes
- *  found in rawCode
- */
-private void outputCompressed(OutputStream res) throws IOException {
-	// It's not worth compressing only 1 or two bytes,
-	//  due to the overheads
-	// So if asked, just output uncompressed
-	if(rawCodeLen < 3) {
-		for(int i=0; i<rawCodeLen; i++) {
-			outputUncompressed(rawCode[i], res);
-		}
-		return;
-	}
-
-	// Increment the mask bit count, we've done another code
-	maskBitsSet++;
-	// Add the length+code to the buffer
-	// (The position is the first 12 bits, the
-	//  length is the last 4 bits)
-	// TODO
-	posOut += 2;
-
-	// If we're now at 8 codes, output
-	if(maskBitsSet == 8) {
-		output8Codes(res);
-	}
-}
-/**
- * Output the un-compressed byte
- */
-private void outputUncompressed(byte b, OutputStream res) throws IOException {
-	// Set the mask bit for us
-	nextMask += (1<<maskBitsSet);
-
-	// And add us to the buffer + dictionary
-	buffer[bufferLen] = fromInt(b);
-	bufferLen++;
-	dict[(posOut&4095)] = fromInt(b);
-	posOut++;
-
-	// If we're now at 8 codes, output
-	if(maskBitsSet == 8) {
-		output8Codes(res);
-	}
-}
-
-/**
- * We've got 8 code worth to write out, so
- *  output along with the header
- */
-private void output8Codes(OutputStream res) throws IOException {
-	// Output the mask and the data
-	res.write(new byte[] { fromInt(nextMask) } );
-	res.write(buffer, 0, bufferLen);
-
-	// Reset things
-	nextMask = 0;
-	maskBitsSet = 0;
-	bufferLen = 0;
-}
-
-/**
- * Does the compression
- */
-public void compress(InputStream src, OutputStream res) throws IOException {
-	// Have we hit the end of the file yet?
-	boolean going = true;
-
-	// This is a byte as looked up in the dictionary
-	// It needs to be signed, as it'll get passed on to
-	//  the output stream
-	byte dataB;
-	// This is an unsigned byte read from the stream
-	// It needs to be unsigned, so that bit stuff works
-	int dataI;
-
-	while( going ) {
-		dataI = src.read();
-		posInp++;
-		if(dataI == -1) { going = false; }
-		dataB = fromInt(dataI);
-
-		// If we've run out of data, output anything that's
-		//  pending then finish
-		if(!going && rawCodeLen > 0) {
-			outputCompressed(res);
-			break;
-		}
-
-		// Try adding this new byte onto rawCode, and
-		//  see if all of that is still found in the
-		//  buffer dictionary or not
-		rawCode[rawCodeLen] = dataB;
-		rawCodeLen++;
-		int rawAt = findRawCodeInBuffer();
-
-		// If we found it and are now at 16 bytes,
-		//  we need to output our pending code block
-		if(rawCodeLen == 16 && rawAt > -1) {
-			outputCompressed(res);
-			rawCodeLen = 0;
-			continue;
-		}
-
-		// If we did find all of rawCode with our new
-		//  byte added on, we can wait to see what happens
-		//  with the next byte
-		if(rawAt > -1) {
-			continue;
-		}
-
-		// If we get here, then the rawCode + this byte weren't
-		// found in the dictionary
-
-		// If there was something in rawCode before, then that was
-		// found in the dictionary, so output that compressed
-		rawCodeLen--;
-		if(rawCodeLen > 0) {
-			// Output the old rawCode
-			outputCompressed(res);
-
-			// Can this byte start a new rawCode, or does
-			//  it need outputting itself?
-			rawCode[0] = dataB;
-			rawCodeLen = 1;
-			if(findRawCodeInBuffer() > -1) {
-				// Fits in, wait for next byte
-				continue;
-			}
-			// Doesn't fit, output
-			outputUncompressed(dataB,res);
-			rawCodeLen = 0;
-		} else {
-			// Nothing in rawCode before, so this byte
-			//  isn't in the buffer dictionary
-			// Output it un-compressed
-			outputUncompressed(dataB,res);
-		}
-	}
-}
-}
-
 }
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java
@ -0,0 +1,241 @@
+package org.apache.poi.hdgf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * Helper class to handle the Visio compatible
+ *  streaming LZW compression.
+ * Need our own class to handle keeping track of the
+ *  code buffer, pending bytes to write out etc.
+ *  
+ * TODO Fix this, as it starts to go wrong on
+ *  large streams 
+ */
+final class HDGFLZWCompressor {
+	// We use 12 bit codes:
+	// * 0-255 are real bytes
+	// * 256-4095 are the substring codes
+	// Java handily initialises our buffer / dictionary
+	//  to all zeros
+	byte[] dict = new byte[4096];
+
+	// The next block of data to be written out, minus
+	//  its mask byte
+	byte[] buffer = new byte[16];
+	// And how long it is
+	// (Un-compressed codes are 1 byte each, compressed codes
+	//   are two)
+	int bufferLen = 0;
+
+	// A code's length is stored in 4 bits as (length - 3), so it is at most 18 bytes
+	byte[] rawCode = new byte[18];
+	// And how much we're using
+	int rawCodeLen = 0;
+
+	// How far through the input and output streams we are
+	int posInp = 0;
+	int posOut = 0;
+
+	// What the next mask byte to output will be
+	int nextMask = 0;
+	// And how many bits we've already set
+	int maskBitsSet = 0;
+
+	public HDGFLZWCompressor() {}
+	
+/**
+ * Returns the last place that the bytes from rawCode are found
+ *  at in the buffer, or -1 if they can't be found
+ */
+private int findRawCodeInBuffer() {
+	// Work our way through all the codes until we
+   //  find the right one. Visio starts from the end
+	for(int i=4096-rawCodeLen; i>0; i--) {
+		boolean matches = true;
+		for(int j=0; matches && j<rawCodeLen; j++) {
+			if(dict[i+j] == rawCode[j]) {
+				// Fits
+			} else {
+				// Doesn't fit, can't be a match
+				matches = false;
+			}
+		}
+
+		// Was this position a match?
+		if(matches) {
+			return i;
+		}
+	}
+
+	// Not found
+	return -1;
+}
+
+/**
+ * Output the compressed representation for the bytes
+ *  found in rawCode
+ */
+private void outputCompressed(OutputStream res) throws IOException {
+	// It's not worth compressing only 1 or two bytes,
+	//  due to the overheads
+	// So if asked, just output uncompressed
+	if(rawCodeLen < 3) {
+		for(int i=0; i<rawCodeLen; i++) {
+			outputUncompressed(rawCode[i], res);
+		}
+		return;
+	}
+	
+	// Grab where the data lives
+	int codesAt = findRawCodeInBuffer();
+   codesAt -= 18;
+	if(codesAt < 0) {
+	   codesAt += 4096;
+	}
+
+	// Increment the mask bit count, we've done another code
+	maskBitsSet++;
+	
+	// Add the length+code to the buffer
+	// (The position is the first 12 bits, the
+	//  length is the last 4 bits)
+	int bp1 = (codesAt & 255);
+	int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
+	buffer[bufferLen] = HDGFLZW.fromInt(bp1);
+	bufferLen++;
+   buffer[bufferLen] = HDGFLZW.fromInt(bp2);
+   bufferLen++;
+   
+   // Copy the data to the dictionary in the new place
+   for(int i=0; i<rawCodeLen; i++) {
+      dict[(posOut&4095)] = rawCode[i];
+      posOut++; 
+   }
+
+	// If we're now at 8 codes, output
+	if(maskBitsSet == 8) {
+		output8Codes(res);
+	}
+}
+/**
+ * Output the un-compressed byte
+ */
+private void outputUncompressed(byte b, OutputStream res) throws IOException {
+	// Set the mask bit for us
+	nextMask += (1<<maskBitsSet);
+	maskBitsSet++;
+
+	// And add us to the buffer + dictionary
+	buffer[bufferLen] = b;
+	bufferLen++;
+	dict[(posOut&4095)] = b;
+	posOut++;
+
+	// If we're now at 8 codes, output
+	if(maskBitsSet == 8) {
+		output8Codes(res);
+	}
+}
+
+/**
+ * We've got 8 codes' worth of data to write out, so
+ *  output it along with the header
+ */
+private void output8Codes(OutputStream res) throws IOException {
+	// Output the mask and the data
+	res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
+	res.write(buffer, 0, bufferLen);
+
+	// Reset things
+	nextMask = 0;
+	maskBitsSet = 0;
+	bufferLen = 0;
+}
+
+/**
+ * Does the compression
+ */
+public void compress(InputStream src, OutputStream res) throws IOException {
+	// Have we hit the end of the file yet?
+	boolean going = true;
+
+	// This is a byte as looked up in the dictionary
+	// It needs to be signed, as it'll get passed on to
+	//  the output stream
+	byte dataB;
+	// This is an unsigned byte read from the stream
+	// It needs to be unsigned, so that bit stuff works
+	int dataI;
+
+	while( going ) {
+		dataI = src.read();
+		posInp++;
+		if(dataI == -1) { going = false; }
+		dataB = HDGFLZW.fromInt(dataI);
+
+		// If we've run out of data, output anything that's
+		//  pending then finish
+		if(!going) {
+		   if(rawCodeLen > 0) {
+	         outputCompressed(res);
+	         if(maskBitsSet > 0) {
+	            output8Codes(res);
+	         }
+		   }
+			break;
+		}
+
+		// Try adding this new byte onto rawCode, and
+		//  see if all of that is still found in the
+		//  buffer dictionary or not
+		rawCode[rawCodeLen] = dataB;
+		rawCodeLen++;
+		int rawAt = findRawCodeInBuffer();
+		
+		// If we found it and are now at 18 bytes,
+		//  we need to output our pending code block
+		if(rawCodeLen == 18 && rawAt > -1) {
+			outputCompressed(res);
+			rawCodeLen = 0;
+			continue;
+		}
+
+		// If we did find all of rawCode with our new
+		//  byte added on, we can wait to see what happens
+		//  with the next byte
+		if(rawAt > -1) {
+			continue;
+		}
+
+		// If we get here, then the rawCode + this byte weren't
+		// found in the dictionary
+
+		// If there was something in rawCode before, then that was
+		// found in the dictionary, so output that compressed
+		rawCodeLen--;
+		if(rawCodeLen > 0) {
+			// Output the old rawCode
+			outputCompressed(res);
+
+			// Can this byte start a new rawCode, or does
+			//  it need outputting itself?
+			rawCode[0] = dataB;
+			rawCodeLen = 1;
+			if(findRawCodeInBuffer() > -1) {
+				// Fits in, wait for next byte
+				continue;
+			}
+			// Doesn't fit, output
+			outputUncompressed(dataB,res);
+			rawCodeLen = 0;
+		} else {
+			// Nothing in rawCode before, so this byte
+			//  isn't in the buffer dictionary
+			// Output it un-compressed
+			outputUncompressed(dataB,res);
+		}
+	}
+}
+}
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase {
 		-21, -16, // 3 @ 4093
 		1, 0, 0, -72,
 		-13, -16, // 3 @ 5
-		78,       // *mask bit*
+		78,       // *mask bit* 2,3,4,7
 		-32, -5,  // 14 @ 4082
 		1, 0, 3,
 		-21, -16, // 3 @ 4093
 		10, 5,    // 8 @ 28
 		4,
 		-21, -16, // 3 @ 4093
-		21,       // *mask bit*
+		21,       // *mask bit* 1,3,5
 		9,
 		-21, -16, // 3 @ 4093
-		103, -21, -16, 34,
+		103, 
+		-21, -16, // 3 @ 4093
+		34,
 		-36, -1,  // 18 @ 4078
 		52, 15,   // 18 @ 70
 		70, 15,   // 18 @ 88
@ -169,7 +171,73 @@ public final class TestHDGFLZW extends TestCase {
 		}
 	}

-	public void DISABLEDtestCompress() throws Exception {
+	/**
+	 * Test that we can round-trip a little bit.
+	 * Uses a part short enough that we agree with visio
+	 *  on the best way to compress it
+	 */
+	public void testCompressMini() throws Exception {
+	   // first 11 bytes compressed = 12 bytes uncompressed
+	   byte[] sourceComp = new byte[11];
+	   byte[] sourceDecomp = new byte[12];
+	   System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
+      System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
+
+		// Compress it using our engine
+		HDGFLZW lzw = new HDGFLZW();
+		byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
+		
+		// Now decompress it again
+		byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+
+		// First up, check the round tripping
+		assertEquals(12, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
+      }
+
+		// Now check the compressed intermediate version
+      assertEquals(11, comp.length);
+      for(int i=0; i<comp.length; i++) {
+         assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
+      }
+	}
+
+	/**
+	 * Tests that we can do several mask pages
+	 */
+   public void testCompressMidi() throws Exception {
+      // First 12 -> 11
+      // Next 32 -> 13
+      byte[] sourceComp = new byte[24];
+      byte[] sourceDecomp = new byte[44];
+      System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
+      System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
+
+      // Compress it using our engine
+      HDGFLZW lzw = new HDGFLZW();
+      byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
+      
+      // We should be 3 characters bigger, as
+      //  we split one compressed bit into two
+      assertEquals(27, comp.length);
+      
+      // Now decompress it again
+      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+
+      // We can only check the round-tripping, as for now
+      //  visio cheats on re-using a block
+      assertEquals(44, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], sourceDecomp[i]);
+      }
+   }
+
+   /**
+    * Gets 160 bytes through then starts going wrong...
+    * TODO Fix this
+    */
+   public void DISABLEDtestCompressFull() throws Exception {
      assertEquals(339, testTrailerComp.length);
      assertEquals(632, testTrailerDecomp.length);

@ -177,11 +245,24 @@ public final class TestHDGFLZW extends TestCase {
      HDGFLZW lzw = new HDGFLZW();
      byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
      
-		// Now check it's the right data
+      // Now decompress it again
+      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+
+//      for(int i=0; i<comp.length; i++) {
+//         System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
+//      }
+      
+      // First up, check the round tripping
+//    assertEquals(632, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
+      }
+
+      
+      // Now check the compressed intermediate version
      assertEquals(339, comp.length);
      for(int i=0; i<comp.length; i++) {
-			if(comp[i] != testTrailerComp[i])
-				System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
+         assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
      }
   }
 }