HADOOP-4012. Provide splitting support for bzip2 compressed files. Contributed by Abdul Qadeer

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@813581 13f79535-47bb-0310-9956-ffa450edef68
Christopher Douglas 2009-09-10 20:51:48 +00:00
parent efb3e17bed
commit 86724941c5
12 changed files with 897 additions and 142 deletions

View File

@ -187,6 +187,9 @@ Trunk (unreleased changes)
HADOOP-5073. Add annotation mechanism for interface classification.
(Jakob Homan via suresh)
HADOOP-4012. Provide splitting support for bzip2 compressed files. (Abdul
Qadeer via cdouglas)
IMPROVEMENTS
HADOOP-4565. Added CombineFileInputFormat to use data locality information

View File

@ -296,12 +296,12 @@ abstract public class FSInputChecker extends FSInputStream {
@Override
public synchronized long getPos() throws IOException {
return chunkPos-(count-pos);
return chunkPos-Math.max(0L, count - pos);
}
@Override
public synchronized int available() throws IOException {
return count-pos;
return Math.max(0, count - pos);
}
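A quick arithmetic check of why the clamp matters (the values below, including count = -1 as an end-of-stream sentinel, are hypothetical and not taken from this diff):
// Hypothetical EOF state: chunkPos = 4096, count = -1, pos = 0
// unclamped: getPos()    = 4096 - (-1 - 0)        = 4097 (past the data)
//            available() = -1 - 0                 = -1
// clamped:   getPos()    = 4096 - max(0L, -1 - 0) = 4096
//            available() = max(0, -1 - 0)         = 0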
/**

View File

@ -23,6 +23,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyCompressor;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyDecompressor;
import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
@ -35,17 +38,17 @@ import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;
* CompressionCodec which have a Compressor or Decompressor type argument, throw
* UnsupportedOperationException.
*/
public class BZip2Codec implements
org.apache.hadoop.io.compress.CompressionCodec {
public class BZip2Codec implements SplittableCompressionCodec {
private static final String HEADER = "BZ";
private static final int HEADER_LEN = HEADER.length();
private static final String SUB_HEADER = "h9";
private static final int SUB_HEADER_LEN = SUB_HEADER.length();
/**
* Creates a new instance of BZip2Codec
*/
public BZip2Codec() {
}
public BZip2Codec() { }
/**
* Creates CompressionOutputStream for BZip2
@ -62,10 +65,10 @@ public class BZip2Codec implements
}
/**
* This functionality is currently not supported.
* Creates a compressor using the given OutputStream.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return CompressionOutputStream
* @throws java.io.IOException
*/
public CompressionOutputStream createOutputStream(OutputStream out,
Compressor compressor) throws IOException {
@ -75,8 +78,7 @@ public class BZip2Codec implements
/**
* This functionality is currently not supported.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return BZip2DummyCompressor.class
*/
public Class<? extends org.apache.hadoop.io.compress.Compressor> getCompressorType() {
return BZip2DummyCompressor.class;
@ -85,8 +87,7 @@ public class BZip2Codec implements
/**
* This functionality is currently not supported.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return Compressor
*/
public Compressor createCompressor() {
return new BZip2DummyCompressor();
@ -109,19 +110,72 @@ public class BZip2Codec implements
/**
* This functionality is currently not supported.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return CompressionInputStream
*/
public CompressionInputStream createInputStream(InputStream in,
Decompressor decompressor) throws IOException {
return createInputStream(in);
}
/**
* Creates CompressionInputStream to be used to read off uncompressed data
* in one of the two reading modes, i.e. continuous or blocked reading mode.
*
* @param seekableIn The InputStream
* @param start The start offset into the compressed stream
* @param end The end offset into the compressed stream
* @param readMode Controls whether progress is reported continuously or
* only at block boundaries.
*
* @return CompressionInputStream for BZip2 aligned at block boundaries
*/
public SplitCompressionInputStream createInputStream(InputStream seekableIn,
Decompressor decompressor, long start, long end, READ_MODE readMode)
throws IOException {
if (!(seekableIn instanceof Seekable)) {
throw new IOException("seekableIn must be an instance of " +
Seekable.class.getName());
}
// find the position of the first BZip2 block marker
((Seekable)seekableIn).seek(0);
// BZip2 start-of-block markers are 6 bytes long, but the very first
// block is also preceded by "BZh9", making it 10 bytes. This is the
// common case; at times, though, the stream might start without a
// leading BZ.
final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);
((Seekable)seekableIn).seek(adjStart);
SplitCompressionInputStream in =
new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);
// The following if clause handles the case illustrated below, in a
// BZip2 compressed stream where . represents compressed data.
// .....[48 bit Block].....[48 bit Block].....[48 bit Block]...
// ........................[47 bits][1 bit].....[48 bit Block]...
// ................................^[Assume a Byte alignment here]
// ........................................^^[current position of stream]
// .....................^^[We go back 10 Bytes in stream and find a Block marker]
// ........................................^^[We align at wrong position!]
// ...........................................................^^[While this pos is correct]
if (in.getPos() <= start) {
((Seekable)seekableIn).seek(start);
in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
}
return in;
}
/**
* This functionality is currently not supported.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return BZip2DummyDecompressor.class
*/
public Class<? extends org.apache.hadoop.io.compress.Decompressor> getDecompressorType() {
return BZip2DummyDecompressor.class;
@ -130,8 +184,7 @@ public class BZip2Codec implements
/**
* This functionality is currently not supported.
*
* @throws java.lang.UnsupportedOperationException
* Throws UnsupportedOperationException
* @return Decompressor
*/
public Decompressor createDecompressor() {
return new BZip2DummyDecompressor();
@ -146,7 +199,8 @@ public class BZip2Codec implements
return ".bz2";
}
private static class BZip2CompressionOutputStream extends CompressionOutputStream {
private static class BZip2CompressionOutputStream extends
CompressionOutputStream {
// class data starts here//
private CBZip2OutputStream output;
@ -221,26 +275,79 @@ public class BZip2Codec implements
}// end of class BZip2CompressionOutputStream
private static class BZip2CompressionInputStream extends CompressionInputStream {
/**
* This class is capable of decompressing BZip2 data in two modes:
* CONTINUOUS and BYBLOCK. BYBLOCK mode makes it possible to
* start decompression at any arbitrary position in the stream.
*
* So this facility can easily be used to parallelize decompression
* of a large BZip2 file for performance reasons. (It is exactly
* what the Hadoop framework does; see LineRecordReader for an
* example.) So one can break the file, logically, into
* chunks for parallel processing. These "splits" should be like
* default Hadoop splits (e.g. as in FileInputFormat's getSplits method).
* This code is designed and tested for FileInputFormat's way
* of splitting only.
*/
private static class BZip2CompressionInputStream extends
SplitCompressionInputStream {
// class data starts here//
private CBZip2InputStream input;
boolean needsReset;
private BufferedInputStream bufferedIn;
private boolean isHeaderStripped = false;
private boolean isSubHeaderStripped = false;
private READ_MODE readMode = READ_MODE.CONTINUOUS;
private long startingPos = 0L;
// The following state machine handles the different states of the
// compressed stream position:
// HOLD : don't advertise the compressed stream position
// ADVERTISE : read 1 more character and advertise the stream position
// See more comments before the updatePos method.
private enum POS_ADVERTISEMENT_STATE_MACHINE {
HOLD, ADVERTISE
};
POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
long compressedStreamPosition = 0;
// class data ends here//
public BZip2CompressionInputStream(InputStream in) throws IOException {
this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
}
super(in);
needsReset = true;
public BZip2CompressionInputStream(InputStream in, long start, long end,
READ_MODE readMode) throws IOException {
super(in, start, end);
needsReset = false;
bufferedIn = new BufferedInputStream(super.in);
this.startingPos = super.getPos();
this.readMode = readMode;
if (this.startingPos == 0) {
// We only strip the header if this is the start of the file
bufferedIn = readStreamHeader();
}
input = new CBZip2InputStream(bufferedIn, readMode);
if (this.isHeaderStripped) {
input.updateReportedByteCount(HEADER_LEN);
}
if (this.isSubHeaderStripped) {
input.updateReportedByteCount(SUB_HEADER_LEN);
}
this.updatePos(false);
}
private BufferedInputStream readStreamHeader() throws IOException {
// We are flexible enough to allow the compressed stream not to
// start with the BZ header, so it works fine whether or not the
// header is present.
BufferedInputStream bufferedIn = null;
if (super.in != null) {
bufferedIn = new BufferedInputStream(super.in);
bufferedIn.mark(HEADER_LEN);
byte[] headerBytes = new byte[HEADER_LEN];
int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
@ -248,6 +355,17 @@ public class BZip2Codec implements
String header = new String(headerBytes);
if (header.compareTo(HEADER) != 0) {
bufferedIn.reset();
} else {
this.isHeaderStripped = true;
// In BYBLOCK mode, we also want to strip off the
// remaining two characters of the header.
if (this.readMode == READ_MODE.BYBLOCK) {
actualRead = bufferedIn.read(headerBytes, 0,
SUB_HEADER_LEN);
if (actualRead != -1) {
this.isSubHeaderStripped = true;
}
}
}
}
}
@ -267,33 +385,96 @@ public class BZip2Codec implements
}
}
/**
* This method updates the compressed stream position exactly when the
* client of this code has read off at least one byte past any BZip2
* end-of-block marker.
*
* This mechanism is very helpful for dealing with record boundaries in
* the data. Please see the constructor and next methods of
* org.apache.hadoop.mapred.LineRecordReader for an example usage of this
* feature. We elaborate with an example in the following:
*
* Assume two different scenarios of the BZip2 compressed stream, where
* [m] represents the end of a block, \n is the line delimiter and .
* represents compressed data.
*
* ............[m]......\n.......
*
* ..........\n[m]......\n.......
*
* Assume that end is right after [m]. In the first case the reading
* will stop at \n and there is no need to read one more line. (The
* reason for reading one more line in the next() method is explained
* in LineRecordReader.) In the second example, however, LineRecordReader
* needs to read one more line (till the second \n). Since BZip2Codec
* only updates the position at least one byte past a marker, it is
* straightforward to differentiate between the two cases.
*
*/
public int read(byte[] b, int off, int len) throws IOException {
if (needsReset) {
internalReset();
}
return this.input.read(b, off, len);
int result = 0;
result = this.input.read(b, off, len);
if (result == BZip2Constants.END_OF_BLOCK) {
this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
}
if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
result = this.input.read(b, off, 1);
// This is the precise time to update compressed stream position
// to the client of this code.
this.updatePos(true);
this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
}
return result;
}
public int read() throws IOException {
byte b[] = new byte[1];
int result = this.read(b, 0, 1);
return (result < 0) ? result : (b[0] & 0xff);
}
private void internalReset() throws IOException {
if (needsReset) {
needsReset = false;
BufferedInputStream bufferedIn = readStreamHeader();
input = new CBZip2InputStream(bufferedIn);
input = new CBZip2InputStream(bufferedIn, this.readMode);
}
}
public void resetState() throws IOException {
// Cannot read from bufferedIn at this point because bufferedIn might not be ready
// Cannot read from bufferedIn at this point because bufferedIn
// might not be ready
// yet, as in SequenceFile.Reader implementation.
needsReset = true;
}
public int read() throws IOException {
if (needsReset) {
internalReset();
public long getPos() {
return this.compressedStreamPosition;
}
return this.input.read();
/*
* As the comments before the read method explain, the
* compressed stream position is advertised only when at
* least one byte past an EOB marker has been read off.
* But there is an exception to this rule: when we
* construct the stream, we advertise the position exactly
* at the EOB. In the following method, the shouldAddOn
* boolean captures this exception.
*
*/
private void updatePos(boolean shouldAddOn) {
int addOn = shouldAddOn ? 1 : 0;
this.compressedStreamPosition = this.startingPos
+ this.input.getProcessedByteCount() + addOn;
}
}// end of BZip2CompressionInputStream
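The position-advertisement contract above is what lets a line-oriented reader decide whether the last record of a split belongs to it. A minimal sketch of such a consumer in the spirit of LineRecordReader (readSplit and process are hypothetical names; rawIn, start and end are assumed to come from a FileInputFormat-style split):

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.util.LineReader;

static void readSplit(FSDataInputStream rawIn, long start, long end)
    throws IOException {
  SplittableCompressionCodec codec = new BZip2Codec();
  SplitCompressionInputStream cin = codec.createInputStream(
      rawIn, codec.createDecompressor(), start, end,
      SplittableCompressionCodec.READ_MODE.BYBLOCK);
  LineReader reader = new LineReader(cin);
  Text line = new Text();
  if (cin.getAdjustedStart() != 0) {
    reader.readLine(line); // discard the likely-partial first line
  }
  // getPos() only moves past an end-of-block marker once at least one
  // byte beyond it has been read, so looping while getPos() <= end
  // naturally includes the one extra line discussed in the updatePos
  // javadoc when the split boundary falls right after a marker.
  while (cin.getPos() <= end && reader.readLine(line) > 0) {
    process(line); // hypothetical record consumer
  }
  cin.close();
}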

View File

@ -38,9 +38,10 @@ public class BlockDecompressorStream extends DecompressorStream {
* @param in input stream
* @param decompressor decompressor to use
* @param bufferSize size of buffer
* @throws IOException
*/
public BlockDecompressorStream(InputStream in, Decompressor decompressor,
int bufferSize) {
int bufferSize) throws IOException {
super(in, decompressor, bufferSize);
}
@ -49,12 +50,13 @@ public class BlockDecompressorStream extends DecompressorStream {
*
* @param in input stream
* @param decompressor decompressor to use
* @throws IOException
*/
public BlockDecompressorStream(InputStream in, Decompressor decompressor) {
public BlockDecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
super(in, decompressor);
}
protected BlockDecompressorStream(InputStream in) {
protected BlockDecompressorStream(InputStream in) throws IOException {
super(in);
}

View File

@ -21,6 +21,8 @@ package org.apache.hadoop.io.compress;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
/**
* A compression input stream.
*
@ -28,19 +30,25 @@ import java.io.InputStream;
* reposition the underlying input stream then call {@link #resetState()},
* without having to also synchronize client buffers.
*/
public abstract class CompressionInputStream extends InputStream {
public abstract class CompressionInputStream extends InputStream implements Seekable {
/**
* The input stream to be compressed.
*/
protected final InputStream in;
protected long maxAvailableData = 0L;
/**
* Create a compression input stream that reads
* the decompressed bytes from the given stream.
*
* @param in The input stream to be compressed.
* @throws IOException
*/
protected CompressionInputStream(InputStream in) {
protected CompressionInputStream(InputStream in) throws IOException {
if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)) {
this.maxAvailableData = in.available();
}
this.in = in;
}
@ -60,4 +68,40 @@ public abstract class CompressionInputStream extends InputStream {
*/
public abstract void resetState() throws IOException;
/**
* This method returns the current position in the stream.
*
* @return Current position in stream as a long
*/
public long getPos() throws IOException {
if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)){
// This way of getting the current position will not work for files
// whose size cannot fit in an int, and hence cannot be reported by
// the available method.
return (this.maxAvailableData - this.in.available());
}
else{
return ((Seekable)this.in).getPos();
}
}
/**
* This method is currently not supported.
*
* @throws UnsupportedOperationException
*/
public void seek(long pos) throws UnsupportedOperationException {
throw new UnsupportedOperationException();
}
/**
* This method is currently not supported.
*
* @throws UnsupportedOperationException
*/
public boolean seekToNewSource(long targetPos) throws UnsupportedOperationException {
throw new UnsupportedOperationException();
}
}
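For sources that are neither Seekable nor PositionedReadable, getPos() above falls back to maxAvailableData minus available(). A small self-contained sketch of that fallback (the round trip through BZip2Codec is illustrative; because of internal buffering, the reported position is an approximation between reads):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;

static void demoGetPosFallback() throws IOException {
  BZip2Codec codec = new BZip2Codec();
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  CompressionOutputStream cos = codec.createOutputStream(baos);
  cos.write("hello splittable bzip2".getBytes());
  cos.finish();
  cos.close();
  byte[] compressed = baos.toByteArray();

  // A ByteArrayInputStream is neither Seekable nor PositionedReadable,
  // so maxAvailableData is latched at construction time and getPos()
  // becomes maxAvailableData - in.available().
  CompressionInputStream cin =
      codec.createInputStream(new ByteArrayInputStream(compressed));
  while (cin.read() != -1) { } // drain the stream
  System.out.println("compressed position: " + cin.getPos()
      + " of " + compressed.length);
}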

View File

@ -30,7 +30,7 @@ public class DecompressorStream extends CompressionInputStream {
protected boolean eof = false;
protected boolean closed = false;
public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) {
public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) throws IOException {
super(in);
if (in == null || decompressor == null) {
@ -43,7 +43,7 @@ public class DecompressorStream extends CompressionInputStream {
buffer = new byte[bufferSize];
}
public DecompressorStream(InputStream in, Decompressor decompressor) {
public DecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
this(in, decompressor, 512);
}
@ -51,8 +51,9 @@ public class DecompressorStream extends CompressionInputStream {
* Allow derived classes to directly set the underlying stream.
*
* @param in Underlying input stream.
* @throws IOException
*/
protected DecompressorStream(InputStream in) {
protected DecompressorStream(InputStream in) throws IOException {
super(in);
}

View File

@ -103,8 +103,9 @@ public class GzipCodec extends DefaultCodec {
/**
* Allow subclasses to directly set the inflater stream.
* @throws IOException
*/
protected GzipInputStream(DecompressorStream in) {
protected GzipInputStream(DecompressorStream in) throws IOException {
super(in);
}

View File

@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
import java.io.IOException;
import java.io.InputStream;
/**
* An InputStream covering a range of compressed data. The start and end
* offsets requested by a client may be modified by the codec to fit block
* boundaries or other algorithm-dependent requirements.
*/
public abstract class SplitCompressionInputStream
extends CompressionInputStream {
private long start;
private long end;
public SplitCompressionInputStream(InputStream in, long start, long end)
throws IOException {
super(in);
this.start = start;
this.end = end;
}
protected void setStart(long start) {
this.start = start;
}
protected void setEnd(long end) {
this.end = end;
}
/**
* After calling createInputStream, the value of start may have been
* changed by the codec. This method returns the new value of start.
* @return The adjusted value of start
*/
public long getAdjustedStart() {
return start;
}
/**
* After calling createInputStream, the value of end may have been
* changed by the codec. This method returns the new value of end.
* @return The adjusted value of end
*/
public long getAdjustedEnd() {
return end;
}
}

View File

@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
import java.io.IOException;
import java.io.InputStream;
/**
* This interface is meant to be implemented by those compression codecs
* which are capable of compressing / decompressing a stream starting at
* any arbitrary position.
*
* The process of decompressing a stream starting at some arbitrary
* position is especially challenging. Most codecs are only able to
* successfully decompress a stream if they read it from the very
* beginning to the end. One of the reasons is the state stored at the
* beginning of the stream, which is crucial for decompression.
*
* Yet there are a few codecs which do not save the whole state at the
* beginning of the stream and hence can be used to decompress a stream
* starting at any arbitrary point. This interface is meant to be used by
* such codecs. Such codecs are highly valuable, especially in the
* context of Hadoop, because an input compressed file can be split and
* hence worked on by multiple machines in parallel.
*/
public interface SplittableCompressionCodec extends CompressionCodec {
/**
* During decompression, data can be read off from the decompressor in two
* modes, namely continuous and blocked. A few codecs (e.g. BZip2) are
* capable of compressing data in blocks and then decompressing the
* blocks. In blocked reading mode such codecs report 'end of block'
* events to their caller, while in continuous mode the caller is unaware
* of the blocks and uncompressed data is emitted as one continuous stream.
*/
public enum READ_MODE {CONTINUOUS, BYBLOCK};
/**
* Create a stream as dictated by the readMode. This method is used when
* the codec wants the ability to work with the underlying stream positions.
*
* @param seekableIn The seekable input stream (seeks in compressed data)
* @param start The start offset into the compressed stream. May be changed
* by the underlying codec.
* @param end The end offset into the compressed stream. May be changed by
* the underlying codec.
* @param readMode Controls whether the stream position is reported
* continuously from the compressed stream or only at block boundaries.
* @return a stream to read uncompressed bytes from
*/
SplitCompressionInputStream createInputStream(InputStream seekableIn,
Decompressor decompressor, long start, long end, READ_MODE readMode)
throws IOException;
}
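The adjusted-offset half of this contract is exercised by the new TestCodec case later in this commit: the caller proposes an arbitrary [pos, fileLen) range and the codec slides the start to a block boundary. A condensed sketch of that calling pattern (sampleSplit is a hypothetical name; fs, path, pos, fileLen and conf are assumed inputs):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

static void sampleSplit(FileSystem fs, Path path, long pos, long fileLen,
    Configuration conf) throws IOException {
  SplittableCompressionCodec codec =
      ReflectionUtils.newInstance(BZip2Codec.class, conf);
  Decompressor dcmp = CodecPool.getDecompressor(codec);
  try {
    SplitCompressionInputStream in = codec.createInputStream(
        fs.open(path), dcmp, pos, fileLen,
        SplittableCompressionCodec.READ_MODE.BYBLOCK);
    if (in.getAdjustedStart() >= fileLen) {
      return; // no complete block starts inside the requested range
    }
    // consume records between getAdjustedStart() and getAdjustedEnd()
  } finally {
    CodecPool.returnDecompressor(dcmp);
  }
}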

View File

@ -44,6 +44,14 @@ public interface BZip2Constants {
int N_ITERS = 4;
int MAX_SELECTORS = (2 + (900000 / G_SIZE));
int NUM_OVERSHOOT_BYTES = 20;
/**
* End of a BZip2 block
*/
public static final int END_OF_BLOCK = -2;
/**
* End of BZip2 stream.
*/
public static final int END_OF_STREAM = -1;
/**
* This array really shouldn't be here. Again, for historical purposes it

View File

@ -23,9 +23,13 @@
*/
package org.apache.hadoop.io.compress.bzip2;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
/**
* An input stream that decompresses from the BZip2 format (without the file
* header chars) to be read as any other stream.
@ -45,30 +49,43 @@ import java.io.IOException;
* </p>
*
* <p>
* This Ant code was enhanced so that it can decompress blocks of bzip2 data.
* The current position in the stream is an important statistic for Hadoop. For
* example, in LineRecordReader we depend solely on the current position in the
* stream to know about progress. The notion of position becomes complicated
* for compressed files: Hadoop splitting is done in terms of the compressed
* file, but a compressed file inflates to a much larger amount of data. We
* have handled this problem in the following way.
*
* At object creation time, we find the next block start delimiter. Once such a
* marker is found, the stream stops there (we discard any compressed data read
* in this process) and the position is updated (i.e. the caller of this class
* will find out the stream location). At this point we are ready for the
* actual reading (i.e. decompression) of data.
*
* The subsequent read calls give out data. The position is updated when the
* caller of this class has read off the current block + 1 bytes. In between
* block reads, the position is not updated. (We can only update the position
* on block boundaries.)
* </p>
*
* <p>
* Instances of this class are not threadsafe.
* </p>
*/
public class CBZip2InputStream extends InputStream implements BZip2Constants {
private static void reportCRCError() throws IOException {
throw new IOException("BZip2 CRC error");
}
private void makeMaps() {
final boolean[] inUse = this.data.inUse;
final byte[] seqToUnseq = this.data.seqToUnseq;
int nInUseShadow = 0;
for (int i = 0; i < 256; i++) {
if (inUse[i])
seqToUnseq[nInUseShadow++] = (byte) i;
}
this.nInUse = nInUseShadow;
}
public static final long BLOCK_DELIMITER = 0X314159265359L;// start of block
public static final long EOS_DELIMITER = 0X177245385090L;// end of bzip2 stream
private static final int DELIMITER_BIT_LENGTH = 48;
READ_MODE readMode = READ_MODE.CONTINUOUS;
// This variable records the current advertised position of the stream.
private long reportedBytesReadFromCompressedStream = 0L;
// The following variable keeps a record of compressed bytes read.
private long bytesReadFromCompressedStream = 0L;
private boolean lazyInitialization = false;
private byte array[] = new byte[1];
/**
* Index of the last char in the block, so the block size == last + 1.
@ -86,32 +103,34 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
*/
private int blockSize100k;
private boolean blockRandomised;
private boolean blockRandomised = false;
private int bsBuff;
private int bsLive;
private long bsBuff;
private long bsLive;
private final CRC crc = new CRC();
private int nInUse;
private InputStream in;
private BufferedInputStream in;
private int currentChar = -1;
private static final int EOF = 0;
private static final int START_BLOCK_STATE = 1;
private static final int RAND_PART_A_STATE = 2;
private static final int RAND_PART_B_STATE = 3;
private static final int RAND_PART_C_STATE = 4;
private static final int NO_RAND_PART_A_STATE = 5;
private static final int NO_RAND_PART_B_STATE = 6;
private static final int NO_RAND_PART_C_STATE = 7;
/**
* A state machine to keep track of the current state of the decoder
*
*/
public enum STATE {
EOF, START_BLOCK_STATE, RAND_PART_A_STATE, RAND_PART_B_STATE, RAND_PART_C_STATE, NO_RAND_PART_A_STATE, NO_RAND_PART_B_STATE, NO_RAND_PART_C_STATE, NO_PROCESS_STATE
};
private int currentState = START_BLOCK_STATE;
private STATE currentState = STATE.START_BLOCK_STATE;
private int storedBlockCRC, storedCombinedCRC;
private int computedBlockCRC, computedCombinedCRC;
private boolean skipResult = false;// used by skipToNextMarker
private static boolean skipDecompression = false;
// Variables used by setup* methods exclusively
private int su_count;
@ -129,6 +148,121 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
*/
private CBZip2InputStream.Data data;
/**
* This method reports the processed bytes so far. Please note that this
* statistic is only updated on block boundaries and only when the stream is
* initiated in BYBLOCK mode.
*/
public long getProcessedByteCount() {
return reportedBytesReadFromCompressedStream;
}
/**
* This method keeps track of raw processed compressed
* bytes.
*
* @param count count is the number of bytes to be
* added to raw processed bytes
*/
protected void updateProcessedByteCount(int count) {
this.bytesReadFromCompressedStream += count;
}
/**
* This method is called by the client of this
* class in case there are any corrections in
* the stream position. One common example is
* when the client of this code removes the starting BZ
* characters from the compressed stream.
*
* @param count count bytes are added to the reported bytes
*
*/
public void updateReportedByteCount(int count) {
this.reportedBytesReadFromCompressedStream += count;
this.updateProcessedByteCount(count);
}
/**
* This method reads a byte from the compressed stream. Whenever we need to
* read from the underlying compressed stream, this method should be called
* instead of directly calling the read method of the underlying compressed
* stream. This method does the important record keeping needed to track
* how many bytes have been read off the compressed stream.
*/
private int readAByte(InputStream inStream) throws IOException {
int read = inStream.read();
if (read >= 0) {
this.updateProcessedByteCount(1);
}
return read;
}
/**
* This method tries to find the marker (passed to it as the first parameter)
* in the stream. It can find bit patterns of length <= 63 bits. Specifically
* this method is used in CBZip2InputStream to find the end of block (EOB)
* delimiter in the stream, starting from the current position of the stream.
* If the marker is found, the stream position will be right after the
* marker at the end of this call.
*
* @param marker The bit pattern to be found in the stream
* @param markerBitLength No of bits in the marker
*
* @throws IOException
* @throws IllegalArgumentException if markerBitLength is greater than 63
*/
public boolean skipToNextMarker(long marker, int markerBitLength)
throws IOException, IllegalArgumentException {
try {
if (markerBitLength > 63) {
throw new IllegalArgumentException(
"skipToNextMarker can not find patterns greater than 63 bits");
}
// pick the next markerBitLength bits in the stream
long bytes = 0;
bytes = this.bsR(markerBitLength);
if (bytes == -1) {
return false;
}
while (true) {
if (bytes == marker) {
return true;
} else {
bytes = bytes << 1;
bytes = bytes & ((1L << markerBitLength) - 1);
int oneBit = (int) this.bsR(1);
if (oneBit != -1) {
bytes = bytes | oneBit;
} else
return false;
}
}
} catch (IOException ex) {
return false;
}
}
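The search above maintains a markerBitLength-bit window over the stream: shift left by one, mask down to the window width, and OR in the next bit. The same windowing technique on a plain byte array, as a toy standalone illustration (findPattern is not part of the patch):

// Finds an 8-bit pattern in a byte array, sliding one bit at a time,
// just as skipToNextMarker slides its up-to-63-bit window.
static int findPattern(byte[] data, int pattern) {
  int window = 0;
  for (int bit = 0; bit < data.length * 8; bit++) {
    int b = (data[bit / 8] >> (7 - (bit % 8))) & 1; // next bit, MSB first
    window = ((window << 1) | b) & 0xFF;            // keep only 8 bits
    if (bit >= 7 && window == pattern) {
      return bit - 7; // bit offset where the pattern starts
    }
  }
  return -1; // not found
}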
protected void reportCRCError() throws IOException {
throw new IOException("crc error");
}
private void makeMaps() {
final boolean[] inUse = this.data.inUse;
final byte[] seqToUnseq = this.data.seqToUnseq;
int nInUseShadow = 0;
for (int i = 0; i < 256; i++) {
if (inUse[i])
seqToUnseq[nInUseShadow++] = (byte) i;
}
this.nInUse = nInUseShadow;
}
/**
* Constructs a new CBZip2InputStream which decompresses bytes read from the
* specified stream.
@ -145,21 +279,99 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
* @throws NullPointerException
* if <tt>in == null</tt>
*/
public CBZip2InputStream(final InputStream in) throws IOException {
super();
public CBZip2InputStream(final InputStream in, READ_MODE readMode)
throws IOException {
this.in = in;
super();
int blockSize = 0X39;// i.e. 9
this.blockSize100k = blockSize - '0';
this.in = new BufferedInputStream(in, 1024 * 9);// 9 KB buffer
this.readMode = readMode;
if (readMode == READ_MODE.CONTINUOUS) {
currentState = STATE.START_BLOCK_STATE;
lazyInitialization = (in.available() == 0)?true:false;
if(!lazyInitialization){
init();
}
} else if (readMode == READ_MODE.BYBLOCK) {
this.currentState = STATE.NO_PROCESS_STATE;
skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER,DELIMITER_BIT_LENGTH);
this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
if(!skipDecompression){
changeStateToProcessABlock();
}
}
}
/**
* Returns the number of bytes between the current stream position
* and the immediate next BZip2 block marker.
*
* @param in
* The InputStream
*
* @return long Number of bytes between current stream position and the
* next BZip2 block start marker.
* @throws IOException
*
*/
public static long numberOfBytesTillNextMarker(final InputStream in) throws IOException{
CBZip2InputStream.skipDecompression = true;
CBZip2InputStream anObject = null;
anObject = new CBZip2InputStream(in, READ_MODE.BYBLOCK);
return anObject.getProcessedByteCount();
}
public CBZip2InputStream(final InputStream in) throws IOException {
this(in, READ_MODE.CONTINUOUS);
}
private void changeStateToProcessABlock() throws IOException {
if (skipResult == true) {
initBlock();
setupBlock();
} else {
this.currentState = STATE.EOF;
}
}
public int read() throws IOException {
if (this.in != null) {
return read0();
int result = this.read(array, 0, 1);
int value = 0XFF & array[0];
return (result > 0 ? value : result);
} else {
throw new IOException("stream closed");
}
}
/**
* In CONTINUOUS reading mode, this read method starts at the
* beginning of the compressed stream and ends at the end of file,
* emitting uncompressed data. In this mode the stream position
* is not announced and should be ignored.
*
* In BYBLOCK reading mode, this read method signals the end
* of a BZip2 block by returning EOB. At this event, the compressed
* stream position is also announced. This announcement tells
* how much of the compressed stream has been decompressed and read
* out of this class. In between EOB events, the stream position is
* not updated.
*
* @throws IOException
* if the stream content is malformed or an I/O error occurs.
*
* @return int Return values greater than 0 are the number of bytes read.
* A value of -1 means end of stream, while -2 represents end of block.
*/
public int read(final byte[] dest, final int offs, final int len)
throws IOException {
if (offs < 0) {
@ -176,13 +388,39 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
throw new IOException("stream closed");
}
final int hi = offs + len;
int destOffs = offs;
for (int b; (destOffs < hi) && ((b = read0()) >= 0);) {
dest[destOffs++] = (byte) b;
if(lazyInitialization){
this.init();
this.lazyInitialization = false;
}
return (destOffs == offs) ? -1 : (destOffs - offs);
if(skipDecompression){
changeStateToProcessABlock();
CBZip2InputStream.skipDecompression = false;
}
final int hi = offs + len;
int destOffs = offs;
int b = 0;
for (; ((destOffs < hi) && ((b = read0())) >= 0);) {
dest[destOffs++] = (byte) b;
}
int result = destOffs - offs;
if (result == 0) {
//report 'end of block' or 'end of stream'
result = b;
skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER, DELIMITER_BIT_LENGTH);
//Exactly when we are about to start a new block, we advertise the stream position.
this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
changeStateToProcessABlock();
}
return result;
}
private int read0() throws IOException {
@ -190,7 +428,10 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
switch (this.currentState) {
case EOF:
return -1;
return END_OF_STREAM;// return -1
case NO_PROCESS_STATE:
return END_OF_BLOCK;// return -2
case START_BLOCK_STATE:
throw new IllegalStateException();
@ -225,13 +466,13 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
private void init() throws IOException {
int magic2 = this.in.read();
int magic2 = this.readAByte(in);
if (magic2 != 'h') {
throw new IOException("Stream is not BZip2 formatted: expected 'h'"
+ " as first byte but got '" + (char) magic2 + "'");
}
int blockSize = this.in.read();
int blockSize = this.readAByte(in);
if ((blockSize < '1') || (blockSize > '9')) {
throw new IOException("Stream is not BZip2 formatted: illegal "
+ "blocksize " + (char) blockSize);
@ -244,6 +485,27 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
private void initBlock() throws IOException {
if (this.readMode == READ_MODE.BYBLOCK) {
// this.checkBlockIntegrity();
this.storedBlockCRC = bsGetInt();
this.blockRandomised = bsR(1) == 1;
/**
* Allocate data here instead of in the constructor, so we do not allocate
* it if the input file is empty.
*/
if (this.data == null) {
this.data = new Data(this.blockSize100k);
}
// currBlockNo++;
getAndMoveToFrontDecode();
this.crc.initialiseCRC();
this.currentState = STATE.START_BLOCK_STATE;
return;
}
char magic0 = bsGetUByte();
char magic1 = bsGetUByte();
char magic2 = bsGetUByte();
@ -261,7 +523,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
magic4 != 0x53 || // 'S'
magic5 != 0x59 // 'Y'
) {
this.currentState = EOF;
this.currentState = STATE.EOF;
throw new IOException("bad block header");
} else {
this.storedBlockCRC = bsGetInt();
@ -279,7 +541,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
getAndMoveToFrontDecode();
this.crc.initialiseCRC();
this.currentState = START_BLOCK_STATE;
this.currentState = STATE.START_BLOCK_STATE;
}
}
@ -304,7 +566,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
private void complete() throws IOException {
this.storedCombinedCRC = bsGetInt();
this.currentState = EOF;
this.currentState = STATE.EOF;
this.data = null;
if (this.storedCombinedCRC != this.computedCombinedCRC) {
@ -326,14 +588,14 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
}
private int bsR(final int n) throws IOException {
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
private long bsR(final long n) throws IOException {
long bsLiveShadow = this.bsLive;
long bsBuffShadow = this.bsBuff;
if (bsLiveShadow < n) {
final InputStream inShadow = this.in;
do {
int thech = inShadow.read();
int thech = readAByte(inShadow);
if (thech < 0) {
throw new IOException("unexpected end of stream");
@ -347,15 +609,15 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
this.bsLive = bsLiveShadow - n;
return (bsBuffShadow >> (bsLiveShadow - n)) & ((1 << n) - 1);
return (bsBuffShadow >> (bsLiveShadow - n)) & ((1L << n) - 1);
}
private boolean bsGetBit() throws IOException {
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
long bsLiveShadow = this.bsLive;
long bsBuffShadow = this.bsBuff;
if (bsLiveShadow < 1) {
int thech = this.in.read();
int thech = this.readAByte(in);
if (thech < 0) {
throw new IOException("unexpected end of stream");
@ -375,7 +637,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
private int bsGetInt() throws IOException {
return (((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8);
return (int) ((((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8));
}
/**
@ -454,8 +716,8 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
final int alphaSize = this.nInUse + 2;
/* Now the selectors */
final int nGroups = bsR(3);
final int nSelectors = bsR(15);
final int nGroups = (int) bsR(3);
final int nSelectors = (int) bsR(15);
for (int i = 0; i < nSelectors; i++) {
int j = 0;
@ -486,7 +748,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
/* Now the coding tables */
for (int t = 0; t < nGroups; t++) {
int curr = bsR(5);
int curr = (int) bsR(5);
final char[] len_t = len[t];
for (int i = 0; i < alphaSize; i++) {
while (bsGetBit()) {
@ -532,7 +794,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
private void getAndMoveToFrontDecode() throws IOException {
this.origPtr = bsR(24);
this.origPtr = (int) bsR(24);
recvDecodingTables();
final InputStream inShadow = this.in;
@ -562,8 +824,8 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
int groupPos = G_SIZE - 1;
final int eob = this.nInUse + 1;
int nextSym = getAndMoveToFrontDecode0(0);
int bsBuffShadow = this.bsBuff;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = (int) this.bsBuff;
int bsLiveShadow = (int) this.bsLive;
int lastShadow = -1;
int zt = selector[groupNo] & 0xff;
int[] base_zt = base[zt];
@ -597,10 +859,8 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
int zn = minLens_zt;
// Inlined:
// int zvec = bsR(zn);
while (bsLiveShadow < zn) {
final int thech = inShadow.read();
final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@ -609,14 +869,14 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
throw new IOException("unexpected end of stream");
}
}
int zvec = (bsBuffShadow >> (bsLiveShadow - zn))
long zvec = (bsBuffShadow >> (bsLiveShadow - zn))
& ((1 << zn) - 1);
bsLiveShadow -= zn;
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
final int thech = inShadow.read();
final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@ -630,7 +890,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
zvec = (zvec << 1)
| ((bsBuffShadow >> bsLiveShadow) & 1);
}
nextSym = perm_zt[zvec - base_zt[zn]];
nextSym = perm_zt[(int) (zvec - base_zt[zn])];
}
final byte ch = seqToUnseq[yy[0]];
@ -680,10 +940,8 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
int zn = minLens_zt;
// Inlined:
// int zvec = bsR(zn);
while (bsLiveShadow < zn) {
final int thech = inShadow.read();
final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@ -699,7 +957,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
final int thech = inShadow.read();
final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@ -709,7 +967,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
}
}
bsLiveShadow--;
zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1);
zvec = ((zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1));
}
nextSym = perm_zt[zvec - base_zt[zn]];
}
@ -726,14 +984,14 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
final int zt = dataShadow.selector[groupNo] & 0xff;
final int[] limit_zt = dataShadow.limit[zt];
int zn = dataShadow.minLens[zt];
int zvec = bsR(zn);
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
int zvec = (int) bsR(zn);
int bsLiveShadow = (int) this.bsLive;
int bsBuffShadow = (int) this.bsBuff;
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
final int thech = inShadow.read();
final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
@ -807,12 +1065,16 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
this.su_ch2 = su_ch2Shadow ^= (this.su_rNToGo == 1) ? 1 : 0;
this.su_i2++;
this.currentChar = su_ch2Shadow;
this.currentState = RAND_PART_B_STATE;
this.currentState = STATE.RAND_PART_B_STATE;
this.crc.updateCRC(su_ch2Shadow);
} else {
endBlock();
if (readMode == READ_MODE.CONTINUOUS) {
initBlock();
setupBlock();
} else if (readMode == READ_MODE.BYBLOCK) {
this.currentState = STATE.NO_PROCESS_STATE;
}
}
}
@ -824,19 +1086,23 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
this.su_tPos = this.data.tt[this.su_tPos];
this.su_i2++;
this.currentChar = su_ch2Shadow;
this.currentState = NO_RAND_PART_B_STATE;
this.currentState = STATE.NO_RAND_PART_B_STATE;
this.crc.updateCRC(su_ch2Shadow);
} else {
this.currentState = NO_RAND_PART_A_STATE;
this.currentState = STATE.NO_RAND_PART_A_STATE;
endBlock();
if (readMode == READ_MODE.CONTINUOUS) {
initBlock();
setupBlock();
} else if (readMode == READ_MODE.BYBLOCK) {
this.currentState = STATE.NO_PROCESS_STATE;
}
}
}
private void setupRandPartB() throws IOException {
if (this.su_ch2 != this.su_chPrev) {
this.currentState = RAND_PART_A_STATE;
this.currentState = STATE.RAND_PART_A_STATE;
this.su_count = 1;
setupRandPartA();
} else if (++this.su_count >= 4) {
@ -851,13 +1117,13 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
this.su_rNToGo--;
}
this.su_j2 = 0;
this.currentState = RAND_PART_C_STATE;
this.currentState = STATE.RAND_PART_C_STATE;
if (this.su_rNToGo == 1) {
this.su_z ^= 1;
}
setupRandPartC();
} else {
this.currentState = RAND_PART_A_STATE;
this.currentState = STATE.RAND_PART_A_STATE;
setupRandPartA();
}
}
@ -868,7 +1134,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
this.crc.updateCRC(this.su_ch2);
this.su_j2++;
} else {
this.currentState = RAND_PART_A_STATE;
this.currentState = STATE.RAND_PART_A_STATE;
this.su_i2++;
this.su_count = 0;
setupRandPartA();
@ -895,7 +1161,7 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
this.currentChar = su_ch2Shadow;
this.crc.updateCRC(su_ch2Shadow);
this.su_j2++;
this.currentState = NO_RAND_PART_C_STATE;
this.currentState = STATE.NO_RAND_PART_C_STATE;
} else {
this.su_i2++;
this.su_count = 0;

View File

@ -19,31 +19,38 @@ package org.apache.hadoop.io.compress;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Random;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.RandomDatum;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;
public class TestCodec extends TestCase {
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Test;
import static org.junit.Assert.*;
public class TestCodec {
private static final Log LOG=
LogFactory.getLog(TestCodec.class);
@ -51,17 +58,20 @@ public class TestCodec extends TestCase {
private Configuration conf = new Configuration();
private int count = 10000;
private int seed = new Random().nextInt();
@Test
public void testDefaultCodec() throws IOException {
codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.DefaultCodec");
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.DefaultCodec");
}
@Test
public void testGzipCodec() throws IOException {
codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.GzipCodec");
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.GzipCodec");
}
@Test
public void testBZip2Codec() throws IOException {
codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.BZip2Codec");
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec");
@ -133,6 +143,109 @@ public class TestCodec extends TestCase {
LOG.info("SUCCESS! Completed checking " + count + " records");
}
@Test
public void testSplitableCodecs() throws Exception {
testSplitableCodec(BZip2Codec.class);
}
private void testSplitableCodec(
Class<? extends SplittableCompressionCodec> codecClass)
throws IOException {
final long DEFLBYTES = 2 * 1024 * 1024;
final Configuration conf = new Configuration();
final Random rand = new Random();
final long seed = rand.nextLong();
LOG.info("seed: " + seed);
rand.setSeed(seed);
SplittableCompressionCodec codec =
ReflectionUtils.newInstance(codecClass, conf);
final FileSystem fs = FileSystem.getLocal(conf);
final FileStatus infile =
fs.getFileStatus(writeSplitTestFile(fs, rand, codec, DEFLBYTES));
if (infile.getLen() > Integer.MAX_VALUE) {
fail("Unexpected compression: " + DEFLBYTES + " -> " + infile.getLen());
}
final int flen = (int) infile.getLen();
final Text line = new Text();
final Decompressor dcmp = CodecPool.getDecompressor(codec);
try {
for (int pos = 0; pos < infile.getLen(); pos += rand.nextInt(flen / 8)) {
// read from random positions, verifying that there exist two sequential
// lines as written in writeSplitTestFile
final SplitCompressionInputStream in =
codec.createInputStream(fs.open(infile.getPath()), dcmp,
pos, flen, SplittableCompressionCodec.READ_MODE.BYBLOCK);
if (in.getAdjustedStart() >= flen) {
break;
}
LOG.info("SAMPLE " + in.getAdjustedStart() + "," + in.getAdjustedEnd());
final LineReader lreader = new LineReader(in);
lreader.readLine(line); // ignore; likely partial
if (in.getPos() >= flen) {
break;
}
lreader.readLine(line);
final int seq1 = readLeadingInt(line);
lreader.readLine(line);
if (in.getPos() >= flen) {
break;
}
final int seq2 = readLeadingInt(line);
assertEquals("Mismatched lines", seq1 + 1, seq2);
}
} finally {
CodecPool.returnDecompressor(dcmp);
}
// remove on success
fs.delete(infile.getPath().getParent(), true);
}
private static int readLeadingInt(Text txt) throws IOException {
DataInputStream in =
new DataInputStream(new ByteArrayInputStream(txt.getBytes()));
return in.readInt();
}
/** Write infLen bytes (deflated) to file in test dir using codec.
* Records are of the form
* &lt;i&gt;&lt;b64 rand&gt;&lt;i+1&gt;&lt;b64 rand&gt;
*/
private static Path writeSplitTestFile(FileSystem fs, Random rand,
CompressionCodec codec, long infLen) throws IOException {
final int REC_SIZE = 1024;
final Path wd = new Path(new Path(
System.getProperty("test.build.data", "/tmp")).makeQualified(fs),
codec.getClass().getSimpleName());
final Path file = new Path(wd, "test" + codec.getDefaultExtension());
final byte[] b = new byte[REC_SIZE];
final Base64 b64 = new Base64();
DataOutputStream fout = null;
Compressor cmp = CodecPool.getCompressor(codec);
try {
fout = new DataOutputStream(codec.createOutputStream(
fs.create(file, true), cmp));
final DataOutputBuffer dob = new DataOutputBuffer(REC_SIZE * 4 / 3 + 4);
int seq = 0;
while (infLen > 0) {
rand.nextBytes(b);
final byte[] b64enc = b64.encode(b); // ensures rand printable, no LF
dob.reset();
dob.writeInt(seq);
System.arraycopy(dob.getData(), 0, b64enc, 0, dob.getLength());
fout.write(b64enc);
fout.write('\n');
++seq;
infLen -= b64enc.length;
}
LOG.info("Wrote " + seq + " records to " + file);
} finally {
IOUtils.cleanup(LOG, fout);
CodecPool.returnCompressor(cmp);
}
return file;
}
@Test
public void testCodecPoolGzipReuse() throws Exception {
Configuration conf = new Configuration();
conf.setBoolean("hadoop.native.lib", true);
@ -149,19 +262,21 @@ public class TestCodec extends TestCase {
assertTrue("Got mismatched ZlibCompressor", c2 != CodecPool.getCompressor(gzc));
}
public void testSequenceFileDefaultCodec() throws IOException, ClassNotFoundException,
@Test
public void testSequenceFileDefaultCodec() throws IOException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.DefaultCodec", 100);
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.DefaultCodec", 1000000);
}
public void testSequenceFileBZip2Codec() throws IOException, ClassNotFoundException,
@Test
public void testSequenceFileBZip2Codec() throws IOException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
sequenceFileCodecTest(conf, 0, "org.apache.hadoop.io.compress.BZip2Codec", 100);
sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.BZip2Codec", 100);
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000);
}
private static void sequenceFileCodecTest(Configuration conf, int lines,
String codecClass, int blockSize)
throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
@ -242,8 +357,4 @@ public class TestCodec extends TestCase {
}
public TestCodec(String name) {
super(name);
}
}