HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created stream is equal to start of split. Contributed by Ankit Kamboj
(cherry picked from commit d02fb53750)
This commit is contained in:
parent
53ecb63587
commit
2b408d8dc7
|
@ -322,6 +322,9 @@ Release 2.7.0 - UNRELEASED
|
||||||
HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
|
HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
|
||||||
and ShellBasedIdMapping (vinayakumarb)
|
and ShellBasedIdMapping (vinayakumarb)
|
||||||
|
|
||||||
|
HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly
|
||||||
|
created stream is equal to start of split (Ankit Kamboj via jlowe)
|
||||||
|
|
||||||
Release 2.6.0 - 2014-11-18
|
Release 2.6.0 - 2014-11-18
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -225,7 +225,7 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec {
|
||||||
// ........................................^^[We align at wrong position!]
|
// ........................................^^[We align at wrong position!]
|
||||||
// ...........................................................^^[While this pos is correct]
|
// ...........................................................^^[While this pos is correct]
|
||||||
|
|
||||||
if (in.getPos() <= start) {
|
if (in.getPos() < start) {
|
||||||
((Seekable)seekableIn).seek(start);
|
((Seekable)seekableIn).seek(start);
|
||||||
in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
|
in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,6 +106,27 @@ public class TestLineRecordReader {
|
||||||
testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
|
testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//This test ensures record reader doesn't lose records when it starts
|
||||||
|
//exactly at the starting byte of a bz2 compressed block
|
||||||
|
@Test
|
||||||
|
public void testBzip2SplitStartAtBlockMarker() throws IOException {
|
||||||
|
//136504 in blockEndingInCR.txt.bz2 is the byte at which the bz2 block ends
|
||||||
|
//In the following test cases record readers should iterate over all the records
|
||||||
|
//and should not miss any record.
|
||||||
|
|
||||||
|
//Start next split exactly at the start of the block.
|
||||||
|
testSplitRecords("blockEndingInCR.txt.bz2", 136504);
|
||||||
|
|
||||||
|
//Start next split a byte forward in next block.
|
||||||
|
testSplitRecords("blockEndingInCR.txt.bz2", 136505);
|
||||||
|
|
||||||
|
//Start next split 3 bytes forward in next block.
|
||||||
|
testSplitRecords("blockEndingInCR.txt.bz2", 136508);
|
||||||
|
|
||||||
|
//Start next split 10 bytes before the end marker.
|
||||||
|
testSplitRecords("blockEndingInCR.txt.bz2", 136494);
|
||||||
|
}
|
||||||
|
|
||||||
// Use the LineRecordReader to read records from the file
|
// Use the LineRecordReader to read records from the file
|
||||||
public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
|
public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
Loading…
Reference in New Issue