HADOOP-13270. BZip2CompressionInputStream finds the same compression marker twice in corner case, causing duplicate data blocks. Contributed by Kai Sasaki.

(cherry picked from commit e3ba9ad3f1) (cherry picked from commit d219550e8b)
2016-06-14 10:18:17 +09:00 · 2016-06-14 10:18:17 +09:00 · 8295351e54
parent 56e29d2711
commit 8295351e54
2 changed files with 71 additions and 60 deletions
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
@ -207,7 +207,12 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec {
    // time stream might start without a leading BZ.
    final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
      CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
-    long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);
+    long adjStart = 0L;
    if (start != 0) {
      // Other than the first of file, the marker size is 6 bytes.
      adjStart = Math.max(0L, start - (FIRST_BZIP2_BLOCK_MARKER_POSITION
          - (HEADER_LEN + SUB_HEADER_LEN)));
    }
    ((Seekable)seekableIn).seek(adjStart);
    SplitCompressionInputStream in =
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java
@ -175,6 +175,20 @@ public class TestTextInputFormat {
    for (int length = MAX_LENGTH / 2; length < MAX_LENGTH;
        length += random.nextInt(MAX_LENGTH / 4)+1) {
      for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
        verifyPartitions(length, numSplits, file, codec, conf);
      }
    }
    // corner case when we have byte alignment and position of stream are same
    verifyPartitions(471507, 218, file, codec, conf);
    verifyPartitions(473608, 110, file, codec, conf);
  }
  private void verifyPartitions(int length, int numSplits, Path file,
      CompressionCodec codec, JobConf conf) throws IOException {
    LOG.info("creating; entries = " + length);
@ -195,26 +209,22 @@ public class TestTextInputFormat {
    format.configure(conf);
    LongWritable key = new LongWritable();
    Text value = new Text();
      for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(MAX_LENGTH/2000)+1;
    LOG.info("splitting: requesting = " + numSplits);
    InputSplit[] splits = format.getSplits(conf, numSplits);
    LOG.info("splitting: got =        " + splits.length);
    // check each split
    BitSet bits = new BitSet(length);
    for (int j = 0; j < splits.length; j++) {
      LOG.debug("split["+j+"]= " + splits[j]);
      RecordReader<LongWritable, Text> reader =
-            format.getRecordReader(splits[j], conf, reporter);
+              format.getRecordReader(splits[j], conf, Reporter.NULL);
      try {
        int counter = 0;
        while (reader.next(key, value)) {
          int v = Integer.parseInt(value.toString());
          LOG.debug("read " + v);
          if (bits.get(v)) {
            LOG.warn("conflict with " + v +
                    " in split " + j +
@ -236,10 +246,6 @@ public class TestTextInputFormat {
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
    }
  }
  private static LineReader makeStream(String str) throws IOException {
    return new LineReader(new ByteArrayInputStream
                                             (str.getBytes("UTF-8")),