From 347997c9596a4f7c2e32e39c2e30e9152ba0e7a3 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@apache.org>
Date: Tue, 31 Oct 2017 09:30:13 -0500
Subject: [PATCH] HADOOP-14919. BZip2 drops records when reading data in
 splits. Contributed by Jason Lowe

(cherry picked from commit 2fae63aa60c43b62bd908a9499562fe528603185)
---
 .../apache/hadoop/io/compress/BZip2Codec.java | 39 +---------
 .../io/compress/bzip2/CBZip2InputStream.java  | 32 +++++---
 .../hadoop/mapred/TestTextInputFormat.java    | 76 +++++++++++++++++++
 3 files changed, 98 insertions(+), 49 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
index 331606eb449..db781187315 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
@@ -204,43 +204,8 @@ public class BZip2Codec implements Configurable, SplittableCompressionCodec {
           Seekable.class.getName());
     }
 
-    //find the position of first BZip2 start up marker
-    ((Seekable)seekableIn).seek(0);
-
-    // BZip2 start of block markers are of 6 bytes.  But the very first block
-    // also has "BZh9", making it 10 bytes.  This is the common case.  But at
-    // time stream might start without a leading BZ.
-    final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
-      CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
-    long adjStart = 0L;
-    if (start != 0) {
-      // Other than the first of file, the marker size is 6 bytes.
-      adjStart = Math.max(0L, start - (FIRST_BZIP2_BLOCK_MARKER_POSITION
-          - (HEADER_LEN + SUB_HEADER_LEN)));
-    }
-
-    ((Seekable)seekableIn).seek(adjStart);
-    SplitCompressionInputStream in =
-      new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);
-
-
-    // The following if clause handles the following case:
-    // Assume the following scenario in BZip2 compressed stream where
-    // . represent compressed data.
-    // .....[48 bit Block].....[48 bit   Block].....[48 bit Block]...
-    // ........................[47 bits][1 bit].....[48 bit Block]...
-    // ................................^[Assume a Byte alignment here]
-    // ........................................^^[current position of stream]
-    // .....................^^[We go back 10 Bytes in stream and find a Block marker]
-    // ........................................^^[We align at wrong position!]
-    // ...........................................................^^[While this pos is correct]
-
-    if (in.getPos() < start) {
-      ((Seekable)seekableIn).seek(start);
-      in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
-    }
-
-    return in;
+    ((Seekable)seekableIn).seek(start);
+    return new BZip2CompressionInputStream(seekableIn, start, end, readMode);
   }
 
   /**
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java
index 57fc07b617a..edb37fee848 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java
@@ -52,20 +52,20 @@ import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
  * This Ant code was enhanced so that it can de-compress blocks of bzip2 data.
  * Current position in the stream is an important statistic for Hadoop. For
  * example in LineRecordReader, we solely depend on the current position in the
- * stream to know about the progess. The notion of position becomes complicated
+ * stream to know about the progress. The notion of position becomes complicated
  * for compressed files. The Hadoop splitting is done in terms of compressed
  * file. But a compressed file deflates to a large amount of data. So we have
  * handled this problem in the following way.
  *
  * On object creation time, we find the next block start delimiter. Once such a
  * marker is found, the stream stops there (we discard any read compressed data
- * in this process) and the position is updated (i.e. the caller of this class
- * will find out the stream location). At this point we are ready for actual
- * reading (i.e. decompression) of data.
+ * in this process) and the position is reported as the beginning of the block
+ * start delimiter. At this point we are ready for actual reading
+ * (i.e. decompression) of data.
  *
  * The subsequent read calls give out data. The position is updated when the
  * caller of this class has read off the current block + 1 bytes. In between the
- * block reading, position is not updated. (We can only update the postion on
+ * block reading, position is not updated. (We can only update the position on
  * block boundaries).
  * </p>
  *
@@ -204,11 +204,12 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
   * in the stream.  It can find bit patterns of length <= 63 bits.  Specifically
   * this method is used in CBZip2InputStream to find the end of block (EOB)
   * delimiter in the stream, starting from the current position of the stream.
-  * If marker is found, the stream position will be right after marker at the
-  * end of this call.
+  * If marker is found, the stream position will be at the byte containing
+  * the starting bit of the marker.
   *
   * @param marker  The bit pattern to be found in the stream
   * @param markerBitLength  No of bits in the marker
+  * @return true if the marker was found otherwise false
   *
   * @throws IOException
   * @throws IllegalArgumentException  if marketBitLength is greater than 63
@@ -224,23 +225,33 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
       long bytes = 0;
       bytes = this.bsR(markerBitLength);
       if (bytes == -1) {
+        this.reportedBytesReadFromCompressedStream =
+            this.bytesReadFromCompressedStream;
         return false;
       }
       while (true) {
         if (bytes == marker) {
+          // Report the byte position where the marker starts
+          long markerBytesRead = (markerBitLength + this.bsLive + 7) / 8;
+          this.reportedBytesReadFromCompressedStream =
+              this.bytesReadFromCompressedStream - markerBytesRead;
           return true;
-
         } else {
           bytes = bytes << 1;
           bytes = bytes & ((1L << markerBitLength) - 1);
           int oneBit = (int) this.bsR(1);
           if (oneBit != -1) {
             bytes = bytes | oneBit;
-          } else
+          } else {
+            this.reportedBytesReadFromCompressedStream =
+                this.bytesReadFromCompressedStream;
             return false;
+          }
         }
       }
     } catch (IOException ex) {
+      this.reportedBytesReadFromCompressedStream =
+          this.bytesReadFromCompressedStream;
       return false;
     }
   }
@@ -302,7 +313,6 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
     } else if (readMode == READ_MODE.BYBLOCK) {
       this.currentState = STATE.NO_PROCESS_STATE;
       skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER,DELIMITER_BIT_LENGTH);
-      this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
       if(!skipDecompression){
         changeStateToProcessABlock();
       }
@@ -419,8 +429,6 @@ public class CBZip2InputStream extends InputStream implements BZip2Constants {
       result = b;
 
       skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER, DELIMITER_BIT_LENGTH);
-      //Exactly when we are about to start a new block, we advertise the stream position.
-      this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
 
       changeStateToProcessABlock();
     }
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java
index 5106c3843f2..220d4a9f5db 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestTextInputFormat.java
@@ -186,6 +186,82 @@ public class TestTextInputFormat {
     verifyPartitions(473608, 110, file, codec, conf);
   }
 
+  // Test a corner case when position of stream is right after BZip2 marker
+  @Test (timeout=900000)
+  public void testSplitableCodecs2() throws IOException {
+    JobConf conf = new JobConf(defaultConf);
+    // Create the codec
+    CompressionCodec codec = null;
+    try {
+      codec = (CompressionCodec)
+      ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
+    } catch (ClassNotFoundException cnfe) {
+      throw new IOException("Illegal codec!");
+    }
+    Path file = new Path(workDir, "test"+codec.getDefaultExtension());
+
+    FileSystem localFs = FileSystem.getLocal(conf);
+    localFs.delete(workDir, true);
+    FileInputFormat.setInputPaths(conf, workDir);
+
+    int length = 250000;
+    LOG.info("creating; entries = " + length);
+    // create a file with length entries
+    Writer writer =
+        new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
+    try {
+      for (int i = 0; i < length; i++) {
+        writer.write(Integer.toString(i));
+        writer.write("\n");
+      }
+    } finally {
+      writer.close();
+    }
+
+    // Test split positions around a block boundary where the block does
+    // not start on a byte boundary.
+    for (long splitpos = 203418; splitpos < 203430; ++splitpos) {
+      TextInputFormat format = new TextInputFormat();
+      format.configure(conf);
+      LOG.info("setting block size of the input file to " + splitpos);
+      conf.setLong("mapreduce.input.fileinputformat.split.minsize", splitpos);
+      LongWritable key = new LongWritable();
+      Text value = new Text();
+      InputSplit[] splits = format.getSplits(conf, 2);
+      LOG.info("splitting: got =        " + splits.length);
+
+      // check each split
+      BitSet bits = new BitSet(length);
+      for (int j = 0; j < splits.length; j++) {
+        LOG.debug("split[" + j + "]= " + splits[j]);
+        RecordReader<LongWritable, Text> reader =
+            format.getRecordReader(splits[j], conf, Reporter.NULL);
+        try {
+          int counter = 0;
+          while (reader.next(key, value)) {
+            int v = Integer.parseInt(value.toString());
+            LOG.debug("read " + v);
+            if (bits.get(v)) {
+              LOG.warn("conflict with " + v + " in split " + j +
+                  " at position " + reader.getPos());
+            }
+            assertFalse("Key in multiple partitions.", bits.get(v));
+            bits.set(v);
+            counter++;
+          }
+          if (counter > 0) {
+            LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter);
+          } else {
+            LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
+          }
+        } finally {
+          reader.close();
+        }
+      }
+      assertEquals("Some keys in no partition.", length, bits.cardinality());
+    }
+  }
+
   private void verifyPartitions(int length, int numSplits, Path file,
       CompressionCodec codec, JobConf conf) throws IOException {