diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
index 19ce44bf412..c524b60bdb4 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml
@@ -93,6 +93,7 @@
.gitattributes
src/test/resources/recordSpanningMultipleSplits.txt
src/test/resources/testBOM.txt
+ src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
index 5e45b49e9ec..c6cbd500f33 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java
@@ -57,9 +57,12 @@ import com.google.common.collect.Iterables;
*
FileInputFormat
is the base class for all file-based
* InputFormat
s. This provides a generic implementation of
* {@link #getSplits(JobConf, int)}.
- * Subclasses of FileInputFormat
can also override the
- * {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
- * not split-up and are processed as a whole by {@link Mapper}s.
+ *
+ * Implementations of FileInputFormat
can also override the
+ * {@link #isSplitable(FileSystem, Path)} method to prevent input files
+ * from being split up in certain situations. Implementations that may
+ * deal with non-splittable files must override this method, since
+ * the default implementation assumes splitting is always possible.
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
@@ -116,9 +119,13 @@ public abstract class FileInputFormat implements InputFormat {
}
/**
- * Is the given filename splitable? Usually, true, but if the file is
+ * Is the given filename splittable? Usually, true, but if the file is
* stream compressed, it will not be.
- *
+ *
+ * The default implementation in FileInputFormat
always returns
+ * true. Implementations that may deal with non-splittable files must
+ * override this method.
+ *
* FileInputFormat
implementations can override this and return
* false
to ensure that individual input files are never split-up
* so that {@link Mapper}s process entire files.
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java
index ba075e5dfb2..45263c4256c 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java
@@ -118,6 +118,13 @@ public class LineRecordReader implements RecordReader {
end = cIn.getAdjustedEnd();
filePosition = cIn; // take pos from compressed stream
} else {
+ if (start != 0) {
+ // So we have a split that is only part of a file stored using
+ // a compression codec that cannot be split.
+ throw new IOException("Cannot seek in " +
+ codec.getClass().getSimpleName() + " compressed stream");
+ }
+
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, recordDelimiter);
filePosition = fileIn;
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java
index a3ffe019c8f..f5cd5d116e4 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java
@@ -51,13 +51,16 @@ import com.google.common.collect.Lists;
/**
* A base class for file-based {@link InputFormat}s.
- *
+ *
* FileInputFormat
is the base class for all file-based
* InputFormat
s. This provides a generic implementation of
* {@link #getSplits(JobContext)}.
- * Subclasses of FileInputFormat
can also override the
- * {@link #isSplitable(JobContext, Path)} method to ensure input-files are
- * not split-up and are processed as a whole by {@link Mapper}s.
+ *
+ * Implementations of FileInputFormat
can also override the
+ * {@link #isSplitable(JobContext, Path)} method to prevent input files
+ * from being split up in certain situations. Implementations that may
+ * deal with non-splittable files must override this method, since
+ * the default implementation assumes splitting is always possible.
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
@@ -146,9 +149,13 @@ public abstract class FileInputFormat extends InputFormat {
}
/**
- * Is the given filename splitable? Usually, true, but if the file is
+ * Is the given filename splittable? Usually, true, but if the file is
* stream compressed, it will not be.
- *
+ *
+ * The default implementation in FileInputFormat
always returns
+ * true. Implementations that may deal with non-splittable files must
+ * override this method.
+ *
* FileInputFormat
implementations can override this and return
* false
to ensure that individual input files are never split-up
* so that {@link Mapper}s process entire files.
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java
index 42e94ad0c35..5af8f43bbec 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java
@@ -86,7 +86,7 @@ public class LineRecordReader extends RecordReader {
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
if (null!=codec) {
- isCompressedInput = true;
+ isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
final SplitCompressionInputStream cIn =
@@ -99,6 +99,13 @@ public class LineRecordReader extends RecordReader {
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else {
+ if (start != 0) {
+ // So we have a split that is only part of a file stored using
+ // a compression codec that cannot be split.
+ throw new IOException("Cannot seek in " +
+ codec.getClass().getSimpleName() + " compressed stream");
+ }
+
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
filePosition = fileIn;
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
index 4c94e59ef9c..cbbbeaaa9c1 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
@@ -127,6 +127,13 @@ public class TestLineRecordReader {
testSplitRecords("blockEndingInCR.txt.bz2", 136494);
}
+ @Test(expected=IOException.class)
+ public void testSafeguardSplittingUnsplittableFiles() throws IOException {
+ // The LineRecordReader must fail when trying to read a file that
+ // was compressed using an unsplittable file format
+ testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2);
+ }
+
// Use the LineRecordReader to read records from the file
public ArrayList readRecords(URL testFileUrl, int splitSize)
throws IOException {
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java
index 52fdc090abe..8b385a0103e 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java
@@ -110,6 +110,13 @@ public class TestLineRecordReader {
testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
}
+ @Test(expected=IOException.class)
+ public void testSafeguardSplittingUnsplittableFiles() throws IOException {
+ // The LineRecordReader must fail when trying to read a file that
+ // was compressed using an unsplittable file format
+ testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2);
+ }
+
// Use the LineRecordReader to read records from the file
public ArrayList readRecords(URL testFileUrl, int splitSize)
throws IOException {
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz
new file mode 100644
index 00000000000..557db03de99
--- /dev/null
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz
@@ -0,0 +1 @@
+Hello World