diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml index 19ce44bf412..c524b60bdb4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml @@ -93,6 +93,7 @@ .gitattributes src/test/resources/recordSpanningMultipleSplits.txt src/test/resources/testBOM.txt + src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java index 5e45b49e9ec..c6cbd500f33 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java @@ -57,9 +57,12 @@ import com.google.common.collect.Iterables; *

FileInputFormat is the base class for all file-based * InputFormats. This provides a generic implementation of * {@link #getSplits(JobConf, int)}. - * Subclasses of FileInputFormat can also override the - * {@link #isSplitable(FileSystem, Path)} method to ensure input-files are - * not split-up and are processed as a whole by {@link Mapper}s. + * + * Implementations of FileInputFormat can also override the + * {@link #isSplitable(FileSystem, Path)} method to prevent input files + * from being split-up in certain situations. Implementations that may + * deal with non-splittable files must override this method, since + * the default implementation assumes splitting is always possible. */ @InterfaceAudience.Public @InterfaceStability.Stable @@ -116,9 +119,13 @@ public abstract class FileInputFormat implements InputFormat { } /** - * Is the given filename splitable? Usually, true, but if the file is + * Is the given filename splittable? Usually, true, but if the file is * stream compressed, it will not be. - * + * + * The default implementation in FileInputFormat always returns + * true. Implementations that may deal with non-splittable files must + * override this method. + * * FileInputFormat implementations can override this and return * false to ensure that individual input files are never split-up * so that {@link Mapper}s process entire files. 
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java index ba075e5dfb2..45263c4256c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java @@ -118,6 +118,13 @@ public class LineRecordReader implements RecordReader { end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { + if (start != 0) { + // So we have a split that is only part of a file stored using + // a Compression codec that cannot be split. + throw new IOException("Cannot seek in " + + codec.getClass().getSimpleName() + " compressed stream"); + } + in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter); filePosition = fileIn; diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java index a3ffe019c8f..f5cd5d116e4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java @@ -51,13 +51,16 @@ import com.google.common.collect.Lists; /** * A base class for file-based {@link InputFormat}s. - * + * *

FileInputFormat is the base class for all file-based * InputFormats. This provides a generic implementation of * {@link #getSplits(JobContext)}. - * Subclasses of FileInputFormat can also override the - * {@link #isSplitable(JobContext, Path)} method to ensure input-files are - * not split-up and are processed as a whole by {@link Mapper}s. + * + * Implementations of FileInputFormat can also override the + * {@link #isSplitable(JobContext, Path)} method to prevent input files + * from being split-up in certain situations. Implementations that may + * deal with non-splittable files must override this method, since + * the default implementation assumes splitting is always possible. */ @InterfaceAudience.Public @InterfaceStability.Stable @@ -146,9 +149,13 @@ public abstract class FileInputFormat extends InputFormat { } /** - * Is the given filename splitable? Usually, true, but if the file is + * Is the given filename splittable? Usually, true, but if the file is * stream compressed, it will not be. - * + * + * The default implementation in FileInputFormat always returns + * true. Implementations that may deal with non-splittable files must + * override this method. + * * FileInputFormat implementations can override this and return * false to ensure that individual input files are never split-up * so that {@link Mapper}s process entire files. 
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java index 42e94ad0c35..5af8f43bbec 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java @@ -86,7 +86,7 @@ public class LineRecordReader extends RecordReader { CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null!=codec) { - isCompressedInput = true; + isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = @@ -99,6 +99,13 @@ public class LineRecordReader extends RecordReader { end = cIn.getAdjustedEnd(); filePosition = cIn; } else { + if (start != 0) { + // So we have a split that is only part of a file stored using + // a Compression codec that cannot be split. 
+ throw new IOException("Cannot seek in " + + codec.getClass().getSimpleName() + " compressed stream"); + } + in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); filePosition = fileIn; diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java index 4c94e59ef9c..cbbbeaaa9c1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java @@ -127,6 +127,13 @@ public class TestLineRecordReader { testSplitRecords("blockEndingInCR.txt.bz2", 136494); } + @Test(expected=IOException.class) + public void testSafeguardSplittingUnsplittableFiles() throws IOException { + // The LineRecordReader must fail when trying to read a file that + // was compressed using an unsplittable file format + testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2); + } + // Use the LineRecordReader to read records from the file public ArrayList readRecords(URL testFileUrl, int splitSize) throws IOException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java index 52fdc090abe..8b385a0103e 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java +++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java @@ -110,6 +110,13 @@ public class TestLineRecordReader { testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498); } + @Test(expected=IOException.class) + public void testSafeguardSplittingUnsplittableFiles() throws IOException { + // The LineRecordReader must fail when trying to read a file that + // was compressed using an unsplittable file format + testSplitRecords("TestSafeguardSplittingUnsplittableFiles.txt.gz", 2); + } + // Use the LineRecordReader to read records from the file public ArrayList readRecords(URL testFileUrl, int splitSize) throws IOException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz new file mode 100644 index 00000000000..557db03de99 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/TestSafeguardSplittingUnsplittableFiles.txt.gz @@ -0,0 +1 @@ +Hello World