MAPREDUCE-5958. Wrong reduce task progress if map output is compressed. Contributed by Emilio Coppa and Jason Lowe.

(cherry picked from commit 8f701ae07a)
Author: Kihwal Lee
Date:   2014-11-06 15:56:50 -06:00
Parent: e333584ca6
Commit: c38c5752e4

3 changed files with 64 additions and 31 deletions

CHANGES.txt

@@ -228,6 +228,9 @@ Release 2.6.0 - UNRELEASED
     MAPREDUCE-5960. JobSubmitter's check whether job.jar is local is incorrect
     with no authority in job jar path. (Gera Shegalov via jlowe)
 
+    MAPREDUCE-5958. Wrong reduce task progress if map output is compressed
+    (Emilio Coppa and jlowe via kihwal)
+
 Release 2.5.2 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Merger.java

@@ -515,9 +515,9 @@ public class Merger {
     }
 
     private void adjustPriorityQueue(Segment<K, V> reader) throws IOException{
-      long startPos = reader.getPosition();
+      long startPos = reader.getReader().bytesRead;
       boolean hasNext = reader.nextRawKey();
-      long endPos = reader.getPosition();
+      long endPos = reader.getReader().bytesRead;
       totalBytesProcessed += endPos - startPos;
       mergeProgress.set(totalBytesProcessed * progPerByte);
       if (hasNext) {
@@ -543,7 +543,7 @@ public class Merger {
         }
       }
       minSegment = top();
-      long startPos = minSegment.getPosition();
+      long startPos = minSegment.getReader().bytesRead;
       key = minSegment.getKey();
       if (!minSegment.inMemory()) {
         //When we load the value from an inmemory segment, we reset
@@ -560,7 +560,7 @@ public class Merger {
       } else {
        minSegment.getValue(value);
       }
-      long endPos = minSegment.getPosition();
+      long endPos = minSegment.getReader().bytesRead;
       totalBytesProcessed += endPos - startPos;
       mergeProgress.set(totalBytesProcessed * progPerByte);
       return true;
@@ -638,9 +638,9 @@ public class Merger {
         // Initialize the segment at the last possible moment;
         // this helps in ensuring we don't use buffers until we need them
         segment.init(readsCounter);
-        long startPos = segment.getPosition();
+        long startPos = segment.getReader().bytesRead;
         boolean hasNext = segment.nextRawKey();
-        long endPos = segment.getReader().bytesRead;
 
         if (hasNext) {
           startBytes += endPos - startPos;
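The substance of the Merger.java change above is a units fix in the progress accounting: the merge sizes itself from the segments' data lengths, so the bytes credited per key (endPos - startPos) must be measured on the same scale. Segment.getPosition() and the reader's bytesRead counter can disagree when the map output is compressed, which is what threw the reduce-phase progress off; the patch switches the accounting to getReader().bytesRead so the numerator and denominator of the progress ratio use the same units. Below is a minimal sketch of that bookkeeping, using hypothetical stand-in types (MergeProgressSketch, FakeSegment) rather than the real Merger/IFile classes:

import java.util.Arrays;
import java.util.List;

// Minimal sketch (not the real Hadoop classes): merge progress is the ratio of
// bytes consumed so far to the total bytes across segments, so both numbers
// must be measured in the same units, compressed input or not.
class MergeProgressSketch {

  // Stand-in for a merge segment; bytesConsumed plays the role that
  // IFile.Reader#bytesRead plays in the patched Merger code.
  static class FakeSegment {
    final long totalBytes;   // size used to size the merge (denominator)
    long bytesConsumed;      // bytes credited as keys are read (numerator)
    FakeSegment(long totalBytes) { this.totalBytes = totalBytes; }
  }

  static float progress(List<FakeSegment> segments) {
    long total = 0, consumed = 0;
    for (FakeSegment s : segments) {
      total += s.totalBytes;
      consumed += s.bytesConsumed;
    }
    return total == 0 ? 1.0f : (float) consumed / total;
  }

  public static void main(String[] args) {
    // Two segments of 3000 bytes each, one 1000-byte key read from each,
    // mirroring the 2/6 starting point asserted in the test below.
    FakeSegment a = new FakeSegment(3000);
    FakeSegment b = new FakeSegment(3000);
    a.bytesConsumed = 1000;
    b.bytesConsumed = 1000;
    System.out.println(progress(Arrays.asList(a, b)));  // prints ~0.333
  }
}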

TestMerger.java

@@ -18,13 +18,12 @@
 package org.apache.hadoop.mapreduce.task.reduce;
 
 import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
-import static org.mockito.Mockito.doAnswer;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.security.PrivilegedAction;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -32,9 +31,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.junit.Assert;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalDirAllocator;
@@ -43,14 +41,15 @@ import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.Counters.Counter;
-import org.apache.hadoop.mapred.IFile.Reader;
 import org.apache.hadoop.mapred.IFile;
+import org.apache.hadoop.mapred.IFile.Reader;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MROutputFiles;
 import org.apache.hadoop.mapred.Merger;
 import org.apache.hadoop.mapred.Merger.Segment;
 import org.apache.hadoop.mapred.RawKeyValueIterator;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapreduce.CryptoUtils;
 import org.apache.hadoop.mapreduce.JobID;
 import org.apache.hadoop.mapreduce.MRConfig;
 import org.apache.hadoop.mapreduce.MRJobConfig;
@@ -58,21 +57,17 @@ import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.mapreduce.TaskID;
 import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.mapreduce.security.TokenCache;
-import org.apache.hadoop.mapreduce.CryptoUtils;
-import org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl;
 import org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.CompressAwarePath;
 import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.Progress;
 import org.apache.hadoop.util.Progressable;
-import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
-
-import com.google.common.collect.Lists;
 
 public class TestMerger {
 
   private Configuration conf;
@@ -254,7 +249,7 @@ public class TestMerger {
     testMergeShouldReturnProperProgress(getUncompressedSegments());
   }
 
-  @SuppressWarnings( { "deprecation", "unchecked" })
+  @SuppressWarnings( { "unchecked" })
   public void testMergeShouldReturnProperProgress(
       List<Segment<Text, Text>> segments) throws IOException {
     Path tmpDir = new Path("localpath");
@@ -267,7 +262,38 @@ public class TestMerger {
     RawKeyValueIterator mergeQueue = Merger.merge(conf, fs, keyClass,
         valueClass, segments, 2, tmpDir, comparator, getReporter(),
         readsCounter, writesCounter, mergePhase);
-    Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), 0.0f);
+    final float epsilon = 0.00001f;
+
+    // Reading 6 keys total, 3 each in 2 segments, so each key read moves the
+    // progress forward 1/6th of the way. Initially the first keys from each
+    // segment have been read as part of the merge setup, so progress = 2/6.
+    Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon);
+
+    // The first next() returns one of the keys already read during merge setup
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon);
+
+    // Subsequent next() calls should read one key and move progress
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(3/6.0f, mergeQueue.getProgress().get(), epsilon);
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon);
+
+    // At this point we've exhausted all of the keys in one segment
+    // so getting the next key will return the already cached key from the
+    // other segment
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon);
+
+    // Subsequent next() calls should read one key and move progress
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(5/6.0f, mergeQueue.getProgress().get(), epsilon);
+    Assert.assertTrue(mergeQueue.next());
+    Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon);
+
+    // Now there should be no more input
+    Assert.assertFalse(mergeQueue.next());
+    Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon);
   }
 
   private Progressable getReporter() {
@@ -281,7 +307,7 @@ public class TestMerger {
 
   private List<Segment<Text, Text>> getUncompressedSegments() throws IOException {
     List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
-    for (int i = 1; i < 1; i++) {
+    for (int i = 0; i < 2; i++) {
       segments.add(getUncompressedSegment(i));
     }
     return segments;
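(Note that the old bound, for (int i = 1; i < 1; i++), never executed, so the original test merged empty segment lists; the corrected bound, here and in getCompressedSegments below, produces the two three-key segments the progress assertions above rely on.)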
@@ -289,44 +315,51 @@ public class TestMerger {
 
   private List<Segment<Text, Text>> getCompressedSegments() throws IOException {
     List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
-    for (int i = 1; i < 1; i++) {
+    for (int i = 0; i < 2; i++) {
       segments.add(getCompressedSegment(i));
     }
     return segments;
   }
 
   private Segment<Text, Text> getUncompressedSegment(int i) throws IOException {
-    return new Segment<Text, Text>(getReader(i), false);
+    return new Segment<Text, Text>(getReader(i, false), false);
   }
 
   private Segment<Text, Text> getCompressedSegment(int i) throws IOException {
-    return new Segment<Text, Text>(getReader(i), false, 3000l);
+    return new Segment<Text, Text>(getReader(i, true), false, 3000l);
   }
 
   @SuppressWarnings("unchecked")
-  private Reader<Text, Text> getReader(int i) throws IOException {
+  private Reader<Text, Text> getReader(int i, boolean isCompressedInput)
+      throws IOException {
     Reader<Text, Text> readerMock = mock(Reader.class);
+    when(readerMock.getLength()).thenReturn(30l);
     when(readerMock.getPosition()).thenReturn(0l).thenReturn(10l).thenReturn(
         20l);
     when(
        readerMock.nextRawKey(any(DataInputBuffer.class)))
-        .thenAnswer(getKeyAnswer("Segment" + i));
+        .thenAnswer(getKeyAnswer("Segment" + i, isCompressedInput));
     doAnswer(getValueAnswer("Segment" + i)).when(readerMock).nextRawValue(
         any(DataInputBuffer.class));
     return readerMock;
   }
 
-  private Answer<?> getKeyAnswer(final String segmentName) {
+  private Answer<?> getKeyAnswer(final String segmentName,
+      final boolean isCompressedInput) {
     return new Answer<Object>() {
       int i = 0;
+      @SuppressWarnings("unchecked")
       public Boolean answer(InvocationOnMock invocation) {
-        Object[] args = invocation.getArguments();
-        DataInputBuffer key = (DataInputBuffer) args[0];
-        if (i++ == 2) {
+        if (i++ == 3) {
          return false;
        }
+        Reader<Text,Text> mock = (Reader<Text,Text>) invocation.getMock();
+        int multiplier = isCompressedInput ? 100 : 1;
+        mock.bytesRead += 10 * multiplier;
+        Object[] args = invocation.getArguments();
+        DataInputBuffer key = (DataInputBuffer) args[0];
         key.reset(("Segment Key " + segmentName + i).getBytes(), 20);
         return true;
       }
@@ -340,9 +373,6 @@ public class TestMerger {
       public Void answer(InvocationOnMock invocation) {
         Object[] args = invocation.getArguments();
         DataInputBuffer key = (DataInputBuffer) args[0];
-        if (i++ == 2) {
-          return null;
-        }
         key.reset(("Segment Value " + segmentName + i).getBytes(), 20);
         return null;
       }
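Taken together, the mock numbers explain the expected progress values: each mocked segment yields three keys, the uncompressed reader reports a length of 30 and advances bytesRead by 10 per key, and the compressed segment is declared with a raw length of 3000 while its reader advances bytesRead by 10 * 100 = 1000 per key. Assuming the merge totals the two segments' declared sizes (60 or 6000), every key read contributes 10/60 = 1000/6000 = 1/6 of the progress, which is why testMergeShouldReturnProperProgress can assert the same 2/6, 3/6, ..., 6/6 sequence for compressed and uncompressed input alike.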