MAPREDUCE-3032. Fixed TaskAttemptImpl so that JobHistory can have error information about failed tasks. Contributed by Devaraj K.

svn merge -c r1185247 --ignore-ancestry ../../trunk/


git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1185250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2011-10-17 15:21:34 +00:00
parent 40c2d55de0
commit 2ce6540fb0
7 changed files with 124 additions and 10 deletions

View File

@ -1599,6 +1599,9 @@ Release 0.23.0 - Unreleased
MAPREDUCE-3127. Changed default value of yarn.resourcemanager.acl.enable
to true and added some more documentation. (acmurthy)
MAPREDUCE-3032. Fixed TaskAttemptImpl so that JobHistory can have error
information about failed tasks. (Devaraj K via vinodkv)
Release 0.22.0 - Unreleased
INCOMPATIBLE CHANGES

View File

@ -302,8 +302,6 @@ public class TaskAttemptListenerImpl extends CompositeService
taskAttemptStatus.progress = taskStatus.getProgress(); taskAttemptStatus.progress = taskStatus.getProgress();
LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : " LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : "
+ taskStatus.getProgress()); + taskStatus.getProgress());
// Task sends the diagnostic information to the TT
taskAttemptStatus.diagnosticInfo = taskStatus.getDiagnosticInfo();
// Task sends the updated state-string to the TT. // Task sends the updated state-string to the TT.
taskAttemptStatus.stateString = taskStatus.getStateString(); taskAttemptStatus.stateString = taskStatus.getStateString();
// Set the output-size when map-task finishes. Set by the task itself. // Set the output-size when map-task finishes. Set by the task itself.

View File

@ -48,7 +48,6 @@ public class TaskAttemptStatusUpdateEvent extends TaskAttemptEvent {
public TaskAttemptId id; public TaskAttemptId id;
public float progress; public float progress;
public Counters counters; public Counters counters;
public String diagnosticInfo;
public String stateString; public String stateString;
public Phase phase; public Phase phase;
public long outputSize; public long outputSize;

View File

@ -118,6 +118,8 @@ import org.apache.hadoop.yarn.state.StateMachine;
import org.apache.hadoop.yarn.state.StateMachineFactory; import org.apache.hadoop.yarn.state.StateMachineFactory;
import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.RackResolver; import org.apache.hadoop.yarn.util.RackResolver;
import org.apache.hadoop.util.StringUtils;
/** /**
* Implementation of TaskAttempt interface. * Implementation of TaskAttempt interface.
@ -435,6 +437,9 @@ public abstract class TaskAttemptImpl implements
//this is the last status reported by the REMOTE running attempt //this is the last status reported by the REMOTE running attempt
private TaskAttemptStatus reportedStatus; private TaskAttemptStatus reportedStatus;
private static final String LINE_SEPARATOR = System
.getProperty("line.separator");
public TaskAttemptImpl(TaskId taskId, int i, public TaskAttemptImpl(TaskId taskId, int i,
@SuppressWarnings("rawtypes") EventHandler eventHandler, @SuppressWarnings("rawtypes") EventHandler eventHandler,
TaskAttemptListener taskAttemptListener, Path jobFile, int partition, TaskAttemptListener taskAttemptListener, Path jobFile, int partition,
@ -758,7 +763,7 @@ public abstract class TaskAttemptImpl implements
result.setStartTime(launchTime); result.setStartTime(launchTime);
result.setFinishTime(finishTime); result.setFinishTime(finishTime);
result.setShuffleFinishTime(this.reportedStatus.shuffleFinishTime); result.setShuffleFinishTime(this.reportedStatus.shuffleFinishTime);
result.setDiagnosticInfo(reportedStatus.diagnosticInfo); result.setDiagnosticInfo(StringUtils.join(LINE_SEPARATOR, getDiagnostics()));
result.setPhase(reportedStatus.phase); result.setPhase(reportedStatus.phase);
result.setStateString(reportedStatus.stateString); result.setStateString(reportedStatus.stateString);
result.setCounters(getCounters()); result.setCounters(getCounters());
@ -895,7 +900,7 @@ public abstract class TaskAttemptImpl implements
TypeConverter.fromYarn(taskAttempt.attemptId.getTaskId().getTaskType()), TypeConverter.fromYarn(taskAttempt.attemptId.getTaskId().getTaskType()),
attemptState.toString(), taskAttempt.finishTime, attemptState.toString(), taskAttempt.finishTime,
taskAttempt.nodeHostName == null ? "UNKNOWN" : taskAttempt.nodeHostName, taskAttempt.nodeHostName == null ? "UNKNOWN" : taskAttempt.nodeHostName,
taskAttempt.reportedStatus.diagnosticInfo.toString(), StringUtils.join(LINE_SEPARATOR, taskAttempt.getDiagnostics()),
taskAttempt.getProgressSplitBlock().burst()); taskAttempt.getProgressSplitBlock().burst());
return tauce; return tauce;
} }
@ -1353,8 +1358,6 @@ public abstract class TaskAttemptImpl implements
(new SpeculatorEvent (new SpeculatorEvent
(taskAttempt.reportedStatus, taskAttempt.clock.getTime())); (taskAttempt.reportedStatus, taskAttempt.clock.getTime()));
//add to diagnostic
taskAttempt.addDiagnosticInfo(newReportedStatus.diagnosticInfo);
taskAttempt.updateProgressSplits(); taskAttempt.updateProgressSplits();
//if fetch failures are present, send the fetch failure event to job //if fetch failures are present, send the fetch failure event to job
@ -1382,7 +1385,6 @@ public abstract class TaskAttemptImpl implements
private void initTaskAttemptStatus(TaskAttemptStatus result) { private void initTaskAttemptStatus(TaskAttemptStatus result) {
result.progress = 0.0f; result.progress = 0.0f;
result.diagnosticInfo = "";
result.phase = Phase.STARTING; result.phase = Phase.STARTING;
result.stateString = "NEW"; result.stateString = "NEW";
result.taskState = TaskAttemptState.NEW; result.taskState = TaskAttemptState.NEW;

View File

@ -334,7 +334,6 @@ public class RecoveryService extends CompositeService implements Recovery {
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus(); TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
taskAttemptStatus.id = yarnAttemptID; taskAttemptStatus.id = yarnAttemptID;
taskAttemptStatus.progress = 1.0f; taskAttemptStatus.progress = 1.0f;
taskAttemptStatus.diagnosticInfo = "";
taskAttemptStatus.stateString = attemptInfo.getTaskStatus(); taskAttemptStatus.stateString = attemptInfo.getTaskStatus();
// taskAttemptStatus.outputSize = attemptInfo.getOutputSize(); // taskAttemptStatus.outputSize = attemptInfo.getOutputSize();
taskAttemptStatus.phase = Phase.CLEANUP; taskAttemptStatus.phase = Phase.CLEANUP;

View File

@ -83,7 +83,6 @@ public class TestMRClientService {
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus(); TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
taskAttemptStatus.id = attempt.getID(); taskAttemptStatus.id = attempt.getID();
taskAttemptStatus.progress = 0.5f; taskAttemptStatus.progress = 0.5f;
taskAttemptStatus.diagnosticInfo = diagnostic2;
taskAttemptStatus.stateString = "RUNNING"; taskAttemptStatus.stateString = "RUNNING";
taskAttemptStatus.taskState = TaskAttemptState.RUNNING; taskAttemptStatus.taskState = TaskAttemptState.RUNNING;
taskAttemptStatus.phase = Phase.MAP; taskAttemptStatus.phase = Phase.MAP;

View File

@ -0,0 +1,114 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.app.job.impl;
import java.util.Iterator;
import java.util.Map;
import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletion;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptReport;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.MRApp;
import org.apache.hadoop.mapreduce.v2.app.job.Job;
import org.apache.hadoop.mapreduce.v2.app.job.Task;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.yarn.event.EventHandler;
import org.junit.Test;
/**
 * Verifies that diagnostic information attached to a failing task attempt is
 * visible both in the attempt's report and in the job-history event emitted
 * for the failed attempt.
 */
public class TestTaskAttempt {

  /** Diagnostic text that {@link FailingAttemptsMRApp} sends for every attempt. */
  private static final String TEST_DIAGNOSTIC = "Test Diagnostic Event";

  /**
   * Position of the field read from the TaskAttemptUnsuccessfulCompletion
   * record when checking the history event.
   * NOTE(review): index 6 is assumed to be the error/diagnostics field of the
   * record -- confirm against the record's schema if it changes.
   */
  private static final int HISTORY_ERROR_FIELD_INDEX = 6;

  /** Runs the history check against an app with a single map task. */
  @Test
  public void testMRAppHistoryForMap() throws Exception {
    MRApp app = new FailingAttemptsMRApp(1, 0);
    testMRAppHistory(app);
  }

  /** Runs the history check against an app with a single reduce task. */
  @Test
  public void testMRAppHistoryForReduce() throws Exception {
    MRApp app = new FailingAttemptsMRApp(0, 1);
    testMRAppHistory(app);
  }

  /**
   * Submits a job on the given app, waits for it to reach the FAILED state and
   * checks the task, its attempts and the first attempt's diagnostics.
   *
   * @param app the application under test; expected to run exactly one task
   * @throws Exception if submitting the job or waiting for its state fails
   */
  private void testMRAppHistory(MRApp app) throws Exception {
    Configuration conf = new Configuration();
    Job job = app.submit(conf);
    app.waitForState(job, JobState.FAILED);

    Map<TaskId, Task> tasks = job.getTasks();
    Assert.assertEquals("Num tasks is not correct", 1, tasks.size());

    Task task = tasks.values().iterator().next();
    Assert.assertEquals("Task state not correct", TaskState.FAILED, task
        .getReport().getTaskState());

    // Reuse the single task fetched above rather than looking it up again.
    Map<TaskAttemptId, TaskAttempt> attempts = task.getAttempts();
    Assert.assertEquals("Num attempts is not correct", 4, attempts.size());

    Iterator<TaskAttempt> it = attempts.values().iterator();
    TaskAttemptReport report = it.next().getReport();
    Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED,
        report.getTaskAttemptState());
    Assert.assertEquals("Diagnostic Information is not Correct",
        TEST_DIAGNOSTIC, report.getDiagnosticInfo());

    report = it.next().getReport();
    Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED,
        report.getTaskAttemptState());
  }

  /**
   * An {@link MRApp} whose attempts are failed as soon as they are launched,
   * after first being handed a known diagnostic message.
   */
  static class FailingAttemptsMRApp extends MRApp {

    /**
     * @param maps number of map tasks to run
     * @param reduces number of reduce tasks to run
     */
    FailingAttemptsMRApp(int maps, int reduces) {
      // NOTE(review): the meaning of the two boolean flags is defined by
      // MRApp's constructor, which is not visible here.
      super(maps, reduces, true, "FailingAttemptsMRApp", true);
    }

    /**
     * Sends a diagnostics update carrying the test message for the launched
     * attempt and then a TA_FAILMSG event for the same attempt.
     */
    @Override
    protected void attemptLaunched(TaskAttemptId attemptID) {
      getContext().getEventHandler().handle(
          new TaskAttemptDiagnosticsUpdateEvent(attemptID, TEST_DIAGNOSTIC));
      getContext().getEventHandler().handle(
          new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_FAILMSG));
    }

    /**
     * Supplies a job-history handler that checks the diagnostics recorded in
     * attempt-failure history events. Both map and reduce failures are
     * inspected so that the reduce variant of the test is covered too.
     */
    @Override
    protected EventHandler<JobHistoryEvent> createJobHistoryHandler(
        AppContext context) {
      return new EventHandler<JobHistoryEvent>() {
        @Override
        public void handle(JobHistoryEvent event) {
          org.apache.hadoop.mapreduce.jobhistory.EventType type =
              event.getType();
          boolean attemptFailed =
              type == org.apache.hadoop.mapreduce.jobhistory.EventType.MAP_ATTEMPT_FAILED
              || type == org.apache.hadoop.mapreduce.jobhistory.EventType.REDUCE_ATTEMPT_FAILED;
          if (attemptFailed) {
            TaskAttemptUnsuccessfulCompletion datum =
                (TaskAttemptUnsuccessfulCompletion) event.getHistoryEvent()
                    .getDatum();
            Assert.assertEquals("Diagnostic Information is not Correct",
                TEST_DIAGNOSTIC,
                datum.get(HISTORY_ERROR_FIELD_INDEX).toString());
          }
        }
      };
    }
  }
}