MAPREDUCE-3032. Fixed TaskAttemptImpl so that JobHistory can have error information about failed tasks. Contributed by Devaraj K.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1185247 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9f36bdd731
commit
a26b1672a8
|
@ -1645,6 +1645,9 @@ Release 0.23.0 - Unreleased
|
||||||
MAPREDUCE-3127. Changed default value of yarn.resourcemanager.acl.enable
|
MAPREDUCE-3127. Changed default value of yarn.resourcemanager.acl.enable
|
||||||
to true and added some more documentation. (acmurthy)
|
to true and added some more documentation. (acmurthy)
|
||||||
|
|
||||||
|
MAPREDUCE-3032. Fixed TaskAttemptImpl so that JobHistory can have error
|
||||||
|
information about failed tasks. (Devaraj K via vinodkv)
|
||||||
|
|
||||||
Release 0.22.0 - Unreleased
|
Release 0.22.0 - Unreleased
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -302,8 +302,6 @@ public class TaskAttemptListenerImpl extends CompositeService
|
||||||
taskAttemptStatus.progress = taskStatus.getProgress();
|
taskAttemptStatus.progress = taskStatus.getProgress();
|
||||||
LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : "
|
LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : "
|
||||||
+ taskStatus.getProgress());
|
+ taskStatus.getProgress());
|
||||||
// Task sends the diagnostic information to the TT
|
|
||||||
taskAttemptStatus.diagnosticInfo = taskStatus.getDiagnosticInfo();
|
|
||||||
// Task sends the updated state-string to the TT.
|
// Task sends the updated state-string to the TT.
|
||||||
taskAttemptStatus.stateString = taskStatus.getStateString();
|
taskAttemptStatus.stateString = taskStatus.getStateString();
|
||||||
// Set the output-size when map-task finishes. Set by the task itself.
|
// Set the output-size when map-task finishes. Set by the task itself.
|
||||||
|
|
|
@ -48,7 +48,6 @@ public class TaskAttemptStatusUpdateEvent extends TaskAttemptEvent {
|
||||||
public TaskAttemptId id;
|
public TaskAttemptId id;
|
||||||
public float progress;
|
public float progress;
|
||||||
public Counters counters;
|
public Counters counters;
|
||||||
public String diagnosticInfo;
|
|
||||||
public String stateString;
|
public String stateString;
|
||||||
public Phase phase;
|
public Phase phase;
|
||||||
public long outputSize;
|
public long outputSize;
|
||||||
|
|
|
@ -118,6 +118,8 @@ import org.apache.hadoop.yarn.state.StateMachine;
|
||||||
import org.apache.hadoop.yarn.state.StateMachineFactory;
|
import org.apache.hadoop.yarn.state.StateMachineFactory;
|
||||||
import org.apache.hadoop.yarn.util.ConverterUtils;
|
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||||
import org.apache.hadoop.yarn.util.RackResolver;
|
import org.apache.hadoop.yarn.util.RackResolver;
|
||||||
|
import org.apache.hadoop.util.StringUtils;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of TaskAttempt interface.
|
* Implementation of TaskAttempt interface.
|
||||||
|
@ -435,6 +437,9 @@ public abstract class TaskAttemptImpl implements
|
||||||
//this is the last status reported by the REMOTE running attempt
|
//this is the last status reported by the REMOTE running attempt
|
||||||
private TaskAttemptStatus reportedStatus;
|
private TaskAttemptStatus reportedStatus;
|
||||||
|
|
||||||
|
private static final String LINE_SEPARATOR = System
|
||||||
|
.getProperty("line.separator");
|
||||||
|
|
||||||
public TaskAttemptImpl(TaskId taskId, int i,
|
public TaskAttemptImpl(TaskId taskId, int i,
|
||||||
@SuppressWarnings("rawtypes") EventHandler eventHandler,
|
@SuppressWarnings("rawtypes") EventHandler eventHandler,
|
||||||
TaskAttemptListener taskAttemptListener, Path jobFile, int partition,
|
TaskAttemptListener taskAttemptListener, Path jobFile, int partition,
|
||||||
|
@ -758,7 +763,7 @@ public abstract class TaskAttemptImpl implements
|
||||||
result.setStartTime(launchTime);
|
result.setStartTime(launchTime);
|
||||||
result.setFinishTime(finishTime);
|
result.setFinishTime(finishTime);
|
||||||
result.setShuffleFinishTime(this.reportedStatus.shuffleFinishTime);
|
result.setShuffleFinishTime(this.reportedStatus.shuffleFinishTime);
|
||||||
result.setDiagnosticInfo(reportedStatus.diagnosticInfo);
|
result.setDiagnosticInfo(StringUtils.join(LINE_SEPARATOR, getDiagnostics()));
|
||||||
result.setPhase(reportedStatus.phase);
|
result.setPhase(reportedStatus.phase);
|
||||||
result.setStateString(reportedStatus.stateString);
|
result.setStateString(reportedStatus.stateString);
|
||||||
result.setCounters(getCounters());
|
result.setCounters(getCounters());
|
||||||
|
@ -895,7 +900,7 @@ public abstract class TaskAttemptImpl implements
|
||||||
TypeConverter.fromYarn(taskAttempt.attemptId.getTaskId().getTaskType()),
|
TypeConverter.fromYarn(taskAttempt.attemptId.getTaskId().getTaskType()),
|
||||||
attemptState.toString(), taskAttempt.finishTime,
|
attemptState.toString(), taskAttempt.finishTime,
|
||||||
taskAttempt.nodeHostName == null ? "UNKNOWN" : taskAttempt.nodeHostName,
|
taskAttempt.nodeHostName == null ? "UNKNOWN" : taskAttempt.nodeHostName,
|
||||||
taskAttempt.reportedStatus.diagnosticInfo.toString(),
|
StringUtils.join(LINE_SEPARATOR, taskAttempt.getDiagnostics()),
|
||||||
taskAttempt.getProgressSplitBlock().burst());
|
taskAttempt.getProgressSplitBlock().burst());
|
||||||
return tauce;
|
return tauce;
|
||||||
}
|
}
|
||||||
|
@ -1353,8 +1358,6 @@ public abstract class TaskAttemptImpl implements
|
||||||
(new SpeculatorEvent
|
(new SpeculatorEvent
|
||||||
(taskAttempt.reportedStatus, taskAttempt.clock.getTime()));
|
(taskAttempt.reportedStatus, taskAttempt.clock.getTime()));
|
||||||
|
|
||||||
//add to diagnostic
|
|
||||||
taskAttempt.addDiagnosticInfo(newReportedStatus.diagnosticInfo);
|
|
||||||
taskAttempt.updateProgressSplits();
|
taskAttempt.updateProgressSplits();
|
||||||
|
|
||||||
//if fetch failures are present, send the fetch failure event to job
|
//if fetch failures are present, send the fetch failure event to job
|
||||||
|
@ -1382,7 +1385,6 @@ public abstract class TaskAttemptImpl implements
|
||||||
|
|
||||||
private void initTaskAttemptStatus(TaskAttemptStatus result) {
|
private void initTaskAttemptStatus(TaskAttemptStatus result) {
|
||||||
result.progress = 0.0f;
|
result.progress = 0.0f;
|
||||||
result.diagnosticInfo = "";
|
|
||||||
result.phase = Phase.STARTING;
|
result.phase = Phase.STARTING;
|
||||||
result.stateString = "NEW";
|
result.stateString = "NEW";
|
||||||
result.taskState = TaskAttemptState.NEW;
|
result.taskState = TaskAttemptState.NEW;
|
||||||
|
|
|
@ -334,7 +334,6 @@ public class RecoveryService extends CompositeService implements Recovery {
|
||||||
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
|
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
|
||||||
taskAttemptStatus.id = yarnAttemptID;
|
taskAttemptStatus.id = yarnAttemptID;
|
||||||
taskAttemptStatus.progress = 1.0f;
|
taskAttemptStatus.progress = 1.0f;
|
||||||
taskAttemptStatus.diagnosticInfo = "";
|
|
||||||
taskAttemptStatus.stateString = attemptInfo.getTaskStatus();
|
taskAttemptStatus.stateString = attemptInfo.getTaskStatus();
|
||||||
// taskAttemptStatus.outputSize = attemptInfo.getOutputSize();
|
// taskAttemptStatus.outputSize = attemptInfo.getOutputSize();
|
||||||
taskAttemptStatus.phase = Phase.CLEANUP;
|
taskAttemptStatus.phase = Phase.CLEANUP;
|
||||||
|
|
|
@ -83,7 +83,6 @@ public class TestMRClientService {
|
||||||
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
|
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
|
||||||
taskAttemptStatus.id = attempt.getID();
|
taskAttemptStatus.id = attempt.getID();
|
||||||
taskAttemptStatus.progress = 0.5f;
|
taskAttemptStatus.progress = 0.5f;
|
||||||
taskAttemptStatus.diagnosticInfo = diagnostic2;
|
|
||||||
taskAttemptStatus.stateString = "RUNNING";
|
taskAttemptStatus.stateString = "RUNNING";
|
||||||
taskAttemptStatus.taskState = TaskAttemptState.RUNNING;
|
taskAttemptStatus.taskState = TaskAttemptState.RUNNING;
|
||||||
taskAttemptStatus.phase = Phase.MAP;
|
taskAttemptStatus.phase = Phase.MAP;
|
||||||
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.mapreduce.v2.app.job.impl;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
|
||||||
|
import org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletion;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptReport;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.AppContext;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.MRApp;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.Job;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.Task;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
|
||||||
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestTaskAttempt{
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMRAppHistoryForMap() throws Exception {
|
||||||
|
MRApp app = new FailingAttemptsMRApp(1, 0);
|
||||||
|
testMRAppHistory(app);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMRAppHistoryForReduce() throws Exception {
|
||||||
|
MRApp app = new FailingAttemptsMRApp(0, 1);
|
||||||
|
testMRAppHistory(app);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testMRAppHistory(MRApp app) throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
Job job = app.submit(conf);
|
||||||
|
app.waitForState(job, JobState.FAILED);
|
||||||
|
Map<TaskId, Task> tasks = job.getTasks();
|
||||||
|
|
||||||
|
Assert.assertEquals("Num tasks is not correct", 1, tasks.size());
|
||||||
|
Task task = tasks.values().iterator().next();
|
||||||
|
Assert.assertEquals("Task state not correct", TaskState.FAILED, task
|
||||||
|
.getReport().getTaskState());
|
||||||
|
Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next()
|
||||||
|
.getAttempts();
|
||||||
|
Assert.assertEquals("Num attempts is not correct", 4, attempts.size());
|
||||||
|
|
||||||
|
Iterator<TaskAttempt> it = attempts.values().iterator();
|
||||||
|
TaskAttemptReport report = it.next().getReport();
|
||||||
|
Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED,
|
||||||
|
report.getTaskAttemptState());
|
||||||
|
Assert.assertEquals("Diagnostic Information is not Correct",
|
||||||
|
"Test Diagnostic Event", report.getDiagnosticInfo());
|
||||||
|
report = it.next().getReport();
|
||||||
|
Assert.assertEquals("Attempt state not correct", TaskAttemptState.FAILED,
|
||||||
|
report.getTaskAttemptState());
|
||||||
|
}
|
||||||
|
|
||||||
|
static class FailingAttemptsMRApp extends MRApp {
|
||||||
|
FailingAttemptsMRApp(int maps, int reduces) {
|
||||||
|
super(maps, reduces, true, "FailingAttemptsMRApp", true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void attemptLaunched(TaskAttemptId attemptID) {
|
||||||
|
getContext().getEventHandler().handle(
|
||||||
|
new TaskAttemptDiagnosticsUpdateEvent(attemptID,
|
||||||
|
"Test Diagnostic Event"));
|
||||||
|
getContext().getEventHandler().handle(
|
||||||
|
new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_FAILMSG));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected EventHandler<JobHistoryEvent> createJobHistoryHandler(
|
||||||
|
AppContext context) {
|
||||||
|
return new EventHandler<JobHistoryEvent>() {
|
||||||
|
@Override
|
||||||
|
public void handle(JobHistoryEvent event) {
|
||||||
|
if (event.getType() == org.apache.hadoop.mapreduce.jobhistory.EventType.MAP_ATTEMPT_FAILED) {
|
||||||
|
TaskAttemptUnsuccessfulCompletion datum = (TaskAttemptUnsuccessfulCompletion) event
|
||||||
|
.getHistoryEvent().getDatum();
|
||||||
|
Assert.assertEquals("Diagnostic Information is not Correct",
|
||||||
|
"Test Diagnostic Event", datum.get(6).toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue