MAPREDUCE-5982. Task attempts that fail from the ASSIGNED state can disappear. Contributed by Chang Li

This commit is contained in:
Jason Lowe 2015-09-18 19:21:35 +00:00
parent 521196874c
commit 7429438c85
4 changed files with 202 additions and 47 deletions

View File

@ -38,6 +38,9 @@ Release 2.7.2 - UNRELEASED
position/key information for uncompressed input sometimes. (Zhihai Xu via position/key information for uncompressed input sometimes. (Zhihai Xu via
jlowe) jlowe)
MAPREDUCE-5982. Task attempts that fail from the ASSIGNED state can
disappear (Chang Li via jlowe)
Release 2.7.1 - 2015-07-06 Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1378,6 +1378,19 @@ public abstract class TaskAttemptImpl implements
return tauce; return tauce;
} }
private static void
sendJHStartEventForAssignedFailTask(TaskAttemptImpl taskAttempt) {
TaskAttemptContainerLaunchedEvent event;
taskAttempt.launchTime = taskAttempt.clock.getTime();
InetSocketAddress nodeHttpInetAddr =
NetUtils.createSocketAddr(taskAttempt.container.getNodeHttpAddress());
taskAttempt.trackerName = nodeHttpInetAddr.getHostName();
taskAttempt.httpPort = nodeHttpInetAddr.getPort();
taskAttempt.sendLaunchedEvents();
}
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private void sendLaunchedEvents() { private void sendLaunchedEvents() {
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(attemptId.getTaskId() JobCounterUpdateEvent jce = new JobCounterUpdateEvent(attemptId.getTaskId()
@ -1566,6 +1579,9 @@ public abstract class TaskAttemptImpl implements
@Override @Override
public void transition(TaskAttemptImpl taskAttempt, public void transition(TaskAttemptImpl taskAttempt,
TaskAttemptEvent event) { TaskAttemptEvent event) {
if (taskAttempt.getLaunchTime() == 0) {
sendJHStartEventForAssignedFailTask(taskAttempt);
}
//set the finish time //set the finish time
taskAttempt.setFinishTime(); taskAttempt.setFinishTime();
@ -1600,7 +1616,7 @@ public abstract class TaskAttemptImpl implements
default: default:
LOG.error("Task final state is not FAILED or KILLED: " + finalState); LOG.error("Task final state is not FAILED or KILLED: " + finalState);
} }
if (taskAttempt.getLaunchTime() != 0) {
TaskAttemptUnsuccessfulCompletionEvent tauce = TaskAttemptUnsuccessfulCompletionEvent tauce =
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt, createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
finalState); finalState);
@ -1613,10 +1629,6 @@ public abstract class TaskAttemptImpl implements
} }
taskAttempt.eventHandler.handle(new JobHistoryEvent( taskAttempt.eventHandler.handle(new JobHistoryEvent(
taskAttempt.attemptId.getTaskId().getJobId(), tauce)); taskAttempt.attemptId.getTaskId().getJobId(), tauce));
} else {
LOG.debug("Not generating HistoryFinish event since start event not " +
"generated for taskAttempt: " + taskAttempt.getID());
}
} }
} }
@ -1708,10 +1720,12 @@ public abstract class TaskAttemptImpl implements
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
@Override @Override
public void transition(TaskAttemptImpl taskAttempt, TaskAttemptEvent event) { public void transition(TaskAttemptImpl taskAttempt, TaskAttemptEvent event) {
if (taskAttempt.getLaunchTime() == 0) {
sendJHStartEventForAssignedFailTask(taskAttempt);
}
// set the finish time // set the finish time
taskAttempt.setFinishTime(); taskAttempt.setFinishTime();
if (taskAttempt.getLaunchTime() != 0) {
taskAttempt.eventHandler taskAttempt.eventHandler
.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false)); .handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
TaskAttemptUnsuccessfulCompletionEvent tauce = TaskAttemptUnsuccessfulCompletionEvent tauce =
@ -1719,12 +1733,7 @@ public abstract class TaskAttemptImpl implements
TaskAttemptStateInternal.FAILED); TaskAttemptStateInternal.FAILED);
taskAttempt.eventHandler.handle(new JobHistoryEvent( taskAttempt.eventHandler.handle(new JobHistoryEvent(
taskAttempt.attemptId.getTaskId().getJobId(), tauce)); taskAttempt.attemptId.getTaskId().getJobId(), tauce));
// taskAttempt.logAttemptFinishedEvent(TaskAttemptStateInternal.FAILED); Not
// handling failed map/reduce events.
}else {
LOG.debug("Not generating HistoryFinish event since start event not " +
"generated for taskAttempt: " + taskAttempt.getID());
}
taskAttempt.eventHandler.handle(new TaskTAttemptEvent( taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED)); taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED));
} }
@ -1861,9 +1870,12 @@ public abstract class TaskAttemptImpl implements
@Override @Override
public void transition(TaskAttemptImpl taskAttempt, public void transition(TaskAttemptImpl taskAttempt,
TaskAttemptEvent event) { TaskAttemptEvent event) {
if (taskAttempt.getLaunchTime() == 0) {
sendJHStartEventForAssignedFailTask(taskAttempt);
}
//set the finish time //set the finish time
taskAttempt.setFinishTime(); taskAttempt.setFinishTime();
if (taskAttempt.getLaunchTime() != 0) {
taskAttempt.eventHandler taskAttempt.eventHandler
.handle(createJobCounterUpdateEventTAKilled(taskAttempt, false)); .handle(createJobCounterUpdateEventTAKilled(taskAttempt, false));
TaskAttemptUnsuccessfulCompletionEvent tauce = TaskAttemptUnsuccessfulCompletionEvent tauce =
@ -1871,17 +1883,12 @@ public abstract class TaskAttemptImpl implements
TaskAttemptStateInternal.KILLED); TaskAttemptStateInternal.KILLED);
taskAttempt.eventHandler.handle(new JobHistoryEvent( taskAttempt.eventHandler.handle(new JobHistoryEvent(
taskAttempt.attemptId.getTaskId().getJobId(), tauce)); taskAttempt.attemptId.getTaskId().getJobId(), tauce));
}else {
LOG.debug("Not generating HistoryFinish event since start event not " +
"generated for taskAttempt: " + taskAttempt.getID());
}
if (event instanceof TaskAttemptKillEvent) { if (event instanceof TaskAttemptKillEvent) {
taskAttempt.addDiagnosticInfo( taskAttempt.addDiagnosticInfo(
((TaskAttemptKillEvent) event).getMessage()); ((TaskAttemptKillEvent) event).getMessage());
} }
// taskAttempt.logAttemptFinishedEvent(TaskAttemptStateInternal.KILLED); Not logging Map/Reduce attempts in case of failure.
taskAttempt.eventHandler.handle(new TaskTAttemptEvent( taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
taskAttempt.attemptId, taskAttempt.attemptId,
TaskEventType.T_ATTEMPT_KILLED)); TaskEventType.T_ATTEMPT_KILLED));

View File

@ -528,10 +528,7 @@ public class MRApp extends MRAppMaster {
public void handle(ContainerLauncherEvent event) { public void handle(ContainerLauncherEvent event) {
switch (event.getType()) { switch (event.getType()) {
case CONTAINER_REMOTE_LAUNCH: case CONTAINER_REMOTE_LAUNCH:
getContext().getEventHandler().handle( containerLaunched(event.getTaskAttemptID(), shufflePort);
new TaskAttemptContainerLaunchedEvent(event.getTaskAttemptID(),
shufflePort));
attemptLaunched(event.getTaskAttemptID()); attemptLaunched(event.getTaskAttemptID());
break; break;
case CONTAINER_REMOTE_CLEANUP: case CONTAINER_REMOTE_CLEANUP:
@ -543,6 +540,12 @@ public class MRApp extends MRAppMaster {
} }
} }
protected void containerLaunched(TaskAttemptId attemptID, int shufflePort) {
getContext().getEventHandler().handle(
new TaskAttemptContainerLaunchedEvent(attemptID,
shufflePort));
}
protected void attemptLaunched(TaskAttemptId attemptID) { protected void attemptLaunched(TaskAttemptId attemptID) {
if (autoComplete) { if (autoComplete) {
// send the done event // send the done event

View File

@ -112,6 +112,57 @@ public class TestTaskAttempt{
testMRAppHistory(app); testMRAppHistory(app);
} }
@Test
public void testMRAppHistoryForTAFailedInAssigned() throws Exception {
// test TA_CONTAINER_LAUNCH_FAILED for map
FailingAttemptsDuringAssignedMRApp app =
new FailingAttemptsDuringAssignedMRApp(1, 0,
TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED);
testTaskAttemptAssignedFailHistory(app);
// test TA_CONTAINER_LAUNCH_FAILED for reduce
app =
new FailingAttemptsDuringAssignedMRApp(0, 1,
TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED);
testTaskAttemptAssignedFailHistory(app);
// test TA_CONTAINER_COMPLETED for map
app =
new FailingAttemptsDuringAssignedMRApp(1, 0,
TaskAttemptEventType.TA_CONTAINER_COMPLETED);
testTaskAttemptAssignedFailHistory(app);
// test TA_CONTAINER_COMPLETED for reduce
app =
new FailingAttemptsDuringAssignedMRApp(0, 1,
TaskAttemptEventType.TA_CONTAINER_COMPLETED);
testTaskAttemptAssignedFailHistory(app);
// test TA_FAILMSG for map
app =
new FailingAttemptsDuringAssignedMRApp(1, 0,
TaskAttemptEventType.TA_FAILMSG);
testTaskAttemptAssignedFailHistory(app);
// test TA_FAILMSG for reduce
app =
new FailingAttemptsDuringAssignedMRApp(0, 1,
TaskAttemptEventType.TA_FAILMSG);
testTaskAttemptAssignedFailHistory(app);
// test TA_KILL for map
app =
new FailingAttemptsDuringAssignedMRApp(1, 0,
TaskAttemptEventType.TA_KILL);
testTaskAttemptAssignedKilledHistory(app);
// test TA_KILL for reduce
app =
new FailingAttemptsDuringAssignedMRApp(0, 1,
TaskAttemptEventType.TA_KILL);
testTaskAttemptAssignedKilledHistory(app);
}
@Test @Test
public void testSingleRackRequest() throws Exception { public void testSingleRackRequest() throws Exception {
TaskAttemptImpl.RequestContainerTransition rct = TaskAttemptImpl.RequestContainerTransition rct =
@ -299,6 +350,31 @@ public class TestTaskAttempt{
report.getTaskAttemptState()); report.getTaskAttemptState());
} }
private void testTaskAttemptAssignedFailHistory
(FailingAttemptsDuringAssignedMRApp app) throws Exception {
Configuration conf = new Configuration();
Job job = app.submit(conf);
app.waitForState(job, JobState.FAILED);
Map<TaskId, Task> tasks = job.getTasks();
Assert.assertTrue("No Ta Started JH Event", app.getTaStartJHEvent());
Assert.assertTrue("No Ta Failed JH Event", app.getTaFailedJHEvent());
}
private void testTaskAttemptAssignedKilledHistory
(FailingAttemptsDuringAssignedMRApp app) throws Exception {
Configuration conf = new Configuration();
Job job = app.submit(conf);
app.waitForState(job, JobState.RUNNING);
Map<TaskId, Task> tasks = job.getTasks();
Task task = tasks.values().iterator().next();
app.waitForState(task, TaskState.SCHEDULED);
Map<TaskAttemptId, TaskAttempt> attempts = task.getAttempts();
TaskAttempt attempt = attempts.values().iterator().next();
app.waitForState(attempt, TaskAttemptState.KILLED);
Assert.assertTrue("No Ta Started JH Event", app.getTaStartJHEvent());
Assert.assertTrue("No Ta Killed JH Event", app.getTaKilledJHEvent());
}
static class FailingAttemptsMRApp extends MRApp { static class FailingAttemptsMRApp extends MRApp {
FailingAttemptsMRApp(int maps, int reduces) { FailingAttemptsMRApp(int maps, int reduces) {
super(maps, reduces, true, "FailingAttemptsMRApp", true); super(maps, reduces, true, "FailingAttemptsMRApp", true);
@ -329,6 +405,72 @@ public class TestTaskAttempt{
} }
} }
static class FailingAttemptsDuringAssignedMRApp extends MRApp {
FailingAttemptsDuringAssignedMRApp(int maps, int reduces,
TaskAttemptEventType event) {
super(maps, reduces, true, "FailingAttemptsMRApp", true);
sendFailEvent = event;
}
TaskAttemptEventType sendFailEvent;
@Override
protected void containerLaunched(TaskAttemptId attemptID,
int shufflePort) {
//do nothing, not send TA_CONTAINER_LAUNCHED event
}
@Override
protected void attemptLaunched(TaskAttemptId attemptID) {
getContext().getEventHandler().handle(
new TaskAttemptEvent(attemptID, sendFailEvent));
}
private boolean receiveTaStartJHEvent = false;
private boolean receiveTaFailedJHEvent = false;
private boolean receiveTaKilledJHEvent = false;
public boolean getTaStartJHEvent(){
return receiveTaStartJHEvent;
}
public boolean getTaFailedJHEvent(){
return receiveTaFailedJHEvent;
}
public boolean getTaKilledJHEvent(){
return receiveTaKilledJHEvent;
}
protected EventHandler<JobHistoryEvent> createJobHistoryHandler(
AppContext context) {
return new EventHandler<JobHistoryEvent>() {
@Override
public void handle(JobHistoryEvent event) {
if (event.getType() == org.apache.hadoop.mapreduce.jobhistory.
EventType.MAP_ATTEMPT_FAILED) {
receiveTaFailedJHEvent = true;
} else if (event.getType() == org.apache.hadoop.mapreduce.
jobhistory.EventType.MAP_ATTEMPT_KILLED) {
receiveTaKilledJHEvent = true;
} else if (event.getType() == org.apache.hadoop.mapreduce.
jobhistory.EventType.MAP_ATTEMPT_STARTED) {
receiveTaStartJHEvent = true;
} else if (event.getType() == org.apache.hadoop.mapreduce.
jobhistory.EventType.REDUCE_ATTEMPT_FAILED) {
receiveTaFailedJHEvent = true;
} else if (event.getType() == org.apache.hadoop.mapreduce.
jobhistory.EventType.REDUCE_ATTEMPT_KILLED) {
receiveTaKilledJHEvent = true;
} else if (event.getType() == org.apache.hadoop.mapreduce.
jobhistory.EventType.REDUCE_ATTEMPT_STARTED) {
receiveTaStartJHEvent = true;
}
}
};
}
}
@Test @Test
public void testLaunchFailedWhileKilling() throws Exception { public void testLaunchFailedWhileKilling() throws Exception {
ApplicationId appId = ApplicationId.newInstance(1, 2); ApplicationId appId = ApplicationId.newInstance(1, 2);