MAPREDUCE-5982. Task attempts that fail from the ASSIGNED state can disappear. Contributed by Chang Li
(cherry picked from commit ee4ee6af6a
)
This commit is contained in:
parent
cadde8c1e5
commit
f27f1ef44c
|
@ -304,6 +304,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
MAPREDUCE-5002. AM could potentially allocate a reduce container to a map
|
MAPREDUCE-5002. AM could potentially allocate a reduce container to a map
|
||||||
attempt (Chang Li via jlowe)
|
attempt (Chang Li via jlowe)
|
||||||
|
|
||||||
|
MAPREDUCE-5982. Task attempts that fail from the ASSIGNED state can
|
||||||
|
disappear (Chang Li via jlowe)
|
||||||
|
|
||||||
Release 2.7.2 - UNRELEASED
|
Release 2.7.2 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -1500,6 +1500,19 @@ public abstract class TaskAttemptImpl implements
|
||||||
return tauce;
|
return tauce;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void
|
||||||
|
sendJHStartEventForAssignedFailTask(TaskAttemptImpl taskAttempt) {
|
||||||
|
TaskAttemptContainerLaunchedEvent event;
|
||||||
|
taskAttempt.launchTime = taskAttempt.clock.getTime();
|
||||||
|
|
||||||
|
InetSocketAddress nodeHttpInetAddr =
|
||||||
|
NetUtils.createSocketAddr(taskAttempt.container.getNodeHttpAddress());
|
||||||
|
taskAttempt.trackerName = nodeHttpInetAddr.getHostName();
|
||||||
|
taskAttempt.httpPort = nodeHttpInetAddr.getPort();
|
||||||
|
taskAttempt.sendLaunchedEvents();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
private void sendLaunchedEvents() {
|
private void sendLaunchedEvents() {
|
||||||
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(attemptId.getTaskId()
|
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(attemptId.getTaskId()
|
||||||
|
@ -1697,6 +1710,9 @@ public abstract class TaskAttemptImpl implements
|
||||||
@Override
|
@Override
|
||||||
public void transition(TaskAttemptImpl taskAttempt,
|
public void transition(TaskAttemptImpl taskAttempt,
|
||||||
TaskAttemptEvent event) {
|
TaskAttemptEvent event) {
|
||||||
|
if (taskAttempt.getLaunchTime() == 0) {
|
||||||
|
sendJHStartEventForAssignedFailTask(taskAttempt);
|
||||||
|
}
|
||||||
//set the finish time
|
//set the finish time
|
||||||
taskAttempt.setFinishTime();
|
taskAttempt.setFinishTime();
|
||||||
|
|
||||||
|
@ -1731,23 +1747,19 @@ public abstract class TaskAttemptImpl implements
|
||||||
default:
|
default:
|
||||||
LOG.error("Task final state is not FAILED or KILLED: " + finalState);
|
LOG.error("Task final state is not FAILED or KILLED: " + finalState);
|
||||||
}
|
}
|
||||||
if (taskAttempt.getLaunchTime() != 0) {
|
|
||||||
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
||||||
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
||||||
finalState);
|
finalState);
|
||||||
if(finalState == TaskAttemptStateInternal.FAILED) {
|
if(finalState == TaskAttemptStateInternal.FAILED) {
|
||||||
taskAttempt.eventHandler
|
taskAttempt.eventHandler
|
||||||
.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
|
.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
|
||||||
} else if(finalState == TaskAttemptStateInternal.KILLED) {
|
} else if(finalState == TaskAttemptStateInternal.KILLED) {
|
||||||
taskAttempt.eventHandler
|
taskAttempt.eventHandler
|
||||||
.handle(createJobCounterUpdateEventTAKilled(taskAttempt, false));
|
.handle(createJobCounterUpdateEventTAKilled(taskAttempt, false));
|
||||||
}
|
|
||||||
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
|
||||||
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
|
||||||
} else {
|
|
||||||
LOG.debug("Not generating HistoryFinish event since start event not " +
|
|
||||||
"generated for taskAttempt: " + taskAttempt.getID());
|
|
||||||
}
|
}
|
||||||
|
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
||||||
|
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2039,27 +2051,25 @@ public abstract class TaskAttemptImpl implements
|
||||||
@Override
|
@Override
|
||||||
public void transition(TaskAttemptImpl taskAttempt,
|
public void transition(TaskAttemptImpl taskAttempt,
|
||||||
TaskAttemptEvent event) {
|
TaskAttemptEvent event) {
|
||||||
|
if (taskAttempt.getLaunchTime() == 0) {
|
||||||
|
sendJHStartEventForAssignedFailTask(taskAttempt);
|
||||||
|
}
|
||||||
//set the finish time
|
//set the finish time
|
||||||
taskAttempt.setFinishTime();
|
taskAttempt.setFinishTime();
|
||||||
if (taskAttempt.getLaunchTime() != 0) {
|
|
||||||
taskAttempt.eventHandler
|
taskAttempt.eventHandler
|
||||||
.handle(createJobCounterUpdateEventTAKilled(taskAttempt, false));
|
.handle(createJobCounterUpdateEventTAKilled(taskAttempt, false));
|
||||||
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
||||||
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
||||||
TaskAttemptStateInternal.KILLED);
|
TaskAttemptStateInternal.KILLED);
|
||||||
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
||||||
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
||||||
}else {
|
|
||||||
LOG.debug("Not generating HistoryFinish event since start event not " +
|
|
||||||
"generated for taskAttempt: " + taskAttempt.getID());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (event instanceof TaskAttemptKillEvent) {
|
if (event instanceof TaskAttemptKillEvent) {
|
||||||
taskAttempt.addDiagnosticInfo(
|
taskAttempt.addDiagnosticInfo(
|
||||||
((TaskAttemptKillEvent) event).getMessage());
|
((TaskAttemptKillEvent) event).getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
// taskAttempt.logAttemptFinishedEvent(TaskAttemptStateInternal.KILLED); Not logging Map/Reduce attempts in case of failure.
|
|
||||||
taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
|
taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
|
||||||
taskAttempt.attemptId,
|
taskAttempt.attemptId,
|
||||||
TaskEventType.T_ATTEMPT_KILLED));
|
TaskEventType.T_ATTEMPT_KILLED));
|
||||||
|
@ -2173,23 +2183,19 @@ public abstract class TaskAttemptImpl implements
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
private static void notifyTaskAttemptFailed(TaskAttemptImpl taskAttempt) {
|
private static void notifyTaskAttemptFailed(TaskAttemptImpl taskAttempt) {
|
||||||
|
if (taskAttempt.getLaunchTime() == 0) {
|
||||||
|
sendJHStartEventForAssignedFailTask(taskAttempt);
|
||||||
|
}
|
||||||
// set the finish time
|
// set the finish time
|
||||||
taskAttempt.setFinishTime();
|
taskAttempt.setFinishTime();
|
||||||
|
taskAttempt.eventHandler
|
||||||
|
.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
|
||||||
|
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
||||||
|
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
||||||
|
TaskAttemptStateInternal.FAILED);
|
||||||
|
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
||||||
|
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
||||||
|
|
||||||
if (taskAttempt.getLaunchTime() != 0) {
|
|
||||||
taskAttempt.eventHandler
|
|
||||||
.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
|
|
||||||
TaskAttemptUnsuccessfulCompletionEvent tauce =
|
|
||||||
createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt,
|
|
||||||
TaskAttemptStateInternal.FAILED);
|
|
||||||
taskAttempt.eventHandler.handle(new JobHistoryEvent(
|
|
||||||
taskAttempt.attemptId.getTaskId().getJobId(), tauce));
|
|
||||||
// taskAttempt.logAttemptFinishedEvent(TaskAttemptStateInternal.FAILED); Not
|
|
||||||
// handling failed map/reduce events.
|
|
||||||
}else {
|
|
||||||
LOG.debug("Not generating HistoryFinish event since start event not " +
|
|
||||||
"generated for taskAttempt: " + taskAttempt.getID());
|
|
||||||
}
|
|
||||||
taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
|
taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
|
||||||
taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED));
|
taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED));
|
||||||
|
|
||||||
|
|
|
@ -542,10 +542,7 @@ public class MRApp extends MRAppMaster {
|
||||||
public void handle(ContainerLauncherEvent event) {
|
public void handle(ContainerLauncherEvent event) {
|
||||||
switch (event.getType()) {
|
switch (event.getType()) {
|
||||||
case CONTAINER_REMOTE_LAUNCH:
|
case CONTAINER_REMOTE_LAUNCH:
|
||||||
getContext().getEventHandler().handle(
|
containerLaunched(event.getTaskAttemptID(), shufflePort);
|
||||||
new TaskAttemptContainerLaunchedEvent(event.getTaskAttemptID(),
|
|
||||||
shufflePort));
|
|
||||||
|
|
||||||
attemptLaunched(event.getTaskAttemptID());
|
attemptLaunched(event.getTaskAttemptID());
|
||||||
break;
|
break;
|
||||||
case CONTAINER_REMOTE_CLEANUP:
|
case CONTAINER_REMOTE_CLEANUP:
|
||||||
|
@ -559,6 +556,12 @@ public class MRApp extends MRAppMaster {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void containerLaunched(TaskAttemptId attemptID, int shufflePort) {
|
||||||
|
getContext().getEventHandler().handle(
|
||||||
|
new TaskAttemptContainerLaunchedEvent(attemptID,
|
||||||
|
shufflePort));
|
||||||
|
}
|
||||||
|
|
||||||
protected void attemptLaunched(TaskAttemptId attemptID) {
|
protected void attemptLaunched(TaskAttemptId attemptID) {
|
||||||
if (autoComplete) {
|
if (autoComplete) {
|
||||||
// send the done event
|
// send the done event
|
||||||
|
|
|
@ -114,6 +114,69 @@ public class TestTaskAttempt{
|
||||||
testMRAppHistory(app);
|
testMRAppHistory(app);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMRAppHistoryForTAFailedInAssigned() throws Exception {
|
||||||
|
// test TA_CONTAINER_LAUNCH_FAILED for map
|
||||||
|
FailingAttemptsDuringAssignedMRApp app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(1, 0,
|
||||||
|
TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_CONTAINER_LAUNCH_FAILED for reduce
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(0, 1,
|
||||||
|
TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_CONTAINER_COMPLETED for map
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(1, 0,
|
||||||
|
TaskAttemptEventType.TA_CONTAINER_COMPLETED);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_CONTAINER_COMPLETED for reduce
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(0, 1,
|
||||||
|
TaskAttemptEventType.TA_CONTAINER_COMPLETED);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_FAILMSG for map
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(1, 0,
|
||||||
|
TaskAttemptEventType.TA_FAILMSG);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_FAILMSG for reduce
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(0, 1,
|
||||||
|
TaskAttemptEventType.TA_FAILMSG);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_FAILMSG_BY_CLIENT for map
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(1, 0,
|
||||||
|
TaskAttemptEventType.TA_FAILMSG_BY_CLIENT);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_FAILMSG_BY_CLIENT for reduce
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(0, 1,
|
||||||
|
TaskAttemptEventType.TA_FAILMSG_BY_CLIENT);
|
||||||
|
testTaskAttemptAssignedFailHistory(app);
|
||||||
|
|
||||||
|
// test TA_KILL for map
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(1, 0,
|
||||||
|
TaskAttemptEventType.TA_KILL);
|
||||||
|
testTaskAttemptAssignedKilledHistory(app);
|
||||||
|
|
||||||
|
// test TA_KILL for reduce
|
||||||
|
app =
|
||||||
|
new FailingAttemptsDuringAssignedMRApp(0, 1,
|
||||||
|
TaskAttemptEventType.TA_KILL);
|
||||||
|
testTaskAttemptAssignedKilledHistory(app);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSingleRackRequest() throws Exception {
|
public void testSingleRackRequest() throws Exception {
|
||||||
TaskAttemptImpl.RequestContainerTransition rct =
|
TaskAttemptImpl.RequestContainerTransition rct =
|
||||||
|
@ -301,6 +364,31 @@ public class TestTaskAttempt{
|
||||||
report.getTaskAttemptState());
|
report.getTaskAttemptState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void testTaskAttemptAssignedFailHistory
|
||||||
|
(FailingAttemptsDuringAssignedMRApp app) throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
Job job = app.submit(conf);
|
||||||
|
app.waitForState(job, JobState.FAILED);
|
||||||
|
Map<TaskId, Task> tasks = job.getTasks();
|
||||||
|
Assert.assertTrue("No Ta Started JH Event", app.getTaStartJHEvent());
|
||||||
|
Assert.assertTrue("No Ta Failed JH Event", app.getTaFailedJHEvent());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testTaskAttemptAssignedKilledHistory
|
||||||
|
(FailingAttemptsDuringAssignedMRApp app) throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
Job job = app.submit(conf);
|
||||||
|
app.waitForState(job, JobState.RUNNING);
|
||||||
|
Map<TaskId, Task> tasks = job.getTasks();
|
||||||
|
Task task = tasks.values().iterator().next();
|
||||||
|
app.waitForState(task, TaskState.SCHEDULED);
|
||||||
|
Map<TaskAttemptId, TaskAttempt> attempts = task.getAttempts();
|
||||||
|
TaskAttempt attempt = attempts.values().iterator().next();
|
||||||
|
app.waitForState(attempt, TaskAttemptState.KILLED);
|
||||||
|
Assert.assertTrue("No Ta Started JH Event", app.getTaStartJHEvent());
|
||||||
|
Assert.assertTrue("No Ta Killed JH Event", app.getTaKilledJHEvent());
|
||||||
|
}
|
||||||
|
|
||||||
static class FailingAttemptsMRApp extends MRApp {
|
static class FailingAttemptsMRApp extends MRApp {
|
||||||
FailingAttemptsMRApp(int maps, int reduces) {
|
FailingAttemptsMRApp(int maps, int reduces) {
|
||||||
super(maps, reduces, true, "FailingAttemptsMRApp", true);
|
super(maps, reduces, true, "FailingAttemptsMRApp", true);
|
||||||
|
@ -331,6 +419,72 @@ public class TestTaskAttempt{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class FailingAttemptsDuringAssignedMRApp extends MRApp {
|
||||||
|
FailingAttemptsDuringAssignedMRApp(int maps, int reduces,
|
||||||
|
TaskAttemptEventType event) {
|
||||||
|
super(maps, reduces, true, "FailingAttemptsMRApp", true);
|
||||||
|
sendFailEvent = event;
|
||||||
|
}
|
||||||
|
|
||||||
|
TaskAttemptEventType sendFailEvent;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void containerLaunched(TaskAttemptId attemptID,
|
||||||
|
int shufflePort) {
|
||||||
|
//do nothing, not send TA_CONTAINER_LAUNCHED event
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void attemptLaunched(TaskAttemptId attemptID) {
|
||||||
|
getContext().getEventHandler().handle(
|
||||||
|
new TaskAttemptEvent(attemptID, sendFailEvent));
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean receiveTaStartJHEvent = false;
|
||||||
|
private boolean receiveTaFailedJHEvent = false;
|
||||||
|
private boolean receiveTaKilledJHEvent = false;
|
||||||
|
|
||||||
|
public boolean getTaStartJHEvent(){
|
||||||
|
return receiveTaStartJHEvent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getTaFailedJHEvent(){
|
||||||
|
return receiveTaFailedJHEvent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getTaKilledJHEvent(){
|
||||||
|
return receiveTaKilledJHEvent;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected EventHandler<JobHistoryEvent> createJobHistoryHandler(
|
||||||
|
AppContext context) {
|
||||||
|
return new EventHandler<JobHistoryEvent>() {
|
||||||
|
@Override
|
||||||
|
public void handle(JobHistoryEvent event) {
|
||||||
|
if (event.getType() == org.apache.hadoop.mapreduce.jobhistory.
|
||||||
|
EventType.MAP_ATTEMPT_FAILED) {
|
||||||
|
receiveTaFailedJHEvent = true;
|
||||||
|
} else if (event.getType() == org.apache.hadoop.mapreduce.
|
||||||
|
jobhistory.EventType.MAP_ATTEMPT_KILLED) {
|
||||||
|
receiveTaKilledJHEvent = true;
|
||||||
|
} else if (event.getType() == org.apache.hadoop.mapreduce.
|
||||||
|
jobhistory.EventType.MAP_ATTEMPT_STARTED) {
|
||||||
|
receiveTaStartJHEvent = true;
|
||||||
|
} else if (event.getType() == org.apache.hadoop.mapreduce.
|
||||||
|
jobhistory.EventType.REDUCE_ATTEMPT_FAILED) {
|
||||||
|
receiveTaFailedJHEvent = true;
|
||||||
|
} else if (event.getType() == org.apache.hadoop.mapreduce.
|
||||||
|
jobhistory.EventType.REDUCE_ATTEMPT_KILLED) {
|
||||||
|
receiveTaKilledJHEvent = true;
|
||||||
|
} else if (event.getType() == org.apache.hadoop.mapreduce.
|
||||||
|
jobhistory.EventType.REDUCE_ATTEMPT_STARTED) {
|
||||||
|
receiveTaStartJHEvent = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLaunchFailedWhileKilling() throws Exception {
|
public void testLaunchFailedWhileKilling() throws Exception {
|
||||||
ApplicationId appId = ApplicationId.newInstance(1, 2);
|
ApplicationId appId = ApplicationId.newInstance(1, 2);
|
||||||
|
|
Loading…
Reference in New Issue