MAPREDUCE-4751. AM stuck in KILL_WAIT for days (vinodkv via bobby)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1408314 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Joseph Evans 2012-11-12 15:15:52 +00:00
parent 42f4691ead
commit dd72ca3536
5 changed files with 208 additions and 48 deletions

View File

@ -650,6 +650,8 @@ Release 0.23.5 - UNRELEASED
MAPREDUCE-4774. JobImpl does not handle asynchronous task events in FAILED MAPREDUCE-4774. JobImpl does not handle asynchronous task events in FAILED
state (jlowe via bobby) state (jlowe via bobby)
MAPREDUCE-4751. AM stuck in KILL_WAIT for days (vinodkv via bobby)
Release 0.23.4 - UNRELEASED Release 0.23.4 - UNRELEASED

View File

@ -712,7 +712,10 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
* The only entry point to change the Job. * The only entry point to change the Job.
*/ */
public void handle(JobEvent event) { public void handle(JobEvent event) {
LOG.debug("Processing " + event.getJobId() + " of type " + event.getType()); if (LOG.isDebugEnabled()) {
LOG.debug("Processing " + event.getJobId() + " of type "
+ event.getType());
}
try { try {
writeLock.lock(); writeLock.lock();
JobStateInternal oldState = getInternalState(); JobStateInternal oldState = getInternalState();

View File

@ -22,9 +22,11 @@ import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock;
@ -118,9 +120,18 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
protected Credentials credentials; protected Credentials credentials;
protected Token<JobTokenIdentifier> jobToken; protected Token<JobTokenIdentifier> jobToken;
//should be set to one which comes first
//saying COMMIT_PENDING
private TaskAttemptId commitAttempt;
private TaskAttemptId successfulAttempt;
private final Set<TaskAttemptId> failedAttempts;
// Track the finished attempts - successful, failed and killed
private final Set<TaskAttemptId> finishedAttempts;
// counts the number of attempts that are either running or in a state where // counts the number of attempts that are either running or in a state where
// they will come to be running when they get a Container // they will come to be running when they get a Container
private int numberUncompletedAttempts = 0; private final Set<TaskAttemptId> inProgressAttempts;
private boolean historyTaskStartGenerated = false; private boolean historyTaskStartGenerated = false;
@ -182,6 +193,14 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED), EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
TaskEventType.T_ATTEMPT_KILLED, TaskEventType.T_ATTEMPT_KILLED,
new KillWaitAttemptKilledTransition()) new KillWaitAttemptKilledTransition())
.addTransition(TaskStateInternal.KILL_WAIT,
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
TaskEventType.T_ATTEMPT_SUCCEEDED,
new KillWaitAttemptSucceededTransition())
.addTransition(TaskStateInternal.KILL_WAIT,
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
TaskEventType.T_ATTEMPT_FAILED,
new KillWaitAttemptFailedTransition())
// Ignore-able transitions. // Ignore-able transitions.
.addTransition( .addTransition(
TaskStateInternal.KILL_WAIT, TaskStateInternal.KILL_WAIT,
@ -189,8 +208,6 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
EnumSet.of(TaskEventType.T_KILL, EnumSet.of(TaskEventType.T_KILL,
TaskEventType.T_ATTEMPT_LAUNCHED, TaskEventType.T_ATTEMPT_LAUNCHED,
TaskEventType.T_ATTEMPT_COMMIT_PENDING, TaskEventType.T_ATTEMPT_COMMIT_PENDING,
TaskEventType.T_ATTEMPT_FAILED,
TaskEventType.T_ATTEMPT_SUCCEEDED,
TaskEventType.T_ADD_SPEC_ATTEMPT)) TaskEventType.T_ADD_SPEC_ATTEMPT))
// Transitions from SUCCEEDED state // Transitions from SUCCEEDED state
@ -242,15 +259,6 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
private static final RecoverdAttemptsComparator RECOVERED_ATTEMPTS_COMPARATOR = private static final RecoverdAttemptsComparator RECOVERED_ATTEMPTS_COMPARATOR =
new RecoverdAttemptsComparator(); new RecoverdAttemptsComparator();
//should be set to one which comes first
//saying COMMIT_PENDING
private TaskAttemptId commitAttempt;
private TaskAttemptId successfulAttempt;
private int failedAttempts;
private int finishedAttempts;//finish are total of success, failed and killed
@Override @Override
public TaskState getState() { public TaskState getState() {
readLock.lock(); readLock.lock();
@ -275,6 +283,9 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
readLock = readWriteLock.readLock(); readLock = readWriteLock.readLock();
writeLock = readWriteLock.writeLock(); writeLock = readWriteLock.writeLock();
this.attempts = Collections.emptyMap(); this.attempts = Collections.emptyMap();
this.finishedAttempts = new HashSet<TaskAttemptId>(2);
this.failedAttempts = new HashSet<TaskAttemptId>(2);
this.inProgressAttempts = new HashSet<TaskAttemptId>(2);
// This overridable method call is okay in a constructor because we // This overridable method call is okay in a constructor because we
// have a convention that none of the overrides depends on any // have a convention that none of the overrides depends on any
// fields that need initialization. // fields that need initialization.
@ -611,9 +622,9 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId(); taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
} }
++numberUncompletedAttempts; inProgressAttempts.add(attempt.getID());
//schedule the nextAttemptNumber //schedule the nextAttemptNumber
if (failedAttempts > 0) { if (failedAttempts.size() > 0) {
eventHandler.handle(new TaskAttemptEvent(attempt.getID(), eventHandler.handle(new TaskAttemptEvent(attempt.getID(),
TaskAttemptEventType.TA_RESCHEDULE)); TaskAttemptEventType.TA_RESCHEDULE));
} else { } else {
@ -788,12 +799,14 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
implements SingleArcTransition<TaskImpl, TaskEvent> { implements SingleArcTransition<TaskImpl, TaskEvent> {
@Override @Override
public void transition(TaskImpl task, TaskEvent event) { public void transition(TaskImpl task, TaskEvent event) {
TaskTAttemptEvent taskTAttemptEvent = (TaskTAttemptEvent) event;
TaskAttemptId taskAttemptId = taskTAttemptEvent.getTaskAttemptID();
task.handleTaskAttemptCompletion( task.handleTaskAttemptCompletion(
((TaskTAttemptEvent) event).getTaskAttemptID(), taskAttemptId,
TaskAttemptCompletionEventStatus.SUCCEEDED); TaskAttemptCompletionEventStatus.SUCCEEDED);
task.finishedAttempts++; task.finishedAttempts.add(taskAttemptId);
--task.numberUncompletedAttempts; task.inProgressAttempts.remove(taskAttemptId);
task.successfulAttempt = ((TaskTAttemptEvent) event).getTaskAttemptID(); task.successfulAttempt = taskAttemptId;
task.eventHandler.handle(new JobTaskEvent( task.eventHandler.handle(new JobTaskEvent(
task.taskId, TaskState.SUCCEEDED)); task.taskId, TaskState.SUCCEEDED));
LOG.info("Task succeeded with attempt " + task.successfulAttempt); LOG.info("Task succeeded with attempt " + task.successfulAttempt);
@ -824,11 +837,13 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
SingleArcTransition<TaskImpl, TaskEvent> { SingleArcTransition<TaskImpl, TaskEvent> {
@Override @Override
public void transition(TaskImpl task, TaskEvent event) { public void transition(TaskImpl task, TaskEvent event) {
TaskAttemptId taskAttemptId =
((TaskTAttemptEvent) event).getTaskAttemptID();
task.handleTaskAttemptCompletion( task.handleTaskAttemptCompletion(
((TaskTAttemptEvent) event).getTaskAttemptID(), taskAttemptId,
TaskAttemptCompletionEventStatus.KILLED); TaskAttemptCompletionEventStatus.KILLED);
task.finishedAttempts++; task.finishedAttempts.add(taskAttemptId);
--task.numberUncompletedAttempts; task.inProgressAttempts.remove(taskAttemptId);
if (task.successfulAttempt == null) { if (task.successfulAttempt == null) {
task.addAndScheduleAttempt(); task.addAndScheduleAttempt();
} }
@ -840,15 +855,25 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> { MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
protected TaskStateInternal finalState = TaskStateInternal.KILLED; protected TaskStateInternal finalState = TaskStateInternal.KILLED;
protected final TaskAttemptCompletionEventStatus taCompletionEventStatus;
public KillWaitAttemptKilledTransition() {
this(TaskAttemptCompletionEventStatus.KILLED);
}
public KillWaitAttemptKilledTransition(
TaskAttemptCompletionEventStatus taCompletionEventStatus) {
this.taCompletionEventStatus = taCompletionEventStatus;
}
@Override @Override
public TaskStateInternal transition(TaskImpl task, TaskEvent event) { public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
task.handleTaskAttemptCompletion( TaskAttemptId taskAttemptId =
((TaskTAttemptEvent) event).getTaskAttemptID(), ((TaskTAttemptEvent) event).getTaskAttemptID();
TaskAttemptCompletionEventStatus.KILLED); task.handleTaskAttemptCompletion(taskAttemptId, taCompletionEventStatus);
task.finishedAttempts++; task.finishedAttempts.add(taskAttemptId);
// check whether all attempts are finished // check whether all attempts are finished
if (task.finishedAttempts == task.attempts.size()) { if (task.finishedAttempts.size() == task.attempts.size()) {
if (task.historyTaskStartGenerated) { if (task.historyTaskStartGenerated) {
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, null, TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, null,
finalState, null); // TODO JH verify failedAttempt null finalState, null); // TODO JH verify failedAttempt null
@ -867,43 +892,57 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
} }
} }
private static class KillWaitAttemptSucceededTransition extends
KillWaitAttemptKilledTransition {
public KillWaitAttemptSucceededTransition() {
super(TaskAttemptCompletionEventStatus.SUCCEEDED);
}
}
private static class KillWaitAttemptFailedTransition extends
KillWaitAttemptKilledTransition {
public KillWaitAttemptFailedTransition() {
super(TaskAttemptCompletionEventStatus.FAILED);
}
}
private static class AttemptFailedTransition implements private static class AttemptFailedTransition implements
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> { MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
@Override @Override
public TaskStateInternal transition(TaskImpl task, TaskEvent event) { public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
task.failedAttempts++;
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event; TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
if (castEvent.getTaskAttemptID().equals(task.commitAttempt)) { TaskAttemptId taskAttemptId = castEvent.getTaskAttemptID();
task.failedAttempts.add(taskAttemptId);
if (taskAttemptId.equals(task.commitAttempt)) {
task.commitAttempt = null; task.commitAttempt = null;
} }
TaskAttempt attempt = task.attempts.get(castEvent.getTaskAttemptID()); TaskAttempt attempt = task.attempts.get(taskAttemptId);
if (attempt.getAssignedContainerMgrAddress() != null) { if (attempt.getAssignedContainerMgrAddress() != null) {
//container was assigned //container was assigned
task.eventHandler.handle(new ContainerFailedEvent(attempt.getID(), task.eventHandler.handle(new ContainerFailedEvent(attempt.getID(),
attempt.getAssignedContainerMgrAddress())); attempt.getAssignedContainerMgrAddress()));
} }
task.finishedAttempts++; task.finishedAttempts.add(taskAttemptId);
if (task.failedAttempts < task.maxAttempts) { if (task.failedAttempts.size() < task.maxAttempts) {
task.handleTaskAttemptCompletion( task.handleTaskAttemptCompletion(
((TaskTAttemptEvent) event).getTaskAttemptID(), taskAttemptId,
TaskAttemptCompletionEventStatus.FAILED); TaskAttemptCompletionEventStatus.FAILED);
// we don't need a new event if we already have a spare // we don't need a new event if we already have a spare
if (--task.numberUncompletedAttempts == 0 task.inProgressAttempts.remove(taskAttemptId);
if (task.inProgressAttempts.size() == 0
&& task.successfulAttempt == null) { && task.successfulAttempt == null) {
task.addAndScheduleAttempt(); task.addAndScheduleAttempt();
} }
} else { } else {
task.handleTaskAttemptCompletion( task.handleTaskAttemptCompletion(
((TaskTAttemptEvent) event).getTaskAttemptID(), taskAttemptId,
TaskAttemptCompletionEventStatus.TIPFAILED); TaskAttemptCompletionEventStatus.TIPFAILED);
TaskTAttemptEvent ev = (TaskTAttemptEvent) event;
TaskAttemptId taId = ev.getTaskAttemptID();
if (task.historyTaskStartGenerated) { if (task.historyTaskStartGenerated) {
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, attempt.getDiagnostics(), TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, attempt.getDiagnostics(),
TaskStateInternal.FAILED, taId); TaskStateInternal.FAILED, taskAttemptId);
task.eventHandler.handle(new JobHistoryEvent(task.taskId.getJobId(), task.eventHandler.handle(new JobHistoryEvent(task.taskId.getJobId(),
taskFailedEvent)); taskFailedEvent));
} else { } else {
@ -927,14 +966,12 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
@Override @Override
public TaskStateInternal transition(TaskImpl task, TaskEvent event) { public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
if (event instanceof TaskTAttemptEvent) { TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event; if (task.getInternalState() == TaskStateInternal.SUCCEEDED &&
if (task.getInternalState() == TaskStateInternal.SUCCEEDED && !castEvent.getTaskAttemptID().equals(task.successfulAttempt)) {
!castEvent.getTaskAttemptID().equals(task.successfulAttempt)) { // don't allow a different task attempt to override a previous
// don't allow a different task attempt to override a previous // succeeded state
// succeeded state return TaskStateInternal.SUCCEEDED;
return TaskStateInternal.SUCCEEDED;
}
} }
// a successful REDUCE task should not be overridden // a successful REDUCE task should not be overridden
@ -953,7 +990,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
// believe that there's no redundancy. // believe that there's no redundancy.
unSucceed(task); unSucceed(task);
// fake increase in Uncomplete attempts for super.transition // fake increase in Uncomplete attempts for super.transition
++task.numberUncompletedAttempts; task.inProgressAttempts.add(castEvent.getTaskAttemptID());
return super.transition(task, event); return super.transition(task, event);
} }
@ -1045,7 +1082,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
(attempt, "Task KILL is received. Killing attempt!"); (attempt, "Task KILL is received. Killing attempt!");
} }
task.numberUncompletedAttempts = 0; task.inProgressAttempts.clear();
} }
} }

View File

@ -54,6 +54,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal;
import org.apache.hadoop.mapreduce.v2.app.job.Task; import org.apache.hadoop.mapreduce.v2.app.job.Task;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt; import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal; import org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal;
import org.apache.hadoop.mapreduce.v2.app.job.TaskStateInternal;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType; import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
@ -63,6 +64,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl; import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl; import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl;
import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskImpl;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher; import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent; import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator; import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
@ -243,6 +245,39 @@ public class MRApp extends MRAppMaster {
return job; return job;
} }
public void waitForInternalState(JobImpl job,
JobStateInternal finalState) throws Exception {
int timeoutSecs = 0;
JobStateInternal iState = job.getInternalState();
while (!finalState.equals(iState) && timeoutSecs++ < 20) {
System.out.println("Job Internal State is : " + iState
+ " Waiting for Internal state : " + finalState);
Thread.sleep(500);
iState = job.getInternalState();
}
System.out.println("Task Internal State is : " + iState);
Assert.assertEquals("Task Internal state is not correct (timedout)",
finalState, iState);
}
public void waitForInternalState(TaskImpl task,
TaskStateInternal finalState) throws Exception {
int timeoutSecs = 0;
TaskReport report = task.getReport();
TaskStateInternal iState = task.getInternalState();
while (!finalState.equals(iState) && timeoutSecs++ < 20) {
System.out.println("Task Internal State is : " + iState
+ " Waiting for Internal state : " + finalState + " progress : "
+ report.getProgress());
Thread.sleep(500);
report = task.getReport();
iState = task.getInternalState();
}
System.out.println("Task Internal State is : " + iState);
Assert.assertEquals("Task Internal state is not correct (timedout)",
finalState, iState);
}
public void waitForInternalState(TaskAttemptImpl attempt, public void waitForInternalState(TaskAttemptImpl attempt,
TaskAttemptStateInternal finalState) throws Exception { TaskAttemptStateInternal finalState) throws Exception {
int timeoutSecs = 0; int timeoutSecs = 0;

View File

@ -25,12 +25,15 @@ import java.util.concurrent.CountDownLatch;
import junit.framework.Assert; import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.JobState; import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId; import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskState; import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
import org.apache.hadoop.mapreduce.v2.app.job.Job; import org.apache.hadoop.mapreduce.v2.app.job.Job;
import org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal;
import org.apache.hadoop.mapreduce.v2.app.job.Task; import org.apache.hadoop.mapreduce.v2.app.job.Task;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt; import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
@ -39,12 +42,18 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskTAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.Event;
import org.junit.Test; import org.junit.Test;
/** /**
* Tests the state machine with respect to Job/Task/TaskAttempt kill scenarios. * Tests the state machine with respect to Job/Task/TaskAttempt kill scenarios.
* *
*/ */
@SuppressWarnings({"unchecked", "rawtypes"})
public class TestKill { public class TestKill {
@Test @Test
@ -131,6 +140,80 @@ public class TestKill {
iter.next().getReport().getTaskAttemptState()); iter.next().getReport().getTaskAttemptState());
} }
@Test
public void testKillTaskWait() throws Exception {
final Dispatcher dispatcher = new AsyncDispatcher() {
private TaskAttemptEvent cachedKillEvent;
@Override
protected void dispatch(Event event) {
if (event instanceof TaskAttemptEvent) {
TaskAttemptEvent killEvent = (TaskAttemptEvent) event;
if (killEvent.getType() == TaskAttemptEventType.TA_KILL) {
TaskAttemptId taID = killEvent.getTaskAttemptID();
if (taID.getTaskId().getTaskType() == TaskType.REDUCE
&& taID.getTaskId().getId() == 0 && taID.getId() == 0) {
// Task is asking the reduce TA to kill itself. 'Create' a race
// condition. Make the task succeed and then inform the task that
// TA has succeeded. Once Task gets the TA succeeded event at
// KILL_WAIT, then relay the actual kill signal to TA
super.dispatch(new TaskAttemptEvent(taID,
TaskAttemptEventType.TA_DONE));
super.dispatch(new TaskAttemptEvent(taID,
TaskAttemptEventType.TA_CONTAINER_CLEANED));
super.dispatch(new TaskTAttemptEvent(taID,
TaskEventType.T_ATTEMPT_SUCCEEDED));
this.cachedKillEvent = killEvent;
return;
}
}
} else if (event instanceof TaskEvent) {
TaskEvent taskEvent = (TaskEvent) event;
if (taskEvent.getType() == TaskEventType.T_ATTEMPT_SUCCEEDED
&& this.cachedKillEvent != null) {
// When the TA comes and reports that it is done, send the
// cachedKillEvent
super.dispatch(this.cachedKillEvent);
return;
}
}
super.dispatch(event);
}
};
MRApp app = new MRApp(1, 1, false, this.getClass().getName(), true) {
@Override
public Dispatcher createDispatcher() {
return dispatcher;
}
};
Job job = app.submit(new Configuration());
JobId jobId = app.getJobId();
app.waitForState(job, JobState.RUNNING);
Assert.assertEquals("Num tasks not correct", 2, job.getTasks().size());
Iterator<Task> it = job.getTasks().values().iterator();
Task mapTask = it.next();
Task reduceTask = it.next();
app.waitForState(mapTask, TaskState.RUNNING);
app.waitForState(reduceTask, TaskState.RUNNING);
TaskAttempt mapAttempt = mapTask.getAttempts().values().iterator().next();
app.waitForState(mapAttempt, TaskAttemptState.RUNNING);
TaskAttempt reduceAttempt = reduceTask.getAttempts().values().iterator().next();
app.waitForState(reduceAttempt, TaskAttemptState.RUNNING);
// Finish map
app.getContext().getEventHandler().handle(
new TaskAttemptEvent(
mapAttempt.getID(),
TaskAttemptEventType.TA_DONE));
app.waitForState(mapTask, TaskState.SUCCEEDED);
// Now kill the job
app.getContext().getEventHandler()
.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
app.waitForInternalState((JobImpl)job, JobStateInternal.KILLED);
}
@Test @Test
public void testKillTaskAttempt() throws Exception { public void testKillTaskAttempt() throws Exception {
final CountDownLatch latch = new CountDownLatch(1); final CountDownLatch latch = new CountDownLatch(1);