MAPREDUCE-4751. AM stuck in KILL_WAIT for days (vinodkv via bobby)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1408314 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
42f4691ead
commit
dd72ca3536
|
@ -650,6 +650,8 @@ Release 0.23.5 - UNRELEASED
|
||||||
|
|
||||||
MAPREDUCE-4774. JobImpl does not handle asynchronous task events in FAILED
|
MAPREDUCE-4774. JobImpl does not handle asynchronous task events in FAILED
|
||||||
state (jlowe via bobby)
|
state (jlowe via bobby)
|
||||||
|
|
||||||
|
MAPREDUCE-4751. AM stuck in KILL_WAIT for days (vinodkv via bobby)
|
||||||
|
|
||||||
Release 0.23.4 - UNRELEASED
|
Release 0.23.4 - UNRELEASED
|
||||||
|
|
||||||
|
|
|
@ -712,7 +712,10 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
|
||||||
* The only entry point to change the Job.
|
* The only entry point to change the Job.
|
||||||
*/
|
*/
|
||||||
public void handle(JobEvent event) {
|
public void handle(JobEvent event) {
|
||||||
LOG.debug("Processing " + event.getJobId() + " of type " + event.getType());
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("Processing " + event.getJobId() + " of type "
|
||||||
|
+ event.getType());
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
writeLock.lock();
|
writeLock.lock();
|
||||||
JobStateInternal oldState = getInternalState();
|
JobStateInternal oldState = getInternalState();
|
||||||
|
|
|
@ -22,9 +22,11 @@ import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
import java.util.concurrent.locks.ReadWriteLock;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
@ -118,9 +120,18 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
protected Credentials credentials;
|
protected Credentials credentials;
|
||||||
protected Token<JobTokenIdentifier> jobToken;
|
protected Token<JobTokenIdentifier> jobToken;
|
||||||
|
|
||||||
|
//should be set to one which comes first
|
||||||
|
//saying COMMIT_PENDING
|
||||||
|
private TaskAttemptId commitAttempt;
|
||||||
|
|
||||||
|
private TaskAttemptId successfulAttempt;
|
||||||
|
|
||||||
|
private final Set<TaskAttemptId> failedAttempts;
|
||||||
|
// Track the finished attempts - successful, failed and killed
|
||||||
|
private final Set<TaskAttemptId> finishedAttempts;
|
||||||
// counts the number of attempts that are either running or in a state where
|
// counts the number of attempts that are either running or in a state where
|
||||||
// they will come to be running when they get a Container
|
// they will come to be running when they get a Container
|
||||||
private int numberUncompletedAttempts = 0;
|
private final Set<TaskAttemptId> inProgressAttempts;
|
||||||
|
|
||||||
private boolean historyTaskStartGenerated = false;
|
private boolean historyTaskStartGenerated = false;
|
||||||
|
|
||||||
|
@ -182,6 +193,14 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
|
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
|
||||||
TaskEventType.T_ATTEMPT_KILLED,
|
TaskEventType.T_ATTEMPT_KILLED,
|
||||||
new KillWaitAttemptKilledTransition())
|
new KillWaitAttemptKilledTransition())
|
||||||
|
.addTransition(TaskStateInternal.KILL_WAIT,
|
||||||
|
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
|
||||||
|
TaskEventType.T_ATTEMPT_SUCCEEDED,
|
||||||
|
new KillWaitAttemptSucceededTransition())
|
||||||
|
.addTransition(TaskStateInternal.KILL_WAIT,
|
||||||
|
EnumSet.of(TaskStateInternal.KILL_WAIT, TaskStateInternal.KILLED),
|
||||||
|
TaskEventType.T_ATTEMPT_FAILED,
|
||||||
|
new KillWaitAttemptFailedTransition())
|
||||||
// Ignore-able transitions.
|
// Ignore-able transitions.
|
||||||
.addTransition(
|
.addTransition(
|
||||||
TaskStateInternal.KILL_WAIT,
|
TaskStateInternal.KILL_WAIT,
|
||||||
|
@ -189,8 +208,6 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
EnumSet.of(TaskEventType.T_KILL,
|
EnumSet.of(TaskEventType.T_KILL,
|
||||||
TaskEventType.T_ATTEMPT_LAUNCHED,
|
TaskEventType.T_ATTEMPT_LAUNCHED,
|
||||||
TaskEventType.T_ATTEMPT_COMMIT_PENDING,
|
TaskEventType.T_ATTEMPT_COMMIT_PENDING,
|
||||||
TaskEventType.T_ATTEMPT_FAILED,
|
|
||||||
TaskEventType.T_ATTEMPT_SUCCEEDED,
|
|
||||||
TaskEventType.T_ADD_SPEC_ATTEMPT))
|
TaskEventType.T_ADD_SPEC_ATTEMPT))
|
||||||
|
|
||||||
// Transitions from SUCCEEDED state
|
// Transitions from SUCCEEDED state
|
||||||
|
@ -242,15 +259,6 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
private static final RecoverdAttemptsComparator RECOVERED_ATTEMPTS_COMPARATOR =
|
private static final RecoverdAttemptsComparator RECOVERED_ATTEMPTS_COMPARATOR =
|
||||||
new RecoverdAttemptsComparator();
|
new RecoverdAttemptsComparator();
|
||||||
|
|
||||||
//should be set to one which comes first
|
|
||||||
//saying COMMIT_PENDING
|
|
||||||
private TaskAttemptId commitAttempt;
|
|
||||||
|
|
||||||
private TaskAttemptId successfulAttempt;
|
|
||||||
|
|
||||||
private int failedAttempts;
|
|
||||||
private int finishedAttempts;//finish are total of success, failed and killed
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TaskState getState() {
|
public TaskState getState() {
|
||||||
readLock.lock();
|
readLock.lock();
|
||||||
|
@ -275,6 +283,9 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
readLock = readWriteLock.readLock();
|
readLock = readWriteLock.readLock();
|
||||||
writeLock = readWriteLock.writeLock();
|
writeLock = readWriteLock.writeLock();
|
||||||
this.attempts = Collections.emptyMap();
|
this.attempts = Collections.emptyMap();
|
||||||
|
this.finishedAttempts = new HashSet<TaskAttemptId>(2);
|
||||||
|
this.failedAttempts = new HashSet<TaskAttemptId>(2);
|
||||||
|
this.inProgressAttempts = new HashSet<TaskAttemptId>(2);
|
||||||
// This overridable method call is okay in a constructor because we
|
// This overridable method call is okay in a constructor because we
|
||||||
// have a convention that none of the overrides depends on any
|
// have a convention that none of the overrides depends on any
|
||||||
// fields that need initialization.
|
// fields that need initialization.
|
||||||
|
@ -611,9 +622,9 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
|
taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
++numberUncompletedAttempts;
|
inProgressAttempts.add(attempt.getID());
|
||||||
//schedule the nextAttemptNumber
|
//schedule the nextAttemptNumber
|
||||||
if (failedAttempts > 0) {
|
if (failedAttempts.size() > 0) {
|
||||||
eventHandler.handle(new TaskAttemptEvent(attempt.getID(),
|
eventHandler.handle(new TaskAttemptEvent(attempt.getID(),
|
||||||
TaskAttemptEventType.TA_RESCHEDULE));
|
TaskAttemptEventType.TA_RESCHEDULE));
|
||||||
} else {
|
} else {
|
||||||
|
@ -788,12 +799,14 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
implements SingleArcTransition<TaskImpl, TaskEvent> {
|
implements SingleArcTransition<TaskImpl, TaskEvent> {
|
||||||
@Override
|
@Override
|
||||||
public void transition(TaskImpl task, TaskEvent event) {
|
public void transition(TaskImpl task, TaskEvent event) {
|
||||||
|
TaskTAttemptEvent taskTAttemptEvent = (TaskTAttemptEvent) event;
|
||||||
|
TaskAttemptId taskAttemptId = taskTAttemptEvent.getTaskAttemptID();
|
||||||
task.handleTaskAttemptCompletion(
|
task.handleTaskAttemptCompletion(
|
||||||
((TaskTAttemptEvent) event).getTaskAttemptID(),
|
taskAttemptId,
|
||||||
TaskAttemptCompletionEventStatus.SUCCEEDED);
|
TaskAttemptCompletionEventStatus.SUCCEEDED);
|
||||||
task.finishedAttempts++;
|
task.finishedAttempts.add(taskAttemptId);
|
||||||
--task.numberUncompletedAttempts;
|
task.inProgressAttempts.remove(taskAttemptId);
|
||||||
task.successfulAttempt = ((TaskTAttemptEvent) event).getTaskAttemptID();
|
task.successfulAttempt = taskAttemptId;
|
||||||
task.eventHandler.handle(new JobTaskEvent(
|
task.eventHandler.handle(new JobTaskEvent(
|
||||||
task.taskId, TaskState.SUCCEEDED));
|
task.taskId, TaskState.SUCCEEDED));
|
||||||
LOG.info("Task succeeded with attempt " + task.successfulAttempt);
|
LOG.info("Task succeeded with attempt " + task.successfulAttempt);
|
||||||
|
@ -824,11 +837,13 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
SingleArcTransition<TaskImpl, TaskEvent> {
|
SingleArcTransition<TaskImpl, TaskEvent> {
|
||||||
@Override
|
@Override
|
||||||
public void transition(TaskImpl task, TaskEvent event) {
|
public void transition(TaskImpl task, TaskEvent event) {
|
||||||
|
TaskAttemptId taskAttemptId =
|
||||||
|
((TaskTAttemptEvent) event).getTaskAttemptID();
|
||||||
task.handleTaskAttemptCompletion(
|
task.handleTaskAttemptCompletion(
|
||||||
((TaskTAttemptEvent) event).getTaskAttemptID(),
|
taskAttemptId,
|
||||||
TaskAttemptCompletionEventStatus.KILLED);
|
TaskAttemptCompletionEventStatus.KILLED);
|
||||||
task.finishedAttempts++;
|
task.finishedAttempts.add(taskAttemptId);
|
||||||
--task.numberUncompletedAttempts;
|
task.inProgressAttempts.remove(taskAttemptId);
|
||||||
if (task.successfulAttempt == null) {
|
if (task.successfulAttempt == null) {
|
||||||
task.addAndScheduleAttempt();
|
task.addAndScheduleAttempt();
|
||||||
}
|
}
|
||||||
|
@ -840,15 +855,25 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
|
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
|
||||||
|
|
||||||
protected TaskStateInternal finalState = TaskStateInternal.KILLED;
|
protected TaskStateInternal finalState = TaskStateInternal.KILLED;
|
||||||
|
protected final TaskAttemptCompletionEventStatus taCompletionEventStatus;
|
||||||
|
|
||||||
|
public KillWaitAttemptKilledTransition() {
|
||||||
|
this(TaskAttemptCompletionEventStatus.KILLED);
|
||||||
|
}
|
||||||
|
|
||||||
|
public KillWaitAttemptKilledTransition(
|
||||||
|
TaskAttemptCompletionEventStatus taCompletionEventStatus) {
|
||||||
|
this.taCompletionEventStatus = taCompletionEventStatus;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
||||||
task.handleTaskAttemptCompletion(
|
TaskAttemptId taskAttemptId =
|
||||||
((TaskTAttemptEvent) event).getTaskAttemptID(),
|
((TaskTAttemptEvent) event).getTaskAttemptID();
|
||||||
TaskAttemptCompletionEventStatus.KILLED);
|
task.handleTaskAttemptCompletion(taskAttemptId, taCompletionEventStatus);
|
||||||
task.finishedAttempts++;
|
task.finishedAttempts.add(taskAttemptId);
|
||||||
// check whether all attempts are finished
|
// check whether all attempts are finished
|
||||||
if (task.finishedAttempts == task.attempts.size()) {
|
if (task.finishedAttempts.size() == task.attempts.size()) {
|
||||||
if (task.historyTaskStartGenerated) {
|
if (task.historyTaskStartGenerated) {
|
||||||
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, null,
|
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, null,
|
||||||
finalState, null); // TODO JH verify failedAttempt null
|
finalState, null); // TODO JH verify failedAttempt null
|
||||||
|
@ -867,43 +892,57 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class KillWaitAttemptSucceededTransition extends
|
||||||
|
KillWaitAttemptKilledTransition {
|
||||||
|
public KillWaitAttemptSucceededTransition() {
|
||||||
|
super(TaskAttemptCompletionEventStatus.SUCCEEDED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class KillWaitAttemptFailedTransition extends
|
||||||
|
KillWaitAttemptKilledTransition {
|
||||||
|
public KillWaitAttemptFailedTransition() {
|
||||||
|
super(TaskAttemptCompletionEventStatus.FAILED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static class AttemptFailedTransition implements
|
private static class AttemptFailedTransition implements
|
||||||
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
|
MultipleArcTransition<TaskImpl, TaskEvent, TaskStateInternal> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
||||||
task.failedAttempts++;
|
|
||||||
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
|
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
|
||||||
if (castEvent.getTaskAttemptID().equals(task.commitAttempt)) {
|
TaskAttemptId taskAttemptId = castEvent.getTaskAttemptID();
|
||||||
|
task.failedAttempts.add(taskAttemptId);
|
||||||
|
if (taskAttemptId.equals(task.commitAttempt)) {
|
||||||
task.commitAttempt = null;
|
task.commitAttempt = null;
|
||||||
}
|
}
|
||||||
TaskAttempt attempt = task.attempts.get(castEvent.getTaskAttemptID());
|
TaskAttempt attempt = task.attempts.get(taskAttemptId);
|
||||||
if (attempt.getAssignedContainerMgrAddress() != null) {
|
if (attempt.getAssignedContainerMgrAddress() != null) {
|
||||||
//container was assigned
|
//container was assigned
|
||||||
task.eventHandler.handle(new ContainerFailedEvent(attempt.getID(),
|
task.eventHandler.handle(new ContainerFailedEvent(attempt.getID(),
|
||||||
attempt.getAssignedContainerMgrAddress()));
|
attempt.getAssignedContainerMgrAddress()));
|
||||||
}
|
}
|
||||||
|
|
||||||
task.finishedAttempts++;
|
task.finishedAttempts.add(taskAttemptId);
|
||||||
if (task.failedAttempts < task.maxAttempts) {
|
if (task.failedAttempts.size() < task.maxAttempts) {
|
||||||
task.handleTaskAttemptCompletion(
|
task.handleTaskAttemptCompletion(
|
||||||
((TaskTAttemptEvent) event).getTaskAttemptID(),
|
taskAttemptId,
|
||||||
TaskAttemptCompletionEventStatus.FAILED);
|
TaskAttemptCompletionEventStatus.FAILED);
|
||||||
// we don't need a new event if we already have a spare
|
// we don't need a new event if we already have a spare
|
||||||
if (--task.numberUncompletedAttempts == 0
|
task.inProgressAttempts.remove(taskAttemptId);
|
||||||
|
if (task.inProgressAttempts.size() == 0
|
||||||
&& task.successfulAttempt == null) {
|
&& task.successfulAttempt == null) {
|
||||||
task.addAndScheduleAttempt();
|
task.addAndScheduleAttempt();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
task.handleTaskAttemptCompletion(
|
task.handleTaskAttemptCompletion(
|
||||||
((TaskTAttemptEvent) event).getTaskAttemptID(),
|
taskAttemptId,
|
||||||
TaskAttemptCompletionEventStatus.TIPFAILED);
|
TaskAttemptCompletionEventStatus.TIPFAILED);
|
||||||
TaskTAttemptEvent ev = (TaskTAttemptEvent) event;
|
|
||||||
TaskAttemptId taId = ev.getTaskAttemptID();
|
|
||||||
|
|
||||||
if (task.historyTaskStartGenerated) {
|
if (task.historyTaskStartGenerated) {
|
||||||
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, attempt.getDiagnostics(),
|
TaskFailedEvent taskFailedEvent = createTaskFailedEvent(task, attempt.getDiagnostics(),
|
||||||
TaskStateInternal.FAILED, taId);
|
TaskStateInternal.FAILED, taskAttemptId);
|
||||||
task.eventHandler.handle(new JobHistoryEvent(task.taskId.getJobId(),
|
task.eventHandler.handle(new JobHistoryEvent(task.taskId.getJobId(),
|
||||||
taskFailedEvent));
|
taskFailedEvent));
|
||||||
} else {
|
} else {
|
||||||
|
@ -927,14 +966,12 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
public TaskStateInternal transition(TaskImpl task, TaskEvent event) {
|
||||||
if (event instanceof TaskTAttemptEvent) {
|
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
|
||||||
TaskTAttemptEvent castEvent = (TaskTAttemptEvent) event;
|
if (task.getInternalState() == TaskStateInternal.SUCCEEDED &&
|
||||||
if (task.getInternalState() == TaskStateInternal.SUCCEEDED &&
|
!castEvent.getTaskAttemptID().equals(task.successfulAttempt)) {
|
||||||
!castEvent.getTaskAttemptID().equals(task.successfulAttempt)) {
|
// don't allow a different task attempt to override a previous
|
||||||
// don't allow a different task attempt to override a previous
|
// succeeded state
|
||||||
// succeeded state
|
return TaskStateInternal.SUCCEEDED;
|
||||||
return TaskStateInternal.SUCCEEDED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// a successful REDUCE task should not be overridden
|
// a successful REDUCE task should not be overridden
|
||||||
|
@ -953,7 +990,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
// believe that there's no redundancy.
|
// believe that there's no redundancy.
|
||||||
unSucceed(task);
|
unSucceed(task);
|
||||||
// fake increase in Uncomplete attempts for super.transition
|
// fake increase in Uncomplete attempts for super.transition
|
||||||
++task.numberUncompletedAttempts;
|
task.inProgressAttempts.add(castEvent.getTaskAttemptID());
|
||||||
return super.transition(task, event);
|
return super.transition(task, event);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1045,7 +1082,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
|
||||||
(attempt, "Task KILL is received. Killing attempt!");
|
(attempt, "Task KILL is received. Killing attempt!");
|
||||||
}
|
}
|
||||||
|
|
||||||
task.numberUncompletedAttempts = 0;
|
task.inProgressAttempts.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.Task;
|
import org.apache.hadoop.mapreduce.v2.app.job.Task;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
|
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal;
|
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.TaskStateInternal;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
|
||||||
|
@ -63,6 +64,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
|
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl;
|
import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.impl.TaskImpl;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher;
|
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
|
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
|
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
|
||||||
|
@ -243,6 +245,39 @@ public class MRApp extends MRAppMaster {
|
||||||
return job;
|
return job;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void waitForInternalState(JobImpl job,
|
||||||
|
JobStateInternal finalState) throws Exception {
|
||||||
|
int timeoutSecs = 0;
|
||||||
|
JobStateInternal iState = job.getInternalState();
|
||||||
|
while (!finalState.equals(iState) && timeoutSecs++ < 20) {
|
||||||
|
System.out.println("Job Internal State is : " + iState
|
||||||
|
+ " Waiting for Internal state : " + finalState);
|
||||||
|
Thread.sleep(500);
|
||||||
|
iState = job.getInternalState();
|
||||||
|
}
|
||||||
|
System.out.println("Task Internal State is : " + iState);
|
||||||
|
Assert.assertEquals("Task Internal state is not correct (timedout)",
|
||||||
|
finalState, iState);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void waitForInternalState(TaskImpl task,
|
||||||
|
TaskStateInternal finalState) throws Exception {
|
||||||
|
int timeoutSecs = 0;
|
||||||
|
TaskReport report = task.getReport();
|
||||||
|
TaskStateInternal iState = task.getInternalState();
|
||||||
|
while (!finalState.equals(iState) && timeoutSecs++ < 20) {
|
||||||
|
System.out.println("Task Internal State is : " + iState
|
||||||
|
+ " Waiting for Internal state : " + finalState + " progress : "
|
||||||
|
+ report.getProgress());
|
||||||
|
Thread.sleep(500);
|
||||||
|
report = task.getReport();
|
||||||
|
iState = task.getInternalState();
|
||||||
|
}
|
||||||
|
System.out.println("Task Internal State is : " + iState);
|
||||||
|
Assert.assertEquals("Task Internal state is not correct (timedout)",
|
||||||
|
finalState, iState);
|
||||||
|
}
|
||||||
|
|
||||||
public void waitForInternalState(TaskAttemptImpl attempt,
|
public void waitForInternalState(TaskAttemptImpl attempt,
|
||||||
TaskAttemptStateInternal finalState) throws Exception {
|
TaskAttemptStateInternal finalState) throws Exception {
|
||||||
int timeoutSecs = 0;
|
int timeoutSecs = 0;
|
||||||
|
|
|
@ -25,12 +25,15 @@ import java.util.concurrent.CountDownLatch;
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
|
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.Job;
|
import org.apache.hadoop.mapreduce.v2.app.job.Job;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.Task;
|
import org.apache.hadoop.mapreduce.v2.app.job.Task;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
|
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
|
||||||
|
@ -39,12 +42,18 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskTAttemptEvent;
|
||||||
|
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
|
||||||
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
|
import org.apache.hadoop.yarn.event.Event;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests the state machine with respect to Job/Task/TaskAttempt kill scenarios.
|
* Tests the state machine with respect to Job/Task/TaskAttempt kill scenarios.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||||
public class TestKill {
|
public class TestKill {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -131,6 +140,80 @@ public class TestKill {
|
||||||
iter.next().getReport().getTaskAttemptState());
|
iter.next().getReport().getTaskAttemptState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKillTaskWait() throws Exception {
|
||||||
|
final Dispatcher dispatcher = new AsyncDispatcher() {
|
||||||
|
private TaskAttemptEvent cachedKillEvent;
|
||||||
|
@Override
|
||||||
|
protected void dispatch(Event event) {
|
||||||
|
if (event instanceof TaskAttemptEvent) {
|
||||||
|
TaskAttemptEvent killEvent = (TaskAttemptEvent) event;
|
||||||
|
if (killEvent.getType() == TaskAttemptEventType.TA_KILL) {
|
||||||
|
TaskAttemptId taID = killEvent.getTaskAttemptID();
|
||||||
|
if (taID.getTaskId().getTaskType() == TaskType.REDUCE
|
||||||
|
&& taID.getTaskId().getId() == 0 && taID.getId() == 0) {
|
||||||
|
// Task is asking the reduce TA to kill itself. 'Create' a race
|
||||||
|
// condition. Make the task succeed and then inform the task that
|
||||||
|
// TA has succeeded. Once Task gets the TA succeeded event at
|
||||||
|
// KILL_WAIT, then relay the actual kill signal to TA
|
||||||
|
super.dispatch(new TaskAttemptEvent(taID,
|
||||||
|
TaskAttemptEventType.TA_DONE));
|
||||||
|
super.dispatch(new TaskAttemptEvent(taID,
|
||||||
|
TaskAttemptEventType.TA_CONTAINER_CLEANED));
|
||||||
|
super.dispatch(new TaskTAttemptEvent(taID,
|
||||||
|
TaskEventType.T_ATTEMPT_SUCCEEDED));
|
||||||
|
this.cachedKillEvent = killEvent;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (event instanceof TaskEvent) {
|
||||||
|
TaskEvent taskEvent = (TaskEvent) event;
|
||||||
|
if (taskEvent.getType() == TaskEventType.T_ATTEMPT_SUCCEEDED
|
||||||
|
&& this.cachedKillEvent != null) {
|
||||||
|
// When the TA comes and reports that it is done, send the
|
||||||
|
// cachedKillEvent
|
||||||
|
super.dispatch(this.cachedKillEvent);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
super.dispatch(event);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
MRApp app = new MRApp(1, 1, false, this.getClass().getName(), true) {
|
||||||
|
@Override
|
||||||
|
public Dispatcher createDispatcher() {
|
||||||
|
return dispatcher;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Job job = app.submit(new Configuration());
|
||||||
|
JobId jobId = app.getJobId();
|
||||||
|
app.waitForState(job, JobState.RUNNING);
|
||||||
|
Assert.assertEquals("Num tasks not correct", 2, job.getTasks().size());
|
||||||
|
Iterator<Task> it = job.getTasks().values().iterator();
|
||||||
|
Task mapTask = it.next();
|
||||||
|
Task reduceTask = it.next();
|
||||||
|
app.waitForState(mapTask, TaskState.RUNNING);
|
||||||
|
app.waitForState(reduceTask, TaskState.RUNNING);
|
||||||
|
TaskAttempt mapAttempt = mapTask.getAttempts().values().iterator().next();
|
||||||
|
app.waitForState(mapAttempt, TaskAttemptState.RUNNING);
|
||||||
|
TaskAttempt reduceAttempt = reduceTask.getAttempts().values().iterator().next();
|
||||||
|
app.waitForState(reduceAttempt, TaskAttemptState.RUNNING);
|
||||||
|
|
||||||
|
// Finish map
|
||||||
|
app.getContext().getEventHandler().handle(
|
||||||
|
new TaskAttemptEvent(
|
||||||
|
mapAttempt.getID(),
|
||||||
|
TaskAttemptEventType.TA_DONE));
|
||||||
|
app.waitForState(mapTask, TaskState.SUCCEEDED);
|
||||||
|
|
||||||
|
// Now kill the job
|
||||||
|
app.getContext().getEventHandler()
|
||||||
|
.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
|
||||||
|
|
||||||
|
app.waitForInternalState((JobImpl)job, JobStateInternal.KILLED);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testKillTaskAttempt() throws Exception {
|
public void testKillTaskAttempt() throws Exception {
|
||||||
final CountDownLatch latch = new CountDownLatch(1);
|
final CountDownLatch latch = new CountDownLatch(1);
|
||||||
|
|
Loading…
Reference in New Issue