YARN-10822. Containers going from New to Scheduled transition for kil… (#3632)
This commit is contained in:
parent
aeae5716cc
commit
87abc437c7
|
@ -1187,10 +1187,7 @@ public class ContainerImpl implements Container {
|
|||
if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
|
||||
container.sendFinishedEvents();
|
||||
return ContainerState.DONE;
|
||||
} else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
|
||||
return ContainerState.SCHEDULED;
|
||||
} else if (container.recoveredAsKilled &&
|
||||
container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
|
||||
} else if (isContainerRecoveredAsKilled(container)) {
|
||||
// container was killed but never launched
|
||||
container.metrics.killedContainer();
|
||||
NMAuditLogger.logSuccess(container.user,
|
||||
|
@ -1201,6 +1198,8 @@ public class ContainerImpl implements Container {
|
|||
container.containerTokenIdentifier.getResource());
|
||||
container.sendFinishedEvents();
|
||||
return ContainerState.DONE;
|
||||
} else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
|
||||
return ContainerState.SCHEDULED;
|
||||
}
|
||||
|
||||
final ContainerLaunchContext ctxt = container.launchContext;
|
||||
|
@ -1264,6 +1263,16 @@ public class ContainerImpl implements Container {
|
|||
return ContainerState.LOCALIZATION_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
static boolean isContainerRecoveredAsKilled(ContainerImpl container) {
|
||||
if (!container.recoveredAsKilled) {
|
||||
return false;
|
||||
}
|
||||
// container was killed but never launched
|
||||
RecoveredContainerStatus containerStatus = container.recoveredStatus;
|
||||
return containerStatus == RecoveredContainerStatus.REQUESTED
|
||||
|| containerStatus == RecoveredContainerStatus.QUEUED;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -42,6 +42,7 @@ import java.util.Collections;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
|
@ -682,7 +683,49 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
|||
verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class));
|
||||
}
|
||||
|
||||
private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
|
||||
@Test
|
||||
public void testKilledContainerInQueuedStateRecovery() throws Exception {
|
||||
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
|
||||
conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
|
||||
NMStateStoreService stateStore = new NMMemoryStateStoreService();
|
||||
stateStore.init(conf);
|
||||
stateStore.start();
|
||||
context = createContext(conf, stateStore);
|
||||
ContainerManagerImpl cm = createContainerManager(context, delSrvc);
|
||||
((NMContext) context).setContainerManager(cm);
|
||||
cm.init(conf);
|
||||
cm.start();
|
||||
|
||||
// add an application by starting a container
|
||||
ApplicationId appId = ApplicationId.newInstance(0, 0);
|
||||
ApplicationAttemptId attemptId =
|
||||
ApplicationAttemptId.newInstance(appId, 1);
|
||||
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
|
||||
createStartContainerRequest(appId, cid, cm);
|
||||
|
||||
Application app = context.getApplications().get(appId);
|
||||
assertEquals(1, context.getApplications().size());
|
||||
assertNotNull(app);
|
||||
|
||||
stateStore.storeContainerKilled(cid);
|
||||
// restart and verify container scheduler has recovered correctly
|
||||
cm.stop();
|
||||
context = createContext(conf, stateStore);
|
||||
cm = createContainerManager(context, delSrvc);
|
||||
((NMContext) context).setContainerManager(cm);
|
||||
cm.init(conf);
|
||||
cm.start();
|
||||
assertEquals(1, context.getApplications().size());
|
||||
|
||||
ConcurrentMap<ContainerId, Container> containers = context.getContainers();
|
||||
Container c = containers.get(cid);
|
||||
assertEquals(ContainerState.DONE, c.getContainerState());
|
||||
app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
cm.stop();
|
||||
}
|
||||
|
||||
private void createStartContainerRequest(ApplicationId appId, ContainerId cid,
|
||||
ContainerManagerImpl cm) throws Exception {
|
||||
Map<String, String> containerEnv = new HashMap<>();
|
||||
setFlowContext(containerEnv, "app_name1", appId);
|
||||
|
@ -727,6 +770,11 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
|||
context, cm, cid, clc, null, ContainerType.TASK);
|
||||
assertTrue(startResponse.getFailedRequests().isEmpty());
|
||||
assertEquals(1, context.getApplications().size());
|
||||
}
|
||||
|
||||
private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
|
||||
ContainerManagerImpl cm) throws Exception {
|
||||
createStartContainerRequest(appId, cid, cm);
|
||||
// make sure the container reaches RUNNING state
|
||||
waitForNMContainerState(cm, cid,
|
||||
org.apache.hadoop.yarn.server.nodemanager
|
||||
|
|
|
@ -168,6 +168,7 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
|||
int version, long startTime, StartContainerRequest startRequest) {
|
||||
RecoveredContainerState rcs = new RecoveredContainerState(containerId);
|
||||
rcs.startRequest = startRequest;
|
||||
rcs.status = RecoveredContainerStatus.REQUESTED;
|
||||
rcs.version = version;
|
||||
try {
|
||||
ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils
|
||||
|
|
Loading…
Reference in New Issue