YARN-10822. Containers going from New to Scheduled transition for kil… (#3632)

This commit is contained in:
minni31 2022-02-01 23:05:59 +05:30 committed by GitHub
parent aeae5716cc
commit 87abc437c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 63 additions and 5 deletions

View File

@ -1187,10 +1187,7 @@ public class ContainerImpl implements Container {
if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
container.sendFinishedEvents();
return ContainerState.DONE;
} else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
return ContainerState.SCHEDULED;
} else if (container.recoveredAsKilled &&
container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
} else if (isContainerRecoveredAsKilled(container)) {
// container was killed but never launched
container.metrics.killedContainer();
NMAuditLogger.logSuccess(container.user,
@ -1201,6 +1198,8 @@ public class ContainerImpl implements Container {
container.containerTokenIdentifier.getResource());
container.sendFinishedEvents();
return ContainerState.DONE;
} else if (container.recoveredStatus == RecoveredContainerStatus.QUEUED) {
return ContainerState.SCHEDULED;
}
final ContainerLaunchContext ctxt = container.launchContext;
@ -1264,6 +1263,16 @@ public class ContainerImpl implements Container {
return ContainerState.LOCALIZATION_FAILED;
}
}
/**
 * Tells whether a recovered container carries a kill marker while its
 * recovered status is still {@code REQUESTED} or {@code QUEUED}, i.e. it was
 * killed but never launched.
 *
 * @param container the container whose recovery fields are inspected
 * @return {@code true} only when {@code recoveredAsKilled} is set and
 *         {@code recoveredStatus} is {@code REQUESTED} or {@code QUEUED}
 */
static boolean isContainerRecoveredAsKilled(ContainerImpl container) {
  // Without the kill marker the recovered status is irrelevant; the
  // short-circuit keeps the status field from being consulted in that case.
  return container.recoveredAsKilled
      && (container.recoveredStatus == RecoveredContainerStatus.REQUESTED
          || container.recoveredStatus == RecoveredContainerStatus.QUEUED);
}
}
/**

View File

@ -42,6 +42,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
@ -682,7 +683,49 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class));
}
private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
/**
 * Verifies that a container which was requested, marked as killed in the
 * state store, and then recovered after a container manager restart ends up
 * in {@link ContainerState#DONE}.
 *
 * The container is started through {@code createStartContainerRequest},
 * which does not wait for the container to reach RUNNING.
 *
 * @throws Exception if setting up or restarting the container manager fails
 */
@Test
public void testKilledContainerInQueuedStateRecovery() throws Exception {
  // Recovery must be enabled so that the restart below replays stored state.
  conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
  conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
  NMStateStoreService stateStore = new NMMemoryStateStoreService();
  stateStore.init(conf);
  stateStore.start();
  context = createContext(conf, stateStore);
  ContainerManagerImpl cm = createContainerManager(context, delSrvc);
  ((NMContext) context).setContainerManager(cm);
  cm.init(conf);
  cm.start();

  // add an application by starting a container
  ApplicationId appId = ApplicationId.newInstance(0, 0);
  ApplicationAttemptId attemptId =
      ApplicationAttemptId.newInstance(appId, 1);
  ContainerId cid = ContainerId.newContainerId(attemptId, 1);
  createStartContainerRequest(appId, cid, cm);
  Application app = context.getApplications().get(appId);
  assertEquals(1, context.getApplications().size());
  assertNotNull(app);

  // Record the kill marker in the state store before the restart.
  stateStore.storeContainerKilled(cid);

  // restart and verify the killed container is recovered as DONE
  cm.stop();
  context = createContext(conf, stateStore);
  cm = createContainerManager(context, delSrvc);
  ((NMContext) context).setContainerManager(cm);
  cm.init(conf);
  cm.start();
  assertEquals(1, context.getApplications().size());
  ConcurrentMap<ContainerId, Container> containers = context.getContainers();
  Container c = containers.get(cid);
  // Fail with an assertion rather than a NullPointerException if the
  // container was not recovered at all.
  assertNotNull(c);
  assertEquals(ContainerState.DONE, c.getContainerState());
  app = context.getApplications().get(appId);
  assertNotNull(app);
  cm.stop();
}
private void createStartContainerRequest(ApplicationId appId, ContainerId cid,
ContainerManagerImpl cm) throws Exception {
Map<String, String> containerEnv = new HashMap<>();
setFlowContext(containerEnv, "app_name1", appId);
@ -727,6 +770,11 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
context, cm, cid, clc, null, ContainerType.TASK);
assertTrue(startResponse.getFailedRequests().isEmpty());
assertEquals(1, context.getApplications().size());
}
private void commonLaunchContainer(ApplicationId appId, ContainerId cid,
ContainerManagerImpl cm) throws Exception {
createStartContainerRequest(appId, cid, cm);
// make sure the container reaches RUNNING state
waitForNMContainerState(cm, cid,
org.apache.hadoop.yarn.server.nodemanager

View File

@ -168,6 +168,7 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
int version, long startTime, StartContainerRequest startRequest) {
RecoveredContainerState rcs = new RecoveredContainerState(containerId);
rcs.startRequest = startRequest;
rcs.status = RecoveredContainerStatus.REQUESTED;
rcs.version = version;
try {
ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils