YARN-2816. NM fails to start with NPE during container recovery. Contributed by Zhihai Xu
This commit is contained in:
parent
3baaa42945
commit
49c38898b0
|
@ -90,6 +90,9 @@ Release 2.7.0 - UNRELEASED
|
||||||
YARN-2857. ConcurrentModificationException in ContainerLogAppender
|
YARN-2857. ConcurrentModificationException in ContainerLogAppender
|
||||||
(Mohammad Kamrul Islam via jlowe)
|
(Mohammad Kamrul Islam via jlowe)
|
||||||
|
|
||||||
|
YARN-2816. NM fails to start with NPE during container recovery (Zhihai Xu
|
||||||
|
via jlowe)
|
||||||
|
|
||||||
Release 2.6.0 - 2014-11-18
|
Release 2.6.0 - 2014-11-18
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -146,6 +146,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
ArrayList<RecoveredContainerState> containers =
|
ArrayList<RecoveredContainerState> containers =
|
||||||
new ArrayList<RecoveredContainerState>();
|
new ArrayList<RecoveredContainerState>();
|
||||||
|
ArrayList<ContainerId> containersToRemove =
|
||||||
|
new ArrayList<ContainerId>();
|
||||||
LeveldbIterator iter = null;
|
LeveldbIterator iter = null;
|
||||||
try {
|
try {
|
||||||
iter = new LeveldbIterator(db);
|
iter = new LeveldbIterator(db);
|
||||||
|
@ -165,7 +167,14 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
ContainerId containerId = ConverterUtils.toContainerId(
|
ContainerId containerId = ConverterUtils.toContainerId(
|
||||||
key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos));
|
key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos));
|
||||||
String keyPrefix = key.substring(0, idEndPos+1);
|
String keyPrefix = key.substring(0, idEndPos+1);
|
||||||
containers.add(loadContainerState(containerId, iter, keyPrefix));
|
RecoveredContainerState rcs = loadContainerState(containerId,
|
||||||
|
iter, keyPrefix);
|
||||||
|
// Don't load a container that has no StartContainerRequest
|
||||||
|
if (rcs.startRequest != null) {
|
||||||
|
containers.add(rcs);
|
||||||
|
} else {
|
||||||
|
containersToRemove.add(containerId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (DBException e) {
|
} catch (DBException e) {
|
||||||
throw new IOException(e);
|
throw new IOException(e);
|
||||||
|
@ -175,6 +184,19 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove containers that have no StartContainerRequest
|
||||||
|
for (ContainerId containerId : containersToRemove) {
|
||||||
|
LOG.warn("Remove container " + containerId +
|
||||||
|
" with incomplete records");
|
||||||
|
try {
|
||||||
|
removeContainer(containerId);
|
||||||
|
// TODO: kill and cleanup the leaked container
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.error("Unable to remove container " + containerId +
|
||||||
|
" in store", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return containers;
|
return containers;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -274,6 +274,13 @@ public class TestNMLeveldbStateStoreService {
|
||||||
assertEquals(containerReq, rcs.getStartRequest());
|
assertEquals(containerReq, rcs.getStartRequest());
|
||||||
assertTrue(rcs.getDiagnostics().isEmpty());
|
assertTrue(rcs.getDiagnostics().isEmpty());
|
||||||
|
|
||||||
|
// store a new container record without StartContainerRequest
|
||||||
|
ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6);
|
||||||
|
stateStore.storeContainerLaunched(containerId1);
|
||||||
|
recoveredContainers = stateStore.loadContainersState();
|
||||||
|
// check whether the new container record is discarded
|
||||||
|
assertEquals(1, recoveredContainers.size());
|
||||||
|
|
||||||
// launch the container, add some diagnostics, and verify recovered
|
// launch the container, add some diagnostics, and verify recovered
|
||||||
StringBuilder diags = new StringBuilder();
|
StringBuilder diags = new StringBuilder();
|
||||||
stateStore.storeContainerLaunched(containerId);
|
stateStore.storeContainerLaunched(containerId);
|
||||||
|
|
Loading…
Reference in New Issue