YARN-2816. NM fails to start with NPE during container recovery. Contributed by Zhihai Xu.

This commit is contained in:
Jason Lowe 2014-11-14 21:25:59 +00:00
parent 3baaa42945
commit 49c38898b0
3 changed files with 33 additions and 1 deletions

View File

@ -90,6 +90,9 @@ Release 2.7.0 - UNRELEASED
YARN-2857. ConcurrentModificationException in ContainerLogAppender
(Mohammad Kamrul Islam via jlowe)
YARN-2816. NM fails to start with NPE during container recovery (Zhihai Xu
via jlowe)
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES

View File

@ -146,6 +146,8 @@ public List<RecoveredContainerState> loadContainersState()
throws IOException { throws IOException {
ArrayList<RecoveredContainerState> containers = ArrayList<RecoveredContainerState> containers =
new ArrayList<RecoveredContainerState>(); new ArrayList<RecoveredContainerState>();
ArrayList<ContainerId> containersToRemove =
new ArrayList<ContainerId>();
LeveldbIterator iter = null; LeveldbIterator iter = null;
try { try {
iter = new LeveldbIterator(db); iter = new LeveldbIterator(db);
@ -165,7 +167,14 @@ public List<RecoveredContainerState> loadContainersState()
ContainerId containerId = ConverterUtils.toContainerId( ContainerId containerId = ConverterUtils.toContainerId(
key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos)); key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos));
String keyPrefix = key.substring(0, idEndPos+1); String keyPrefix = key.substring(0, idEndPos+1);
containers.add(loadContainerState(containerId, iter, keyPrefix)); RecoveredContainerState rcs = loadContainerState(containerId,
iter, keyPrefix);
// Don't load container without StartContainerRequest
if (rcs.startRequest != null) {
containers.add(rcs);
} else {
containersToRemove.add(containerId);
}
} }
} catch (DBException e) { } catch (DBException e) {
throw new IOException(e); throw new IOException(e);
@ -175,6 +184,19 @@ public List<RecoveredContainerState> loadContainersState()
} }
} }
// remove container without StartContainerRequest
for (ContainerId containerId : containersToRemove) {
LOG.warn("Remove container " + containerId +
" with incomplete records");
try {
removeContainer(containerId);
// TODO: kill and cleanup the leaked container
} catch (IOException e) {
LOG.error("Unable to remove container " + containerId +
" in store", e);
}
}
return containers; return containers;
} }

View File

@ -274,6 +274,13 @@ public void testContainerStorage() throws IOException {
assertEquals(containerReq, rcs.getStartRequest()); assertEquals(containerReq, rcs.getStartRequest());
assertTrue(rcs.getDiagnostics().isEmpty()); assertTrue(rcs.getDiagnostics().isEmpty());
// store a new container record without StartContainerRequest
ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6);
stateStore.storeContainerLaunched(containerId1);
recoveredContainers = stateStore.loadContainersState();
// check whether the new container record is discarded
assertEquals(1, recoveredContainers.size());
// launch the container, add some diagnostics, and verify recovered // launch the container, add some diagnostics, and verify recovered
StringBuilder diags = new StringBuilder(); StringBuilder diags = new StringBuilder();
stateStore.storeContainerLaunched(containerId); stateStore.storeContainerLaunched(containerId);