From ece01478c5e7762a9037880bdc3c18d549d38b32 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 14 Apr 2016 19:17:14 +0000 Subject: [PATCH] YARN-4924. NM recovery race can lead to container not cleaned up. Contributed by sandflee (cherry picked from commit 3150ae8108a1fc40a67926be6254824c1e37cb38) --- .../ContainerManagerImpl.java | 17 ---- .../recovery/NMLeveldbStateStoreService.java | 80 ++++++++++++------- .../recovery/NMNullStateStoreService.java | 4 - .../recovery/NMStateStoreService.java | 12 --- .../TestContainerManagerRecovery.java | 4 + .../recovery/NMMemoryStateStoreService.java | 10 --- .../TestNMLeveldbStateStoreService.java | 10 +-- 7 files changed, 54 insertions(+), 83 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 8d09aa75bcc..b8cca28e82d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -296,20 +296,8 @@ public class ContainerManagerImpl extends CompositeService implements if (LOG.isDebugEnabled()) { LOG.debug("Recovering container with state: " + rcs); } - recoverContainer(rcs); } - - String diagnostic = "Application marked finished during recovery"; - for (ApplicationId appId : appsState.getFinishedApplications()) { - - if (LOG.isDebugEnabled()) { - LOG.debug("Application marked finished during recovery: " + appId); - } - - dispatcher.getEventHandler().handle( - new ApplicationFinishEvent(appId, diagnostic)); - } } else { LOG.info("Not a recoverable state store. Nothing to recover."); } @@ -1332,11 +1320,6 @@ public class ContainerManagerImpl extends CompositeService implements } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) { diagnostic = "Application killed by ResourceManager"; } - try { - this.context.getNMStateStore().storeFinishedApplication(appID); - } catch (IOException e) { - LOG.error("Unable to update application state in store", e); - } this.dispatcher.getEventHandler().handle( new ApplicationFinishEvent(appID, diagnostic)); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index 81d6c57de6e..26dea2daa0d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { private static final String APPLICATIONS_KEY_PREFIX = "ContainerManager/applications/"; + @Deprecated private static final String FINISHED_APPS_KEY_PREFIX = "ContainerManager/finishedApps/"; @@ -392,20 +393,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { state.applications.add( ContainerManagerApplicationProto.parseFrom(entry.getValue())); } - - state.finishedApplications = new ArrayList(); - keyPrefix = FINISHED_APPS_KEY_PREFIX; - iter.seek(bytes(keyPrefix)); - while (iter.hasNext()) { - Entry entry = iter.next(); - String key = asString(entry.getKey()); - if (!key.startsWith(keyPrefix)) { - break; - } - ApplicationId appId = - ConverterUtils.toApplicationId(key.substring(keyPrefix.length())); - state.finishedApplications.add(appId); - } } catch (DBException e) { throw new IOException(e); } finally { @@ -414,6 +401,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } + cleanupDeprecatedFinishedApps(); + return state; } @@ -433,21 +422,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } - @Override - public void storeFinishedApplication(ApplicationId appId) - throws IOException { - if (LOG.isDebugEnabled()) { - LOG.debug("storeFinishedApplication.appId: " + appId); - } - - String key = FINISHED_APPS_KEY_PREFIX + appId; - try { - db.put(bytes(key), new byte[0]); - } catch (DBException e) { - throw new IOException(e); - } - } - @Override public void removeApplication(ApplicationId appId) throws IOException { @@ -460,8 +434,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { try { String key = APPLICATIONS_KEY_PREFIX + appId; batch.delete(bytes(key)); - key = FINISHED_APPS_KEY_PREFIX + appId; - batch.delete(bytes(key)); db.write(batch); } finally { batch.close(); @@ -979,6 +951,52 @@ public class NMLeveldbStateStoreService extends NMStateStoreService { } } + @SuppressWarnings("deprecation") + private void cleanupDeprecatedFinishedApps() { + try { + cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX); + } catch (Exception e) { + LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX + + " from leveldb failed", e); + } + } + + private void cleanupKeysWithPrefix(String prefix) throws IOException { + WriteBatch batch = null; + LeveldbIterator iter = null; + try { + iter = new LeveldbIterator(db); + try { + batch = db.createWriteBatch(); + iter.seek(bytes(prefix)); + while (iter.hasNext()) { + byte[] key = iter.next().getKey(); + String keyStr = asString(key); + if (!keyStr.startsWith(prefix)) { + break; + } + batch.delete(key); + if (LOG.isDebugEnabled()) { + LOG.debug("cleanup " + keyStr + " from leveldb"); + } + } + db.write(batch); + } catch (DBException e) { + throw new IOException(e); + } finally { + if (batch != null) { + batch.close(); + } + } + } catch (DBException e) { + throw new IOException(e); + } finally { + if (iter != null) { + iter.close(); + } + } + } + private String getLogDeleterKey(ApplicationId appId) { return LOG_DELETER_KEY_PREFIX + appId; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java index d5dce9bb2ee..a887e71e9e1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java @@ -58,10 +58,6 @@ public class NMNullStateStoreService extends NMStateStoreService { ContainerManagerApplicationProto p) throws IOException { } - @Override - public void storeFinishedApplication(ApplicationId appId) { - } - @Override public void removeApplication(ApplicationId appId) throws IOException { } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java index 84c5aa982a7..463815ec9c1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java @@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends AbstractService { public static class RecoveredApplicationsState { List applications; - List finishedApplications; public List getApplications() { return applications; } - public List getFinishedApplications() { - return finishedApplications; - } } public enum RecoveredContainerStatus { @@ -258,14 +254,6 @@ public abstract class NMStateStoreService extends AbstractService { public abstract void storeApplication(ApplicationId appId, ContainerManagerApplicationProto p) throws IOException; - /** - * Record that an application has finished - * @param appId the application ID - * @throws IOException - */ - public abstract void storeFinishedApplication(ApplicationId appId) - throws IOException; - /** * Remove records corresponding to an application * @param appId the application ID diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index 2e014decbc8..9fa3fcc13c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { assertEquals(1, context.getApplications().size()); app = context.getApplications().get(appId); assertNotNull(app); + // no longer saving FINISH_APP event in NM stateStore, + // simulate by resending FINISH_APP event + cm.handle(new CMgrCompletedAppsEvent(finishedApps, + CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); assertTrue(context.getApplicationACLsManager().checkAccess( UserGroupInformation.createRemoteUser(modUser), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java index a1c95ab03b9..12798963390 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java @@ -44,7 +44,6 @@ import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; public class NMMemoryStateStoreService extends NMStateStoreService { private Map apps; - private Set finishedApps; private Map containerStates; private Map trackerStates; private Map deleteTasks; @@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { @Override protected void initStorage(Configuration conf) { apps = new HashMap(); - finishedApps = new HashSet(); containerStates = new HashMap(); nmTokenState = new RecoveredNMTokensState(); nmTokenState.applicationMasterKeys = @@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { RecoveredApplicationsState state = new RecoveredApplicationsState(); state.applications = new ArrayList( apps.values()); - state.finishedApplications = new ArrayList(finishedApps); return state; } @@ -98,16 +95,10 @@ public class NMMemoryStateStoreService extends NMStateStoreService { apps.put(appId, protoCopy); } - @Override - public synchronized void storeFinishedApplication(ApplicationId appId) { - finishedApps.add(appId); - } - @Override public synchronized void removeApplication(ApplicationId appId) throws IOException { apps.remove(appId); - finishedApps.remove(appId); } @Override @@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService { logDeleterState.remove(appId); } - private static class TrackerState { Map inProgressMap = new HashMap(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java index 08b49e75383..47468d6c25e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java @@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService { // test empty when no state RecoveredApplicationsState state = stateStore.loadApplicationsState(); assertTrue(state.getApplications().isEmpty()); - assertTrue(state.getFinishedApplications().isEmpty()); // store an application and verify recovered final ApplicationId appId1 = ApplicationId.newInstance(1234, 1); @@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService { state = stateStore.loadApplicationsState(); assertEquals(1, state.getApplications().size()); assertEquals(appProto1, state.getApplications().get(0)); - assertTrue(state.getFinishedApplications().isEmpty()); - // finish an application and add a new one - stateStore.storeFinishedApplication(appId1); + // add a new app final ApplicationId appId2 = ApplicationId.newInstance(1234, 2); builder = ContainerManagerApplicationProto.newBuilder(); builder.setId(((ApplicationIdPBImpl) appId2).getProto()); @@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService { assertEquals(2, state.getApplications().size()); assertTrue(state.getApplications().contains(appProto1)); assertTrue(state.getApplications().contains(appProto2)); - assertEquals(1, state.getFinishedApplications().size()); - assertEquals(appId1, state.getFinishedApplications().get(0)); // test removing an application - stateStore.storeFinishedApplication(appId2); stateStore.removeApplication(appId2); restartStateStore(); state = stateStore.loadApplicationsState(); assertEquals(1, state.getApplications().size()); assertEquals(appProto1, state.getApplications().get(0)); - assertEquals(1, state.getFinishedApplications().size()); - assertEquals(appId1, state.getFinishedApplications().get(0)); } @Test