From e65ae575a059a426c4c38fdabe22a31eabbb349e Mon Sep 17 00:00:00 2001 From: XuanGong Date: Fri, 12 Sep 2014 15:21:46 -0700 Subject: [PATCH] YARN-2456. Possible livelock in CapacityScheduler when RM is recovering apps. Contributed by Jian He --- hadoop-yarn-project/CHANGES.txt | 3 ++ .../recovery/RMStateStore.java | 3 +- .../server/resourcemanager/TestRMRestart.java | 43 +++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 9002e6a4d5c..efc3e093ee2 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -345,6 +345,9 @@ Release 2.6.0 - UNRELEASED YARN-2484. FileSystemRMStateStore#readFile/writeFile should close FSData(In|Out)putStream in final block (Tsuyoshi OZAWA via jlowe) + YARN-2456. Possible livelock in CapacityScheduler when RM is recovering apps. + (Jian He via xgong) + Release 2.5.1 - 2014-09-05 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index ac51a1747c0..df4f3a9773e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import javax.crypto.SecretKey; @@ -421,7 +422,7 @@ public abstract class RMStateStore extends AbstractService { */ public static class RMState { Map appState = - new HashMap(); + new TreeMap(); RMDTSecretManagerState rmSecretManagerState = new RMDTSecretManagerState(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index 7d511db36fe..caa5647c738 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -19,9 +19,11 @@ package org.apache.hadoop.yarn.server.resourcemanager; import static org.mockito.Matchers.isA; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import java.io.File; import java.io.FileOutputStream; @@ -1656,6 +1658,47 @@ public class TestRMRestart { rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED); } + @Test (timeout = 20000) + public void testAppRecoveredInOrderOnRMRestart() throws Exception { + MemoryRMStateStore memStore = new MemoryRMStateStore(); + memStore.init(conf); + + for (int i = 10; i > 0; i--) { + ApplicationState appState = mock(ApplicationState.class); + when(appState.getAppId()).thenReturn(ApplicationId.newInstance(1234, i)); + memStore.getState().getApplicationState() + .put(appState.getAppId(), appState); + } + + MockRM rm1 = new MockRM(conf, memStore) { + @Override + protected RMAppManager createRMAppManager() { + return new TestRMAppManager(this.rmContext, this.scheduler, + this.masterService, this.applicationACLsManager, conf); + } + + class TestRMAppManager extends RMAppManager { + ApplicationId prevId = ApplicationId.newInstance(1234, 0); + + public TestRMAppManager(RMContext context, YarnScheduler scheduler, + ApplicationMasterService masterService, + ApplicationACLsManager applicationACLsManager, Configuration conf) { + super(context, scheduler, masterService, applicationACLsManager, conf); + } + + @Override + protected void recoverApplication(ApplicationState appState, + RMState rmState) throws Exception { + // check application is recovered in order. + Assert.assertTrue(rmState.getApplicationState().size() > 0); + Assert.assertTrue(appState.getAppId().compareTo(prevId) > 0); + prevId = appState.getAppId(); + } + } + }; + rm1.start(); + } + @SuppressWarnings("resource") @Test (timeout = 60000) public void testQueueMetricsOnRMRestart() throws Exception {