YARN-5037. Fix random failure of TestRMRestart#testQueueMetricsOnRMRestart (sandflee via Varun Saxena).

This commit is contained in:
Varun Saxena 2016-07-10 21:28:22 +05:30
parent b191c6b202
commit 0fd3980a1f
2 changed files with 54 additions and 8 deletions

View File

@ -88,6 +88,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStartedEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM;
@ -109,6 +110,7 @@ public class MockRM extends ResourceManager {
static final String ENABLE_WEBAPP = "mockrm.webapp.enabled";
private static final int SECOND = 1000;
private static final int TIMEOUT_MS_FOR_ATTEMPT = 40 * SECOND;
private static final int TIMEOUT_MS_FOR_APP_REMOVED = 40 * SECOND;
private static final int TIMEOUT_MS_FOR_CONTAINER_AND_NODE = 10 * SECOND;
private static final int WAIT_MS_PER_LOOP = 10;
@ -1016,4 +1018,44 @@ public class MockRM extends ResourceManager {
SignalContainerRequest.newInstance(containerId, command);
client.signalToContainer(req);
}
/**
 * Waits for the given application to be removed from the scheduler,
 * using the default timeout {@code TIMEOUT_MS_FOR_APP_REMOVED}
 * (40 seconds).
 *
 * @param appId the id of the application to wait for
 * @throws InterruptedException
 *         if interrupted while waiting for the app to be removed
 */
public void waitForAppRemovedFromScheduler(ApplicationId appId)
    throws InterruptedException {
  // Delegate to the overload that takes an explicit timeout.
  final int defaultTimeoutMsecs = TIMEOUT_MS_FOR_APP_REMOVED;
  waitForAppRemovedFromScheduler(appId, defaultTimeoutMsecs);
}
/**
 * Polls the scheduler's application map until the given application is no
 * longer present, sleeping {@code WAIT_MS_PER_LOOP} milliseconds between
 * checks. Fails with a JUnit assertion error if the application is still
 * present once the accumulated sleep time reaches the timeout.
 *
 * @param appId the id of the application to wait for
 * @param timeoutMsecs the maximum time to wait, in milliseconds
 * @throws InterruptedException
 *         if interrupted while waiting for the app to be removed
 */
public void waitForAppRemovedFromScheduler(ApplicationId appId,
    long timeoutMsecs) throws InterruptedException {
  AbstractYarnScheduler scheduler =
      (AbstractYarnScheduler) getResourceScheduler();
  Map<ApplicationId, SchedulerApplication> schedulerApps =
      scheduler.getSchedulerApplications();

  // The elapsed time is the sum of the sleep intervals, not wall-clock time.
  for (int elapsedMs = 0;
      schedulerApps.containsKey(appId) && elapsedMs < timeoutMsecs;
      elapsedMs += WAIT_MS_PER_LOOP) {
    LOG.info("wait for app removed, " + appId);
    Thread.sleep(WAIT_MS_PER_LOOP);
  }

  // Re-check after the loop: it may have ended because of the timeout.
  Assert.assertFalse("app is not removed from scheduler (timeout).",
      schedulerApps.containsKey(appId));
  LOG.info("app is removed from scheduler, " + appId);
}
}

View File

@ -1909,6 +1909,10 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
// finish the AMs
finishApplicationMaster(loadedApp1, rm2, nm1, am1);
// Now the AppAttempt and App become FINISHED;
// we should also guarantee that the APP_ATTEMPT_REMOVE/APP_REMOVE events
// have been processed by the scheduler.
rm2.waitForAppRemovedFromScheduler(loadedApp1.getApplicationId());
assertQueueMetrics(qm2, 1, 0, 0, 1);
}
@ -1930,14 +1934,14 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
/**
 * Asserts that the application counters of the given queue metrics match
 * the expected values. Each expected value is offset by the corresponding
 * carry-on counter ({@code appsSubmittedCarryOn}, {@code appsPendingCarryOn},
 * {@code appsRunningCarryOn}, {@code appsCompletedCarryOn}) before it is
 * compared.
 *
 * @param qm the queue metrics to check
 * @param appsSubmitted expected number of submitted apps, before the offset
 * @param appsPending expected number of pending apps, before the offset
 * @param appsRunning expected number of running apps, before the offset
 * @param appsCompleted expected number of completed apps, before the offset
 */
private void assertQueueMetrics(QueueMetrics qm, int appsSubmitted,
    int appsPending, int appsRunning, int appsCompleted) {
  // JUnit's assertEquals takes (expected, actual); keeping that order makes
  // the failure message label the two values correctly. Each metric is
  // checked exactly once.
  Assert.assertEquals(appsSubmitted + appsSubmittedCarryOn,
      qm.getAppsSubmitted());
  Assert.assertEquals(appsPending + appsPendingCarryOn,
      qm.getAppsPending());
  Assert.assertEquals(appsRunning + appsRunningCarryOn,
      qm.getAppsRunning());
  Assert.assertEquals(appsCompleted + appsCompletedCarryOn,
      qm.getAppsCompleted());
}
@Test (timeout = 60000)