YARN-6272. TestAMRMClient#testAMRMClientWithContainerResourceChange fails intermittently. Contributed by Andras Gyory & Prabhu Joseph

This commit is contained in:
Szilard Nemeth 2021-07-28 17:02:15 +02:00
parent 1b9efe58c9
commit f2b6c03fc1
1 changed file with 33 additions and 22 deletions

View File

@ -956,7 +956,6 @@ public class TestAMRMClient extends BaseAMRMClientTest{
return containers; return containers;
} }
private void doContainerResourceChange( private void doContainerResourceChange(
final AMRMClient<ContainerRequest> amClient, List<Container> containers) final AMRMClient<ContainerRequest> amClient, List<Container> containers)
throws YarnException, IOException { throws YarnException, IOException {
@ -986,38 +985,50 @@ public class TestAMRMClient extends BaseAMRMClientTest{
Resource.newInstance(512, 1), null)); Resource.newInstance(512, 1), null));
assertEquals(Resource.newInstance(512, 1), assertEquals(Resource.newInstance(512, 1),
amClientImpl.change.get(container1.getId()).getValue().getCapability()); amClientImpl.change.get(container1.getId()).getValue().getCapability());
// request resource increase for container2
amClientImpl.requestContainerUpdate(container2,
UpdateContainerRequest.newInstance(container2.getVersion(),
container2.getId(), ContainerUpdateType.INCREASE_RESOURCE,
Resource.newInstance(2048, 1), null));
assertEquals(Resource.newInstance(2048, 1),
amClientImpl.change.get(container2.getId()).getValue().getCapability());
// verify release request will cancel pending change requests for the same // verify release request will cancel pending change requests for the same
// container // container
amClientImpl.requestContainerUpdate(container3, amClientImpl.requestContainerUpdate(container3,
UpdateContainerRequest.newInstance(container3.getVersion(), UpdateContainerRequest.newInstance(container3.getVersion(),
container3.getId(), ContainerUpdateType.INCREASE_RESOURCE, container3.getId(), ContainerUpdateType.INCREASE_RESOURCE,
Resource.newInstance(2048, 1), null)); Resource.newInstance(2048, 1), null));
assertEquals(3, amClientImpl.pendingChange.size());
amClientImpl.releaseAssignedContainer(container3.getId());
assertEquals(2, amClientImpl.pendingChange.size()); assertEquals(2, amClientImpl.pendingChange.size());
amClientImpl.releaseAssignedContainer(container3.getId());
assertEquals(1, amClientImpl.pendingChange.size());
// as of now: container1 asks to decrease to (512, 1) // as of now: container1 asks to decrease to (512, 1)
// container2 asks to increase to (2048, 1) // container2 asks to increase to (2048, 1)
// send allocation requests // send allocation requests
AllocateResponse allocResponse = amClient.allocate(0.1f);
assertEquals(0, amClientImpl.change.size());
// we should get decrease confirmation right away
List<UpdatedContainer> updatedContainers =
allocResponse.getUpdatedContainers();
assertEquals(1, updatedContainers.size());
// we should get increase allocation after the next NM's heartbeat to RM // we should get increase allocation after the next NM's heartbeat to RM
triggerSchedulingWithNMHeartBeat(); assertUpdatedContainers(amClient, container1);
// get allocations // request resource increase for container2
allocResponse = amClient.allocate(0.1f); amClientImpl.requestContainerUpdate(container2,
updatedContainers = UpdateContainerRequest.newInstance(container2.getVersion(),
allocResponse.getUpdatedContainers(); container2.getId(), ContainerUpdateType.INCREASE_RESOURCE,
assertEquals(1, updatedContainers.size()); Resource.newInstance(2048, 1), null));
assertEquals(Resource.newInstance(2048, 1),
amClientImpl.change.get(container2.getId()).getValue().getCapability());
assertUpdatedContainers(amClient, container2);
}
private void assertUpdatedContainers(AMRMClient<ContainerRequest> amClient, Container container) {
RMContext context = yarnCluster.getResourceManager().getRMContext();
RMNode rmNode = context.getRMNodes().get(container.getNodeId());
List<UpdatedContainer> updateResponse = new ArrayList<>();
int allocationAttempts = 0;
while (allocationAttempts < 1000) {
context.getScheduler().handle(new NodeUpdateSchedulerEvent(rmNode));
try {
updateResponse = amClient.allocate(0.1f).getUpdatedContainers();
if (updateResponse.size() == 1) {
break;
} else {
allocationAttempts++;
sleep(20);
}
} catch (Exception ignored) {
}
}
Assert.assertEquals("Container resource change update failed", 1, updateResponse.size());
} }
@Test @Test