YARN-6678. Handle IllegalStateException in Async Scheduling mode of CapacityScheduler. Contributed by Tao Yang.

This commit is contained in:
Sunil G 2017-08-03 19:27:10 +05:30
parent 79df1e750e
commit f64cfeaf61
2 changed files with 160 additions and 0 deletions

View File

@ -426,6 +426,19 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
// accepted & confirmed, it will become RESERVED state // accepted & confirmed, it will become RESERVED state
if (schedulerContainer.getRmContainer().getState() if (schedulerContainer.getRmContainer().getState()
== RMContainerState.RESERVED) { == RMContainerState.RESERVED) {
// Check if node currently reserved by other application, there may
// be some outdated proposals in async-scheduling environment
if (schedulerContainer.getRmContainer() != schedulerContainer
.getSchedulerNode().getReservedContainer()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Try to re-reserve a container, but node "
+ schedulerContainer.getSchedulerNode()
+ " is already reserved by another container"
+ schedulerContainer.getSchedulerNode()
.getReservedContainer().getContainerId());
}
return false;
}
// Set reReservation == true // Set reReservation == true
reReservation = true; reReservation = true;
} else { } else {

View File

@ -20,7 +20,10 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
@ -41,20 +44,26 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnSched
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerAllocationProposal; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerAllocationProposal;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.SchedulerContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
public class TestCapacitySchedulerAsyncScheduling { public class TestCapacitySchedulerAsyncScheduling {
private final int GB = 1024; private final int GB = 1024;
@ -257,6 +266,144 @@ public class TestCapacitySchedulerAsyncScheduling {
rm.stop(); rm.stop();
} }
/**
 * Testcase for YARN-6678: committing an outdated "reserve" proposal must not
 * make {@code CapacityScheduler#tryCommit} throw.
 *
 * <p>Async scheduling is switched off and the interleaving is driven by hand
 * through a Mockito spy on the scheduler. The first commit request that
 * carries a reservation is held back while the cluster state is changed
 * underneath it (the reservation of app1 on nm1 is dropped and app2 reserves
 * nm1 instead); only then is the stale request passed to the real
 * {@code tryCommit}.
 *
 * @throws Exception if the mini cluster cannot be set up or driven
 */
@Test(timeout = 30000)
public void testCommitOutdatedReservedProposal() throws Exception {
  // disable async-scheduling so the complex scenario can be simulated by hand
  Configuration disableAsyncConf = new Configuration(conf);
  disableAsyncConf.setBoolean(
      CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, false);

  // init RM & NMs & Nodes
  final MockRM rm = new MockRM(disableAsyncConf);
  rm.start();
  final MockNM nm1 = rm.registerNode("h1:1234", 9 * GB);
  final MockNM nm2 = rm.registerNode("h2:2234", 9 * GB);

  // init scheduler nodes: poll (at most ~1s) until both nodes are tracked
  int waitTime = 1000;
  while (waitTime > 0 &&
      ((AbstractYarnScheduler) rm.getRMContext().getScheduler())
          .getNodeTracker().nodeCount() < 2) {
    waitTime -= 10;
    Thread.sleep(10);
  }
  Assert.assertEquals(2,
      ((AbstractYarnScheduler) rm.getRMContext().getScheduler())
          .getNodeTracker().nodeCount());

  YarnScheduler scheduler = rm.getRMContext().getScheduler();
  final SchedulerNode sn1 =
      ((CapacityScheduler) scheduler).getSchedulerNode(nm1.getNodeId());
  final SchedulerNode sn2 =
      ((CapacityScheduler) scheduler).getSchedulerNode(nm2.getNodeId());

  // submit app1, am1 is running on nm1
  RMApp app = rm.submitApp(200, "app", "user", null, "default");
  final MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1);
  // submit app2, am2 is running on nm1
  RMApp app2 = rm.submitApp(200, "app", "user", null, "default");
  final MockAM am2 = MockRM.launchAndRegisterAM(app2, rm, nm1);

  // allocate and launch 2 containers for app1
  allocateAndLaunchContainers(am, nm1, rm, 1,
      Resources.createResource(5 * GB), 0, 2);
  allocateAndLaunchContainers(am, nm2, rm, 1,
      Resources.createResource(5 * GB), 0, 3);

  // nm1 runs 3 containers(app1-container_01/AM, app1-container_02,
  //                       app2-container_01/AM)
  // nm2 runs 1 container(app1-container_03)
  Assert.assertEquals(3, sn1.getNumContainers());
  Assert.assertEquals(1, sn2.getNumContainers());

  // reserve 1 container(app1-container_04) for app1 on nm1
  ResourceRequest rr2 = ResourceRequest
      .newInstance(Priority.newInstance(0), "*",
          Resources.createResource(5 * GB), 1);
  am.allocate(Arrays.asList(rr2), null);
  nm1.nodeHeartbeat(true);
  // wait app1-container_04 reserved on nm1
  waitTime = 1000;
  while (waitTime > 0 && sn1.getReservedContainer() == null) {
    waitTime -= 10;
    Thread.sleep(10);
  }
  Assert.assertNotNull(sn1.getReservedContainer());

  final CapacityScheduler cs = (CapacityScheduler) scheduler;
  final CapacityScheduler spyCs = Mockito.spy(cs);
  // isFirstReserve: only the first reserve proposal is held back
  // isChecked: set once the stale proposal went through the real tryCommit
  final AtomicBoolean isFirstReserve = new AtomicBoolean(true);
  final AtomicBoolean isChecked = new AtomicBoolean(false);
  // handle CapacityScheduler#tryCommit,
  // reproduce the process that could raise IllegalStateException before
  Mockito.doAnswer(new Answer<Object>() {
    @Override
    public Object answer(InvocationOnMock invocation) throws Exception {
      ResourceCommitRequest request =
          (ResourceCommitRequest) invocation.getArguments()[1];
      if (request.getContainersToReserve().size() > 0 && isFirstReserve
          .compareAndSet(true, false)) {
        // release app1-container_03 on nm2
        RMContainer killableContainer =
            sn2.getCopiedListOfRunningContainers().get(0);
        cs.completedContainer(killableContainer, ContainerStatus
                .newInstance(killableContainer.getContainerId(),
                    ContainerState.COMPLETE, "",
                    ContainerExitStatus.KILLED_BY_RESOURCEMANAGER),
            RMContainerEventType.KILL);
        Assert.assertEquals(0, sn2.getCopiedListOfRunningContainers().size());
        // unreserve app1-container_04 on nm1
        // and allocate app1-container_05 on nm2
        cs.handle(new NodeUpdateSchedulerEvent(sn2.getRMNode()));
        // separate counter: the enclosing method's waitTime is not
        // accessible from this anonymous class
        int remainingWait = 1000;
        while (remainingWait > 0
            && sn2.getCopiedListOfRunningContainers().size() == 0) {
          remainingWait -= 10;
          Thread.sleep(10);
        }
        Assert.assertEquals(1, sn2.getCopiedListOfRunningContainers().size());
        Assert.assertNull(sn1.getReservedContainer());

        // reserve app2-container_02 on nm1
        ResourceRequest rr3 = ResourceRequest
            .newInstance(Priority.newInstance(0), "*",
                Resources.createResource(5 * GB), 1);
        am2.allocate(Arrays.asList(rr3), null);
        cs.handle(new NodeUpdateSchedulerEvent(sn1.getRMNode()));
        remainingWait = 1000;
        while (remainingWait > 0 && sn1.getReservedContainer() == null) {
          remainingWait -= 10;
          Thread.sleep(10);
        }
        Assert.assertNotNull(sn1.getReservedContainer());

        // call real apply with the now outdated proposal
        try {
          cs.tryCommit((Resource) invocation.getArguments()[0],
              (ResourceCommitRequest) invocation.getArguments()[1]);
        } catch (Exception e) {
          // fail the test, keeping the original exception as the cause so it
          // shows up in the test report instead of only on stderr
          throw new AssertionError(
              "tryCommit must not throw for an outdated reserved proposal",
              e);
        }
        isChecked.set(true);
      } else {
        cs.tryCommit((Resource) invocation.getArguments()[0],
            (ResourceCommitRequest) invocation.getArguments()[1]);
      }
      return null;
    }
  }).when(spyCs).tryCommit(Mockito.any(Resource.class),
      Mockito.any(ResourceCommitRequest.class));

  spyCs.handle(new NodeUpdateSchedulerEvent(sn1.getRMNode()));

  waitTime = 1000;
  while (waitTime > 0 && !isChecked.get()) {
    waitTime -= 10;
    Thread.sleep(10);
  }
  // without this assertion the test would pass even if the intercepted
  // commit path had never been exercised
  Assert.assertTrue(
      "The outdated reserved proposal was never committed",
      isChecked.get());
  rm.stop();
}
private void allocateAndLaunchContainers(MockAM am, MockNM nm, MockRM rm, private void allocateAndLaunchContainers(MockAM am, MockNM nm, MockRM rm,
int nContainer, Resource resource, int priority, int startContainerId) int nContainer, Resource resource, int priority, int startContainerId)