YARN-6629. NPE occurred when container allocation proposal is applied but its resource requests are removed before. Contributed by Tao Yang.
This commit is contained in:
parent
754e5189ee
commit
2b2e2ac5f4
@ -2516,8 +2516,7 @@ public void tryCommit(Resource cluster, ResourceCommitRequest r) {
|
|||||||
// proposal might be outdated if AM failover just finished
|
// proposal might be outdated if AM failover just finished
|
||||||
// and proposal queue was not consumed in time
|
// and proposal queue was not consumed in time
|
||||||
if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
|
if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
|
||||||
if (app.accept(cluster, request)) {
|
if (app.accept(cluster, request) && app.apply(cluster, request)) {
|
||||||
app.apply(cluster, request);
|
|
||||||
LOG.info("Allocation proposal accepted");
|
LOG.info("Allocation proposal accepted");
|
||||||
} else{
|
} else{
|
||||||
LOG.info("Failed to accept allocation proposal");
|
LOG.info("Failed to accept allocation proposal");
|
||||||
|
@ -484,7 +484,7 @@ public boolean accept(Resource cluster,
|
|||||||
return accepted;
|
return accepted;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void apply(Resource cluster,
|
public boolean apply(Resource cluster,
|
||||||
ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request) {
|
ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request) {
|
||||||
boolean reReservation = false;
|
boolean reReservation = false;
|
||||||
|
|
||||||
@ -497,8 +497,15 @@ public void apply(Resource cluster,
|
|||||||
allocation = request.getFirstAllocatedOrReservedContainer();
|
allocation = request.getFirstAllocatedOrReservedContainer();
|
||||||
SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
|
SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
|
||||||
schedulerContainer = allocation.getAllocatedOrReservedContainer();
|
schedulerContainer = allocation.getAllocatedOrReservedContainer();
|
||||||
RMContainer rmContainer = schedulerContainer.getRmContainer();
|
|
||||||
|
|
||||||
|
// Required sanity check - AM can call 'allocate' to update resource
|
||||||
|
// request without locking the scheduler, hence we need to check
|
||||||
|
if (getOutstandingAsksCount(schedulerContainer.getSchedulerRequestKey())
|
||||||
|
<= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
RMContainer rmContainer = schedulerContainer.getRmContainer();
|
||||||
reReservation =
|
reReservation =
|
||||||
(!schedulerContainer.isAllocated()) && (rmContainer.getState()
|
(!schedulerContainer.isAllocated()) && (rmContainer.getState()
|
||||||
== RMContainerState.RESERVED);
|
== RMContainerState.RESERVED);
|
||||||
@ -578,6 +585,7 @@ public void apply(Resource cluster,
|
|||||||
if (!reReservation) {
|
if (!reReservation) {
|
||||||
getCSLeafQueue().apply(cluster, request);
|
getCSLeafQueue().apply(cluster, request);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean unreserve(SchedulerRequestKey schedulerKey,
|
public boolean unreserve(SchedulerRequestKey schedulerKey,
|
||||||
|
@ -111,6 +111,7 @@
|
|||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
|
||||||
@ -144,6 +145,8 @@
|
|||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
|
import org.mockito.invocation.InvocationOnMock;
|
||||||
|
import org.mockito.stubbing.Answer;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
@ -5010,4 +5013,53 @@ private void waitforNMRegistered(ResourceScheduler scheduler, int nodecount,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test (timeout = 30000)
|
||||||
|
public void testClearRequestsBeforeApplyTheProposal()
|
||||||
|
throws Exception {
|
||||||
|
// init RM & NMs & Nodes
|
||||||
|
final MockRM rm = new MockRM(new CapacitySchedulerConfiguration());
|
||||||
|
rm.start();
|
||||||
|
final MockNM nm = rm.registerNode("h1:1234", 200 * GB);
|
||||||
|
|
||||||
|
// submit app
|
||||||
|
final RMApp app = rm.submitApp(200, "app", "user");
|
||||||
|
MockRM.launchAndRegisterAM(app, rm, nm);
|
||||||
|
|
||||||
|
// spy capacity scheduler to handle CapacityScheduler#apply
|
||||||
|
final Priority priority = Priority.newInstance(1);
|
||||||
|
final CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
|
||||||
|
final CapacityScheduler spyCs = Mockito.spy(cs);
|
||||||
|
Mockito.doAnswer(new Answer<Object>() {
|
||||||
|
public Object answer(InvocationOnMock invocation) throws Exception {
|
||||||
|
// clear resource request before applying the proposal for container_2
|
||||||
|
spyCs.allocate(app.getCurrentAppAttempt().getAppAttemptId(),
|
||||||
|
Arrays.asList(ResourceRequest.newInstance(priority, "*",
|
||||||
|
Resources.createResource(1 * GB), 0)),
|
||||||
|
Collections.<ContainerId>emptyList(), null, null,
|
||||||
|
NULL_UPDATE_REQUESTS);
|
||||||
|
// trigger real apply which can raise NPE before YARN-6629
|
||||||
|
try {
|
||||||
|
FiCaSchedulerApp schedulerApp = cs.getApplicationAttempt(
|
||||||
|
app.getCurrentAppAttempt().getAppAttemptId());
|
||||||
|
schedulerApp.apply((Resource) invocation.getArguments()[0],
|
||||||
|
(ResourceCommitRequest) invocation.getArguments()[1]);
|
||||||
|
// the proposal of removed request should be rejected
|
||||||
|
Assert.assertEquals(1, schedulerApp.getLiveContainers().size());
|
||||||
|
} catch (Throwable e) {
|
||||||
|
Assert.fail();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}).when(spyCs).tryCommit(Mockito.any(Resource.class),
|
||||||
|
Mockito.any(ResourceCommitRequest.class));
|
||||||
|
|
||||||
|
// rm allocates container_2 to reproduce the process that can raise NPE
|
||||||
|
spyCs.allocate(app.getCurrentAppAttempt().getAppAttemptId(),
|
||||||
|
Arrays.asList(ResourceRequest.newInstance(priority, "*",
|
||||||
|
Resources.createResource(1 * GB), 1)),
|
||||||
|
Collections.<ContainerId>emptyList(), null, null, NULL_UPDATE_REQUESTS);
|
||||||
|
spyCs.handle(new NodeUpdateSchedulerEvent(
|
||||||
|
spyCs.getNode(nm.getNodeId()).getRMNode()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user