YARN-2244. FairScheduler missing handling of containers for unknown application attempts. (Anubhav Dhoot via kasha)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1611841 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5b0492f7c5
commit
fcb0fe0548
|
@ -53,6 +53,9 @@ Release 2.6.0 - UNRELEASED
|
||||||
after RM recovery but before scheduler learns about apps and app-attempts.
|
after RM recovery but before scheduler learns about apps and app-attempts.
|
||||||
(Jian He via vinodkv)
|
(Jian He via vinodkv)
|
||||||
|
|
||||||
|
YARN-2244. FairScheduler missing handling of containers for unknown
|
||||||
|
application attempts. (Anubhav Dhoot via kasha)
|
||||||
|
|
||||||
Release 2.5.0 - UNRELEASED
|
Release 2.5.0 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -123,6 +123,23 @@ public abstract class AbstractYarnScheduler
|
||||||
return maximumAllocation;
|
return maximumAllocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void containerLaunchedOnNode(ContainerId containerId,
|
||||||
|
SchedulerNode node) {
|
||||||
|
// Get the application for the finished container
|
||||||
|
SchedulerApplicationAttempt application = getCurrentAttemptForContainer
|
||||||
|
(containerId);
|
||||||
|
if (application == null) {
|
||||||
|
LOG.info("Unknown application "
|
||||||
|
+ containerId.getApplicationAttemptId().getApplicationId()
|
||||||
|
+ " launched container " + containerId + " on node: " + node);
|
||||||
|
this.rmContext.getDispatcher().getEventHandler()
|
||||||
|
.handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
application.containerLaunchedOnNode(containerId, node.getNodeID());
|
||||||
|
}
|
||||||
|
|
||||||
public T getApplicationAttempt(ApplicationAttemptId applicationAttemptId) {
|
public T getApplicationAttempt(ApplicationAttemptId applicationAttemptId) {
|
||||||
SchedulerApplication<T> app =
|
SchedulerApplication<T> app =
|
||||||
applications.get(applicationAttemptId.getApplicationId());
|
applications.get(applicationAttemptId.getApplicationId());
|
||||||
|
|
|
@ -69,7 +69,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
|
||||||
|
@ -866,21 +865,6 @@ public class CapacityScheduler extends
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void containerLaunchedOnNode(ContainerId containerId, FiCaSchedulerNode node) {
|
|
||||||
// Get the application for the finished container
|
|
||||||
FiCaSchedulerApp application = getCurrentAttemptForContainer(containerId);
|
|
||||||
if (application == null) {
|
|
||||||
LOG.info("Unknown application "
|
|
||||||
+ containerId.getApplicationAttemptId().getApplicationId()
|
|
||||||
+ " launched container " + containerId + " on node: " + node);
|
|
||||||
this.rmContext.getDispatcher().getEventHandler()
|
|
||||||
.handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
application.containerLaunchedOnNode(containerId, node.getNodeID());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void handle(SchedulerEvent event) {
|
public void handle(SchedulerEvent event) {
|
||||||
switch(event.getType()) {
|
switch(event.getType()) {
|
||||||
|
|
|
@ -928,22 +928,6 @@ public class FairScheduler extends
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Process a container which has launched on a node, as reported by the node.
|
|
||||||
*/
|
|
||||||
private void containerLaunchedOnNode(ContainerId containerId, FSSchedulerNode node) {
|
|
||||||
// Get the application for the finished container
|
|
||||||
FSSchedulerApp application = getCurrentAttemptForContainer(containerId);
|
|
||||||
if (application == null) {
|
|
||||||
LOG.info("Unknown application "
|
|
||||||
+ containerId.getApplicationAttemptId().getApplicationId()
|
|
||||||
+ " launched container " + containerId + " on node: " + node);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
application.containerLaunchedOnNode(containerId, node.getNodeID());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process a heartbeat update from a node.
|
* Process a heartbeat update from a node.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -66,7 +66,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
|
||||||
|
@ -831,23 +830,6 @@ public class FifoScheduler extends
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void containerLaunchedOnNode(ContainerId containerId, FiCaSchedulerNode node) {
|
|
||||||
// Get the application for the finished container
|
|
||||||
FiCaSchedulerApp application = getCurrentAttemptForContainer(containerId);
|
|
||||||
if (application == null) {
|
|
||||||
LOG.info("Unknown application "
|
|
||||||
+ containerId.getApplicationAttemptId().getApplicationId()
|
|
||||||
+ " launched container " + containerId + " on node: " + node);
|
|
||||||
// Some unknown container sneaked into the system. Kill it.
|
|
||||||
this.rmContext.getDispatcher().getEventHandler()
|
|
||||||
.handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
application.containerLaunchedOnNode(containerId, node.getNodeID());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Lock(FifoScheduler.class)
|
@Lock(FifoScheduler.class)
|
||||||
private synchronized void containerCompleted(RMContainer rmContainer,
|
private synchronized void containerCompleted(RMContainer rmContainer,
|
||||||
ContainerStatus containerStatus, RMContainerEventType event) {
|
ContainerStatus containerStatus, RMContainerEventType event) {
|
||||||
|
|
|
@ -232,20 +232,7 @@ public class TestApplicationCleanup {
|
||||||
containerStatuses.put(app.getApplicationId(), containerStatusList);
|
containerStatuses.put(app.getApplicationId(), containerStatusList);
|
||||||
|
|
||||||
NodeHeartbeatResponse resp = nm1.nodeHeartbeat(containerStatuses, true);
|
NodeHeartbeatResponse resp = nm1.nodeHeartbeat(containerStatuses, true);
|
||||||
dispatcher.await();
|
waitForContainerCleanup(dispatcher, nm1, resp);
|
||||||
List<ContainerId> contsToClean = resp.getContainersToCleanup();
|
|
||||||
int cleanedConts = contsToClean.size();
|
|
||||||
waitCount = 0;
|
|
||||||
while (cleanedConts < 1 && waitCount++ < 200) {
|
|
||||||
LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
|
|
||||||
Thread.sleep(100);
|
|
||||||
resp = nm1.nodeHeartbeat(true);
|
|
||||||
dispatcher.await();
|
|
||||||
contsToClean = resp.getContainersToCleanup();
|
|
||||||
cleanedConts += contsToClean.size();
|
|
||||||
}
|
|
||||||
LOG.info("Got cleanup for " + contsToClean.get(0));
|
|
||||||
Assert.assertEquals(1, cleanedConts);
|
|
||||||
|
|
||||||
// Now to test the case when RM already gave cleanup, and NM suddenly
|
// Now to test the case when RM already gave cleanup, and NM suddenly
|
||||||
// realizes that the container is running.
|
// realizes that the container is running.
|
||||||
|
@ -258,26 +245,36 @@ public class TestApplicationCleanup {
|
||||||
containerStatuses.put(app.getApplicationId(), containerStatusList);
|
containerStatuses.put(app.getApplicationId(), containerStatusList);
|
||||||
|
|
||||||
resp = nm1.nodeHeartbeat(containerStatuses, true);
|
resp = nm1.nodeHeartbeat(containerStatuses, true);
|
||||||
dispatcher.await();
|
|
||||||
contsToClean = resp.getContainersToCleanup();
|
|
||||||
cleanedConts = contsToClean.size();
|
|
||||||
// The cleanup list won't be instantaneous as it is given out by scheduler
|
// The cleanup list won't be instantaneous as it is given out by scheduler
|
||||||
// and not RMNodeImpl.
|
// and not RMNodeImpl.
|
||||||
waitCount = 0;
|
waitForContainerCleanup(dispatcher, nm1, resp);
|
||||||
while (cleanedConts < 1 && waitCount++ < 200) {
|
|
||||||
LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
|
|
||||||
Thread.sleep(100);
|
|
||||||
resp = nm1.nodeHeartbeat(true);
|
|
||||||
dispatcher.await();
|
|
||||||
contsToClean = resp.getContainersToCleanup();
|
|
||||||
cleanedConts += contsToClean.size();
|
|
||||||
}
|
|
||||||
LOG.info("Got cleanup for " + contsToClean.get(0));
|
|
||||||
Assert.assertEquals(1, cleanedConts);
|
|
||||||
|
|
||||||
rm.stop();
|
rm.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void waitForContainerCleanup(DrainDispatcher dispatcher, MockNM nm,
|
||||||
|
NodeHeartbeatResponse resp) throws Exception {
|
||||||
|
int waitCount = 0, cleanedConts = 0;
|
||||||
|
List<ContainerId> contsToClean;
|
||||||
|
do {
|
||||||
|
dispatcher.await();
|
||||||
|
contsToClean = resp.getContainersToCleanup();
|
||||||
|
cleanedConts += contsToClean.size();
|
||||||
|
if (cleanedConts >= 1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Thread.sleep(100);
|
||||||
|
resp = nm.nodeHeartbeat(true);
|
||||||
|
} while(waitCount++ < 200);
|
||||||
|
|
||||||
|
if (contsToClean.isEmpty()) {
|
||||||
|
LOG.error("Failed to get any containers to cleanup");
|
||||||
|
} else {
|
||||||
|
LOG.info("Got cleanup for " + contsToClean.get(0));
|
||||||
|
}
|
||||||
|
Assert.assertEquals(1, cleanedConts);
|
||||||
|
}
|
||||||
|
|
||||||
private void waitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
|
private void waitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -400,6 +397,58 @@ public class TestApplicationCleanup {
|
||||||
rm2.stop();
|
rm2.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("resource")
|
||||||
|
@Test (timeout = 60000)
|
||||||
|
public void testContainerCleanupWhenRMRestartedAppNotRegistered() throws
|
||||||
|
Exception {
|
||||||
|
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
|
||||||
|
MemoryRMStateStore memStore = new MemoryRMStateStore();
|
||||||
|
memStore.init(conf);
|
||||||
|
|
||||||
|
// start RM
|
||||||
|
final DrainDispatcher dispatcher = new DrainDispatcher();
|
||||||
|
MockRM rm1 = new MockRM(conf, memStore) {
|
||||||
|
@Override
|
||||||
|
protected Dispatcher createDispatcher() {
|
||||||
|
return dispatcher;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
rm1.start();
|
||||||
|
MockNM nm1 =
|
||||||
|
new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
|
||||||
|
nm1.registerNode();
|
||||||
|
|
||||||
|
// create app and launch the AM
|
||||||
|
RMApp app0 = rm1.submitApp(200);
|
||||||
|
MockAM am0 = launchAM(app0, rm1, nm1);
|
||||||
|
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
|
||||||
|
rm1.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
|
||||||
|
|
||||||
|
// start new RM
|
||||||
|
final DrainDispatcher dispatcher2 = new DrainDispatcher();
|
||||||
|
MockRM rm2 = new MockRM(conf, memStore) {
|
||||||
|
@Override
|
||||||
|
protected Dispatcher createDispatcher() {
|
||||||
|
return dispatcher2;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
rm2.start();
|
||||||
|
|
||||||
|
// nm1 register to rm2, and do a heartbeat
|
||||||
|
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
|
||||||
|
nm1.registerNode(Arrays.asList(app0.getApplicationId()));
|
||||||
|
rm2.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
|
||||||
|
|
||||||
|
// Add unknown container for application unknown to scheduler
|
||||||
|
NodeHeartbeatResponse response = nm1.nodeHeartbeat(am0
|
||||||
|
.getApplicationAttemptId(), 2, ContainerState.RUNNING);
|
||||||
|
|
||||||
|
waitForContainerCleanup(dispatcher2, nm1, response);
|
||||||
|
|
||||||
|
rm1.stop();
|
||||||
|
rm2.stop();
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
TestApplicationCleanup t = new TestApplicationCleanup();
|
TestApplicationCleanup t = new TestApplicationCleanup();
|
||||||
t.testAppCleanup();
|
t.testAppCleanup();
|
||||||
|
|
Loading…
Reference in New Issue