YARN-2674. Fix distributed shell AM container relaunch during RM work-preserving restart. Contributed by Shane Kumpf.

This commit is contained in:
Billie Rinaldi 2018-04-30 14:34:51 -07:00
parent d6139c5106
commit 4e1382aca4
2 changed files with 44 additions and 28 deletions

View File

@ -31,6 +31,7 @@ import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.PrivilegedExceptionAction; import java.security.PrivilegedExceptionAction;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -105,6 +106,7 @@ import org.apache.hadoop.yarn.api.records.timeline.TimelineEntityGroupId;
import org.apache.hadoop.yarn.api.records.timeline.TimelineEvent; import org.apache.hadoop.yarn.api.records.timeline.TimelineEvent;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse; import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse;
import org.apache.hadoop.yarn.api.resource.PlacementConstraint; import org.apache.hadoop.yarn.api.resource.PlacementConstraint;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest; import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.TimelineClient; import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.client.api.TimelineV2Client; import org.apache.hadoop.yarn.client.api.TimelineV2Client;
@ -1060,25 +1062,31 @@ public class ApplicationMaster {
public void onContainersAllocated(List<Container> allocatedContainers) { public void onContainersAllocated(List<Container> allocatedContainers) {
LOG.info("Got response from RM for container ask, allocatedCnt=" LOG.info("Got response from RM for container ask, allocatedCnt="
+ allocatedContainers.size()); + allocatedContainers.size());
numAllocatedContainers.addAndGet(allocatedContainers.size());
for (Container allocatedContainer : allocatedContainers) { for (Container allocatedContainer : allocatedContainers) {
if (numAllocatedContainers.get() == numTotalContainers) {
LOG.info("The requested number of containers have been allocated."
+ " Releasing the extra container allocation from the RM.");
amRMClient.releaseAssignedContainer(allocatedContainer.getId());
} else {
numAllocatedContainers.addAndGet(1);
String yarnShellId = Integer.toString(yarnShellIdCounter); String yarnShellId = Integer.toString(yarnShellIdCounter);
yarnShellIdCounter++; yarnShellIdCounter++;
LOG.info("Launching shell command on a new container." LOG.info(
"Launching shell command on a new container."
+ ", containerId=" + allocatedContainer.getId() + ", containerId=" + allocatedContainer.getId()
+ ", yarnShellId=" + yarnShellId + ", yarnShellId=" + yarnShellId
+ ", containerNode=" + allocatedContainer.getNodeId().getHost() + ", containerNode="
+ allocatedContainer.getNodeId().getHost()
+ ":" + allocatedContainer.getNodeId().getPort() + ":" + allocatedContainer.getNodeId().getPort()
+ ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerNodeURI="
+ allocatedContainer.getNodeHttpAddress()
+ ", containerResourceMemory" + ", containerResourceMemory"
+ allocatedContainer.getResource().getMemorySize() + allocatedContainer.getResource().getMemorySize()
+ ", containerResourceVirtualCores" + ", containerResourceVirtualCores"
+ allocatedContainer.getResource().getVirtualCores()); + allocatedContainer.getResource().getVirtualCores());
// + ", containerToken"
// +allocatedContainer.getContainerToken().getIdentifier().toString());
Thread launchThread = createLaunchContainerThread(allocatedContainer, Thread launchThread =
yarnShellId); createLaunchContainerThread(allocatedContainer, yarnShellId);
// launch and start the container on a separate thread to keep // launch and start the container on a separate thread to keep
// the main thread unblocked // the main thread unblocked
@ -1086,6 +1094,16 @@ public class ApplicationMaster {
launchThreads.add(launchThread); launchThreads.add(launchThread);
launchedContainers.add(allocatedContainer.getId()); launchedContainers.add(allocatedContainer.getId());
launchThread.start(); launchThread.start();
// Remove the corresponding request
Collection<AMRMClient.ContainerRequest> requests =
amRMClient.getMatchingRequests(
allocatedContainer.getAllocationRequestId());
if (requests.iterator().hasNext()) {
AMRMClient.ContainerRequest request = requests.iterator().next();
amRMClient.removeContainerRequest(request);
}
}
} }
} }

View File

@ -106,7 +106,6 @@ public class TestDSAppMaster {
handler.onContainersAllocated(containers); handler.onContainersAllocated(containers);
Assert.assertEquals("Wrong container allocation count", 1, Assert.assertEquals("Wrong container allocation count", 1,
master.getAllocatedContainers()); master.getAllocatedContainers());
Mockito.verifyZeroInteractions(mockClient);
Assert.assertEquals("Incorrect number of threads launched", 1, Assert.assertEquals("Incorrect number of threads launched", 1,
master.threadsLaunched); master.threadsLaunched);
Assert.assertEquals("Incorrect YARN Shell IDs", Assert.assertEquals("Incorrect YARN Shell IDs",
@ -121,15 +120,14 @@ public class TestDSAppMaster {
ContainerId id4 = BuilderUtils.newContainerId(1, 1, 1, 4); ContainerId id4 = BuilderUtils.newContainerId(1, 1, 1, 4);
containers.add(generateContainer(id4)); containers.add(generateContainer(id4));
handler.onContainersAllocated(containers); handler.onContainersAllocated(containers);
Assert.assertEquals("Wrong final container allocation count", 4, Assert.assertEquals("Wrong final container allocation count", 2,
master.getAllocatedContainers()); master.getAllocatedContainers());
Assert.assertEquals("Incorrect number of threads launched", 4, Assert.assertEquals("Incorrect number of threads launched", 2,
master.threadsLaunched); master.threadsLaunched);
Assert.assertEquals("Incorrect YARN Shell IDs", Assert.assertEquals("Incorrect YARN Shell IDs",
Arrays.asList("1", "2", "3", "4"), master.yarnShellIds); Arrays.asList("1", "2"), master.yarnShellIds);
// make sure we handle completion events correctly // make sure we handle completion events correctly
List<ContainerStatus> status = new ArrayList<>(); List<ContainerStatus> status = new ArrayList<>();
status.add(generateContainerStatus(id1, ContainerExitStatus.SUCCESS)); status.add(generateContainerStatus(id1, ContainerExitStatus.SUCCESS));