YARN-8545. Return allocated resource to RM for failed container.

Contributed by Chandni Singh

(cherry picked from commit 40fad32824)
This commit is contained in:
Eric Yang 2018-07-26 18:22:57 -04:00
parent 8e3807afe0
commit 177f6045ac
9 changed files with 135 additions and 43 deletions

View File

@ -687,7 +687,8 @@ public class ServiceScheduler extends CompositeService {
}
ComponentEvent event =
new ComponentEvent(instance.getCompName(), CONTAINER_COMPLETED)
.setStatus(status).setInstance(instance);
.setStatus(status).setInstance(instance)
.setContainerId(containerId);
dispatcher.getEventHandler().handle(event);
}
}

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.service.component;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.ExecutionType;
@ -518,10 +519,10 @@ public class Component implements EventHandler<ComponentEvent> {
private static class ContainerCompletedTransition extends BaseTransition {
@Override
public void transition(Component component, ComponentEvent event) {
Preconditions.checkNotNull(event.getContainerId());
component.updateMetrics(event.getStatus());
component.dispatcher.getEventHandler().handle(
new ComponentInstanceEvent(event.getStatus().getContainerId(), STOP)
new ComponentInstanceEvent(event.getContainerId(), STOP)
.setStatus(event.getStatus()));
ComponentRestartPolicy restartPolicy =
@ -784,6 +785,9 @@ public class Component implements EventHandler<ComponentEvent> {
}
private void updateMetrics(ContainerStatus status) {
//when a container preparation fails while building launch context, then
//the container status may not exist.
if (status != null) {
switch (status.getExitStatus()) {
case SUCCESS:
componentMetrics.containersSucceeded.incr();
@ -800,12 +804,14 @@ public class Component implements EventHandler<ComponentEvent> {
default:
break;
}
}
// containersFailed include preempted, disks_failed etc.
componentMetrics.containersFailed.incr();
scheduler.getServiceMetrics().containersFailed.incr();
if (Apps.shouldCountTowardsNodeBlacklisting(status.getExitStatus())) {
if (status != null && Apps.shouldCountTowardsNodeBlacklisting(
status.getExitStatus())) {
String host = scheduler.getLiveInstances().get(status.getContainerId())
.getNodeId().getHost();
failureTracker.incNodeFailure(host);

View File

@ -76,6 +76,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
Comparable<ComponentInstance> {
private static final Logger LOG =
LoggerFactory.getLogger(ComponentInstance.class);
private static final String FAILED_BEFORE_LAUNCH_DIAG =
"failed before launch";
private StateMachine<ComponentInstanceState, ComponentInstanceEventType,
ComponentInstanceEvent> stateMachine;
@ -236,7 +238,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
@VisibleForTesting
static void handleComponentInstanceRelaunch(
ComponentInstance compInstance, ComponentInstanceEvent event) {
ComponentInstance compInstance, ComponentInstanceEvent event,
boolean failureBeforeLaunch) {
Component comp = compInstance.getComponent();
// Do we need to relaunch the service?
@ -252,7 +255,9 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
+ ": {} completed. Reinsert back to pending list and requested " +
"a new container." + System.lineSeparator() +
" exitStatus={}, diagnostics={}.",
event.getContainerId(), event.getStatus().getExitStatus(),
event.getContainerId(), failureBeforeLaunch ? null :
event.getStatus().getExitStatus(),
failureBeforeLaunch ? FAILED_BEFORE_LAUNCH_DIAG :
event.getStatus().getDiagnostics());
} else {
// When no relaunch, update component's #succeeded/#failed
@ -292,8 +297,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
Component comp = compInstance.component;
String containerDiag =
compInstance.getCompInstanceId() + ": " + event.getStatus()
.getDiagnostics();
compInstance.getCompInstanceId() + ": " + (failedBeforeLaunching ?
FAILED_BEFORE_LAUNCH_DIAG : event.getStatus().getDiagnostics());
compInstance.diagnostics.append(containerDiag + System.lineSeparator());
compInstance.cancelContainerStatusRetriever();
if (compInstance.getState().equals(ComponentInstanceState.UPGRADING)) {
@ -307,6 +312,9 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
boolean shouldFailService = false;
final ServiceScheduler scheduler = comp.getScheduler();
scheduler.getAmRMClient().releaseAssignedContainer(
event.getContainerId());
// Check if it exceeds the failure threshold, but only if health threshold
// monitor is not enabled
if (!comp.isHealthThresholdMonitorEnabled()
@ -347,7 +355,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
// According to component restart policy, handle container restart
// or finish the service (if all components finished)
handleComponentInstanceRelaunch(compInstance, event);
handleComponentInstanceRelaunch(compInstance, event,
failedBeforeLaunching);
if (shouldFailService) {
scheduler.getTerminationHandler().terminate(-1);

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.service.component.instance;
import com.google.common.base.Preconditions;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.event.AbstractEvent;
@ -32,6 +33,7 @@ public class ComponentInstanceEvent
public ComponentInstanceEvent(ContainerId containerId,
ComponentInstanceEventType componentInstanceEventType) {
super(componentInstanceEventType);
Preconditions.checkNotNull(containerId);
this.id = containerId;
}

View File

@ -22,8 +22,11 @@ import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.service.ServiceContext;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.component.ComponentEvent;
import org.apache.hadoop.yarn.service.component.ComponentEventType;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.provider.ProviderService;
import org.apache.hadoop.yarn.service.provider.ProviderFactory;
@ -116,9 +119,12 @@ public class ContainerLaunchService extends AbstractService{
launcher.completeContainerLaunch(), true);
}
} catch (Exception e) {
LOG.error(instance.getCompInstanceId()
+ ": Failed to launch container. ", e);
LOG.error("{}: Failed to launch container.",
instance.getCompInstanceId(), e);
ComponentEvent event = new ComponentEvent(instance.getCompName(),
ComponentEventType.CONTAINER_COMPLETED)
.setInstance(instance).setContainerId(container.getId());
context.scheduler.getDispatcher().getEventHandler().handle(event);
}
}
}

View File

@ -68,6 +68,7 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeoutException;
@ -99,6 +100,8 @@ public class MockServiceAM extends ServiceMaster {
private Map<ContainerId, ContainerStatus> containerStatuses =
new ConcurrentHashMap<>();
private Set<ContainerId> releasedContainers = ConcurrentHashMap.newKeySet();
private Credentials amCreds;
public MockServiceAM(Service service) {
@ -223,6 +226,13 @@ public class MockServiceAM extends ServiceMaster {
return response;
}
@Override
public synchronized void releaseAssignedContainer(
ContainerId containerId) {
releasedContainers.add(containerId);
super.releaseAssignedContainer(containerId);
}
@Override public void unregisterApplicationMaster(
FinalApplicationStatus appStatus, String appMessage,
String appTrackingUrl) {
@ -288,7 +298,7 @@ public class MockServiceAM extends ServiceMaster {
}
/**
*
* Creates a mock container and container ID and feeds to the component.
* @param service The service for the component
* @param id The id for the container
* @param compName The component to which the container is fed
@ -297,6 +307,18 @@ public class MockServiceAM extends ServiceMaster {
public Container feedContainerToComp(Service service, int id,
String compName) {
ContainerId containerId = createContainerId(id);
return feedContainerToComp(service, containerId, compName);
}
/**
* Feeds the container to the component.
* @param service The service for the component
* @param containerId container id
* @param compName The component to which the container is fed
* @return
*/
public Container feedContainerToComp(Service service, ContainerId containerId,
String compName) {
Container container = createContainer(containerId, compName);
synchronized (feedContainers) {
feedContainers.add(container);
@ -423,4 +445,14 @@ public class MockServiceAM extends ServiceMaster {
}
return ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
}
/**
* Waits for the container to get released
* @param containerId ContainerId
*/
public void waitForContainerToRelease(ContainerId containerId)
throws TimeoutException, InterruptedException {
GenericTestUtils.waitFor(() -> releasedContainers.contains(containerId),
1000, 9990000);
}
}

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.service;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.io.FileUtils;
import org.apache.curator.test.TestingCluster;
@ -391,4 +392,38 @@ public class TestServiceAM extends ServiceTestUtils{
.equals("newer.host"), 2000, 200000);
am.stop();
}
// Test to verify that the containers are released and the
// component instance is added to the pending queue when building the launch
// context fails.
@Test(timeout = 9990000)
public void testContainersReleasedWhenPreLaunchFails()
throws Exception {
ApplicationId applicationId = ApplicationId.newInstance(
System.currentTimeMillis(), 1);
Service exampleApp = new Service();
exampleApp.setId(applicationId.toString());
exampleApp.setVersion("v1");
exampleApp.setName("testContainersReleasedWhenPreLaunchFails");
Component compA = createComponent("compa", 1, "pwd");
Artifact artifact = new Artifact();
artifact.setType(Artifact.TypeEnum.TARBALL);
compA.artifact(artifact);
exampleApp.addComponent(compA);
MockServiceAM am = new MockServiceAM(exampleApp);
am.init(conf);
am.start();
ContainerId containerId = am.createContainerId(1);
// allocate a container
am.feedContainerToComp(exampleApp, containerId, "compa");
am.waitForContainerToRelease(containerId);
Assert.assertEquals(1,
am.getComponent("compa").getPendingInstances().size());
am.stop();
}
}

View File

@ -196,7 +196,8 @@ public class TestComponent {
org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
"successful", 0);
comp.handle(new ComponentEvent(comp.getName(),
ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus));
ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus)
.setContainerId(instanceContainer.getId()));
componentInstance.handle(
new ComponentInstanceEvent(componentInstance.getContainer().getId(),
ComponentInstanceEventType.STOP).setStatus(containerStatus));

View File

@ -245,7 +245,7 @@ public class TestComponentInstance {
comp.getAllComponentInstances().iterator().next();
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
@ -262,7 +262,7 @@ public class TestComponentInstance {
componentInstance = comp.getAllComponentInstances().iterator().next();
containerStatus.setExitStatus(1);
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
verify(comp, times(1)).reInsertPendingInstance(
@ -286,7 +286,7 @@ public class TestComponentInstance {
when(comp.getNumSucceededInstances()).thenReturn(new Long(1));
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, times(1)).markAsSucceeded(any(ComponentInstance.class));
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
verify(comp, times(0)).reInsertPendingInstance(
@ -304,7 +304,7 @@ public class TestComponentInstance {
when(comp.getNumFailedInstances()).thenReturn(new Long(1));
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
@ -323,7 +323,7 @@ public class TestComponentInstance {
componentInstance = comp.getAllComponentInstances().iterator().next();
containerStatus.setExitStatus(1);
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
verify(comp, times(1)).reInsertPendingInstance(
@ -340,7 +340,7 @@ public class TestComponentInstance {
componentInstance = comp.getAllComponentInstances().iterator().next();
containerStatus.setExitStatus(1);
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
verify(comp, times(0)).reInsertPendingInstance(
@ -363,7 +363,7 @@ public class TestComponentInstance {
containerStatus.setExitStatus(1);
ComponentInstance commponentInstance = iter.next();
ComponentInstance.handleComponentInstanceRelaunch(commponentInstance,
componentInstanceEvent);
componentInstanceEvent, false);
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
@ -404,7 +404,7 @@ public class TestComponentInstance {
when(component2Instance.getComponent().getNumFailedInstances())
.thenReturn(new Long(failed2Instances.size()));
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
Map<String, ComponentInstance> failed1Instances = new HashMap<>();
@ -418,7 +418,7 @@ public class TestComponentInstance {
when(component1Instance.getComponent().getNumFailedInstances())
.thenReturn(new Long(failed1Instances.size()));
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
@ -458,7 +458,7 @@ public class TestComponentInstance {
when(component2Instance.getComponent().getNumSucceededInstances())
.thenReturn(new Long(succeeded2Instances.size()));
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
Map<String, ComponentInstance> succeeded1Instances = new HashMap<>();
@ -471,7 +471,7 @@ public class TestComponentInstance {
when(component1Instance.getComponent().getNumSucceededInstances())
.thenReturn(new Long(succeeded1Instances.size()));
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));
@ -500,7 +500,7 @@ public class TestComponentInstance {
for (ComponentInstance component2Instance : component2Instances) {
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
succeeded1Instances = new HashMap<>();
@ -511,7 +511,7 @@ public class TestComponentInstance {
when(component1Instance.getComponent().getSucceededInstances())
.thenReturn(succeeded1Instances.values());
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
componentInstanceEvent);
componentInstanceEvent, false);
}
verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));