YARN-8545. Return allocated resource to RM for failed container.
Contributed by Chandni Singh
This commit is contained in:
parent
d70d845705
commit
40fad32824
|
@ -687,7 +687,8 @@ public class ServiceScheduler extends CompositeService {
|
|||
}
|
||||
ComponentEvent event =
|
||||
new ComponentEvent(instance.getCompName(), CONTAINER_COMPLETED)
|
||||
.setStatus(status).setInstance(instance);
|
||||
.setStatus(status).setInstance(instance)
|
||||
.setContainerId(containerId);
|
||||
dispatcher.getEventHandler().handle(event);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
package org.apache.hadoop.yarn.service.component;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import org.apache.hadoop.yarn.api.records.Container;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||
import org.apache.hadoop.yarn.api.records.ExecutionType;
|
||||
|
@ -518,10 +519,10 @@ public class Component implements EventHandler<ComponentEvent> {
|
|||
private static class ContainerCompletedTransition extends BaseTransition {
|
||||
@Override
|
||||
public void transition(Component component, ComponentEvent event) {
|
||||
|
||||
Preconditions.checkNotNull(event.getContainerId());
|
||||
component.updateMetrics(event.getStatus());
|
||||
component.dispatcher.getEventHandler().handle(
|
||||
new ComponentInstanceEvent(event.getStatus().getContainerId(), STOP)
|
||||
new ComponentInstanceEvent(event.getContainerId(), STOP)
|
||||
.setStatus(event.getStatus()));
|
||||
|
||||
ComponentRestartPolicy restartPolicy =
|
||||
|
@ -784,28 +785,33 @@ public class Component implements EventHandler<ComponentEvent> {
|
|||
}
|
||||
|
||||
private void updateMetrics(ContainerStatus status) {
|
||||
switch (status.getExitStatus()) {
|
||||
case SUCCESS:
|
||||
componentMetrics.containersSucceeded.incr();
|
||||
scheduler.getServiceMetrics().containersSucceeded.incr();
|
||||
return;
|
||||
case PREEMPTED:
|
||||
componentMetrics.containersPreempted.incr();
|
||||
scheduler.getServiceMetrics().containersPreempted.incr();
|
||||
break;
|
||||
case DISKS_FAILED:
|
||||
componentMetrics.containersDiskFailure.incr();
|
||||
scheduler.getServiceMetrics().containersDiskFailure.incr();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
//when a container preparation fails while building launch context, then
|
||||
//the container status may not exist.
|
||||
if (status != null) {
|
||||
switch (status.getExitStatus()) {
|
||||
case SUCCESS:
|
||||
componentMetrics.containersSucceeded.incr();
|
||||
scheduler.getServiceMetrics().containersSucceeded.incr();
|
||||
return;
|
||||
case PREEMPTED:
|
||||
componentMetrics.containersPreempted.incr();
|
||||
scheduler.getServiceMetrics().containersPreempted.incr();
|
||||
break;
|
||||
case DISKS_FAILED:
|
||||
componentMetrics.containersDiskFailure.incr();
|
||||
scheduler.getServiceMetrics().containersDiskFailure.incr();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// containersFailed include preempted, disks_failed etc.
|
||||
componentMetrics.containersFailed.incr();
|
||||
scheduler.getServiceMetrics().containersFailed.incr();
|
||||
|
||||
if (Apps.shouldCountTowardsNodeBlacklisting(status.getExitStatus())) {
|
||||
if (status != null && Apps.shouldCountTowardsNodeBlacklisting(
|
||||
status.getExitStatus())) {
|
||||
String host = scheduler.getLiveInstances().get(status.getContainerId())
|
||||
.getNodeId().getHost();
|
||||
failureTracker.incNodeFailure(host);
|
||||
|
|
|
@ -76,6 +76,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
Comparable<ComponentInstance> {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(ComponentInstance.class);
|
||||
private static final String FAILED_BEFORE_LAUNCH_DIAG =
|
||||
"failed before launch";
|
||||
|
||||
private StateMachine<ComponentInstanceState, ComponentInstanceEventType,
|
||||
ComponentInstanceEvent> stateMachine;
|
||||
|
@ -241,7 +243,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
|
||||
@VisibleForTesting
|
||||
static void handleComponentInstanceRelaunch(
|
||||
ComponentInstance compInstance, ComponentInstanceEvent event) {
|
||||
ComponentInstance compInstance, ComponentInstanceEvent event,
|
||||
boolean failureBeforeLaunch) {
|
||||
Component comp = compInstance.getComponent();
|
||||
|
||||
// Do we need to relaunch the service?
|
||||
|
@ -257,8 +260,10 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
+ ": {} completed. Reinsert back to pending list and requested " +
|
||||
"a new container." + System.lineSeparator() +
|
||||
" exitStatus={}, diagnostics={}.",
|
||||
event.getContainerId(), event.getStatus().getExitStatus(),
|
||||
event.getStatus().getDiagnostics());
|
||||
event.getContainerId(), failureBeforeLaunch ? null :
|
||||
event.getStatus().getExitStatus(),
|
||||
failureBeforeLaunch ? FAILED_BEFORE_LAUNCH_DIAG :
|
||||
event.getStatus().getDiagnostics());
|
||||
} else {
|
||||
// When no relaunch, update component's #succeeded/#failed
|
||||
// instances.
|
||||
|
@ -297,8 +302,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
|
||||
Component comp = compInstance.component;
|
||||
String containerDiag =
|
||||
compInstance.getCompInstanceId() + ": " + event.getStatus()
|
||||
.getDiagnostics();
|
||||
compInstance.getCompInstanceId() + ": " + (failedBeforeLaunching ?
|
||||
FAILED_BEFORE_LAUNCH_DIAG : event.getStatus().getDiagnostics());
|
||||
compInstance.diagnostics.append(containerDiag + System.lineSeparator());
|
||||
compInstance.cancelContainerStatusRetriever();
|
||||
if (compInstance.getState().equals(ComponentInstanceState.UPGRADING)) {
|
||||
|
@ -312,6 +317,9 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
boolean shouldFailService = false;
|
||||
|
||||
final ServiceScheduler scheduler = comp.getScheduler();
|
||||
scheduler.getAmRMClient().releaseAssignedContainer(
|
||||
event.getContainerId());
|
||||
|
||||
// Check if it exceeds the failure threshold, but only if health threshold
|
||||
// monitor is not enabled
|
||||
if (!comp.isHealthThresholdMonitorEnabled()
|
||||
|
@ -352,7 +360,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
|
|||
|
||||
// According to component restart policy, handle container restart
|
||||
// or finish the service (if all components finished)
|
||||
handleComponentInstanceRelaunch(compInstance, event);
|
||||
handleComponentInstanceRelaunch(compInstance, event,
|
||||
failedBeforeLaunching);
|
||||
|
||||
if (shouldFailService) {
|
||||
scheduler.getTerminationHandler().terminate(-1);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.service.component.instance;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||
import org.apache.hadoop.yarn.event.AbstractEvent;
|
||||
|
@ -32,6 +33,7 @@ public class ComponentInstanceEvent
|
|||
public ComponentInstanceEvent(ContainerId containerId,
|
||||
ComponentInstanceEventType componentInstanceEventType) {
|
||||
super(componentInstanceEventType);
|
||||
Preconditions.checkNotNull(containerId);
|
||||
this.id = containerId;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,8 +22,11 @@ import com.google.common.base.Preconditions;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.service.AbstractService;
|
||||
import org.apache.hadoop.yarn.api.records.Container;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||
import org.apache.hadoop.yarn.service.ServiceContext;
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
import org.apache.hadoop.yarn.service.component.ComponentEvent;
|
||||
import org.apache.hadoop.yarn.service.component.ComponentEventType;
|
||||
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
|
||||
import org.apache.hadoop.yarn.service.provider.ProviderService;
|
||||
import org.apache.hadoop.yarn.service.provider.ProviderFactory;
|
||||
|
@ -116,9 +119,12 @@ public class ContainerLaunchService extends AbstractService{
|
|||
launcher.completeContainerLaunch(), true);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.error(instance.getCompInstanceId()
|
||||
+ ": Failed to launch container. ", e);
|
||||
|
||||
LOG.error("{}: Failed to launch container.",
|
||||
instance.getCompInstanceId(), e);
|
||||
ComponentEvent event = new ComponentEvent(instance.getCompName(),
|
||||
ComponentEventType.CONTAINER_COMPLETED)
|
||||
.setInstance(instance).setContainerId(container.getId());
|
||||
context.scheduler.getDispatcher().getEventHandler().handle(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -68,6 +68,7 @@ import java.util.Iterator;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
|
@ -99,6 +100,8 @@ public class MockServiceAM extends ServiceMaster {
|
|||
private Map<ContainerId, ContainerStatus> containerStatuses =
|
||||
new ConcurrentHashMap<>();
|
||||
|
||||
private Set<ContainerId> releasedContainers = ConcurrentHashMap.newKeySet();
|
||||
|
||||
private Credentials amCreds;
|
||||
|
||||
public MockServiceAM(Service service) {
|
||||
|
@ -223,6 +226,13 @@ public class MockServiceAM extends ServiceMaster {
|
|||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void releaseAssignedContainer(
|
||||
ContainerId containerId) {
|
||||
releasedContainers.add(containerId);
|
||||
super.releaseAssignedContainer(containerId);
|
||||
}
|
||||
|
||||
@Override public void unregisterApplicationMaster(
|
||||
FinalApplicationStatus appStatus, String appMessage,
|
||||
String appTrackingUrl) {
|
||||
|
@ -288,7 +298,7 @@ public class MockServiceAM extends ServiceMaster {
|
|||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Creates a mock container and container ID and feeds to the component.
|
||||
* @param service The service for the component
|
||||
* @param id The id for the container
|
||||
* @param compName The component to which the container is fed
|
||||
|
@ -297,6 +307,18 @@ public class MockServiceAM extends ServiceMaster {
|
|||
public Container feedContainerToComp(Service service, int id,
|
||||
String compName) {
|
||||
ContainerId containerId = createContainerId(id);
|
||||
return feedContainerToComp(service, containerId, compName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Feeds the container to the component.
|
||||
* @param service The service for the component
|
||||
* @param containerId container id
|
||||
* @param compName The component to which the container is fed
|
||||
* @return
|
||||
*/
|
||||
public Container feedContainerToComp(Service service, ContainerId containerId,
|
||||
String compName) {
|
||||
Container container = createContainer(containerId, compName);
|
||||
synchronized (feedContainers) {
|
||||
feedContainers.add(container);
|
||||
|
@ -423,4 +445,14 @@ public class MockServiceAM extends ServiceMaster {
|
|||
}
|
||||
return ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Waits for the container to get released
|
||||
* @param containerId ContainerId
|
||||
*/
|
||||
public void waitForContainerToRelease(ContainerId containerId)
|
||||
throws TimeoutException, InterruptedException {
|
||||
GenericTestUtils.waitFor(() -> releasedContainers.contains(containerId),
|
||||
1000, 9990000);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.service;
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.curator.test.TestingCluster;
|
||||
|
@ -391,4 +392,38 @@ public class TestServiceAM extends ServiceTestUtils{
|
|||
.equals("newer.host"), 2000, 200000);
|
||||
am.stop();
|
||||
}
|
||||
|
||||
// Test to verify that the containers are released and the
|
||||
// component instance is added to the pending queue when building the launch
|
||||
// context fails.
|
||||
@Test(timeout = 9990000)
|
||||
public void testContainersReleasedWhenPreLaunchFails()
|
||||
throws Exception {
|
||||
ApplicationId applicationId = ApplicationId.newInstance(
|
||||
System.currentTimeMillis(), 1);
|
||||
Service exampleApp = new Service();
|
||||
exampleApp.setId(applicationId.toString());
|
||||
exampleApp.setVersion("v1");
|
||||
exampleApp.setName("testContainersReleasedWhenPreLaunchFails");
|
||||
|
||||
Component compA = createComponent("compa", 1, "pwd");
|
||||
Artifact artifact = new Artifact();
|
||||
artifact.setType(Artifact.TypeEnum.TARBALL);
|
||||
compA.artifact(artifact);
|
||||
exampleApp.addComponent(compA);
|
||||
|
||||
MockServiceAM am = new MockServiceAM(exampleApp);
|
||||
am.init(conf);
|
||||
am.start();
|
||||
|
||||
ContainerId containerId = am.createContainerId(1);
|
||||
|
||||
// allocate a container
|
||||
am.feedContainerToComp(exampleApp, containerId, "compa");
|
||||
am.waitForContainerToRelease(containerId);
|
||||
|
||||
Assert.assertEquals(1,
|
||||
am.getComponent("compa").getPendingInstances().size());
|
||||
am.stop();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -178,7 +178,8 @@ public class TestComponent {
|
|||
org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
|
||||
"successful", 0);
|
||||
comp.handle(new ComponentEvent(comp.getName(),
|
||||
ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus));
|
||||
ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus)
|
||||
.setContainerId(instanceContainer.getId()));
|
||||
componentInstance.handle(
|
||||
new ComponentInstanceEvent(componentInstance.getContainer().getId(),
|
||||
ComponentInstanceEventType.STOP).setStatus(containerStatus));
|
||||
|
|
|
@ -245,7 +245,7 @@ public class TestComponentInstance {
|
|||
comp.getAllComponentInstances().iterator().next();
|
||||
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
|
||||
|
@ -262,7 +262,7 @@ public class TestComponentInstance {
|
|||
componentInstance = comp.getAllComponentInstances().iterator().next();
|
||||
containerStatus.setExitStatus(1);
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
|
||||
verify(comp, times(1)).reInsertPendingInstance(
|
||||
|
@ -286,7 +286,7 @@ public class TestComponentInstance {
|
|||
when(comp.getNumSucceededInstances()).thenReturn(new Long(1));
|
||||
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
verify(comp, times(1)).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
|
||||
verify(comp, times(0)).reInsertPendingInstance(
|
||||
|
@ -304,7 +304,7 @@ public class TestComponentInstance {
|
|||
|
||||
when(comp.getNumFailedInstances()).thenReturn(new Long(1));
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
|
||||
|
@ -323,7 +323,7 @@ public class TestComponentInstance {
|
|||
componentInstance = comp.getAllComponentInstances().iterator().next();
|
||||
containerStatus.setExitStatus(1);
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
|
||||
verify(comp, times(1)).reInsertPendingInstance(
|
||||
|
@ -340,7 +340,7 @@ public class TestComponentInstance {
|
|||
componentInstance = comp.getAllComponentInstances().iterator().next();
|
||||
containerStatus.setExitStatus(1);
|
||||
ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
|
||||
verify(comp, times(0)).reInsertPendingInstance(
|
||||
|
@ -363,7 +363,7 @@ public class TestComponentInstance {
|
|||
containerStatus.setExitStatus(1);
|
||||
ComponentInstance commponentInstance = iter.next();
|
||||
ComponentInstance.handleComponentInstanceRelaunch(commponentInstance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
verify(comp, never()).markAsFailed(any(ComponentInstance.class));
|
||||
|
@ -404,7 +404,7 @@ public class TestComponentInstance {
|
|||
when(component2Instance.getComponent().getNumFailedInstances())
|
||||
.thenReturn(new Long(failed2Instances.size()));
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
Map<String, ComponentInstance> failed1Instances = new HashMap<>();
|
||||
|
@ -418,7 +418,7 @@ public class TestComponentInstance {
|
|||
when(component1Instance.getComponent().getNumFailedInstances())
|
||||
.thenReturn(new Long(failed1Instances.size()));
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
|
||||
|
@ -458,7 +458,7 @@ public class TestComponentInstance {
|
|||
when(component2Instance.getComponent().getNumSucceededInstances())
|
||||
.thenReturn(new Long(succeeded2Instances.size()));
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
Map<String, ComponentInstance> succeeded1Instances = new HashMap<>();
|
||||
|
@ -471,7 +471,7 @@ public class TestComponentInstance {
|
|||
when(component1Instance.getComponent().getNumSucceededInstances())
|
||||
.thenReturn(new Long(succeeded1Instances.size()));
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));
|
||||
|
@ -500,7 +500,7 @@ public class TestComponentInstance {
|
|||
|
||||
for (ComponentInstance component2Instance : component2Instances) {
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
succeeded1Instances = new HashMap<>();
|
||||
|
@ -511,7 +511,7 @@ public class TestComponentInstance {
|
|||
when(component1Instance.getComponent().getSucceededInstances())
|
||||
.thenReturn(succeeded1Instances.values());
|
||||
ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
|
||||
componentInstanceEvent);
|
||||
componentInstanceEvent, false);
|
||||
}
|
||||
|
||||
verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));
|
||||
|
|
Loading…
Reference in New Issue