YARN-433. When RM is catching up with node updates then it should not expire acquired containers. Contributed by Xuan Gong
This commit is contained in:
parent
2087eaf684
commit
ab80e27703
|
@ -716,6 +716,9 @@ Release 2.8.0 - UNRELEASED
|
|||
YARN-3971. Skip RMNodeLabelsManager#checkRemoveFromClusterNodeLabelsOfQueue
|
||||
on nodelabel recovery. (Bibin A Chundatt via wangda)
|
||||
|
||||
YARN-433. When RM is catching up with node updates then it should not expire
|
||||
acquired containers. (Xuan Gong via zxu)
|
||||
|
||||
Release 2.7.2 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -99,9 +99,9 @@ public class RMContainerImpl implements RMContainer, Comparable<RMContainer> {
|
|||
|
||||
// Transitions from ACQUIRED state
|
||||
.addTransition(RMContainerState.ACQUIRED, RMContainerState.RUNNING,
|
||||
RMContainerEventType.LAUNCHED, new LaunchedTransition())
|
||||
RMContainerEventType.LAUNCHED)
|
||||
.addTransition(RMContainerState.ACQUIRED, RMContainerState.COMPLETED,
|
||||
RMContainerEventType.FINISHED, new ContainerFinishedAtAcquiredState())
|
||||
RMContainerEventType.FINISHED, new FinishedTransition())
|
||||
.addTransition(RMContainerState.ACQUIRED, RMContainerState.RELEASED,
|
||||
RMContainerEventType.RELEASED, new KillTransition())
|
||||
.addTransition(RMContainerState.ACQUIRED, RMContainerState.EXPIRED,
|
||||
|
@ -486,16 +486,6 @@ public class RMContainerImpl implements RMContainer, Comparable<RMContainer> {
|
|||
}
|
||||
}
|
||||
|
||||
private static final class LaunchedTransition extends BaseTransition {
|
||||
|
||||
@Override
|
||||
public void transition(RMContainerImpl container, RMContainerEvent event) {
|
||||
// Unregister from containerAllocationExpirer.
|
||||
container.containerAllocationExpirer.unregister(container
|
||||
.getContainerId());
|
||||
}
|
||||
}
|
||||
|
||||
private static final class ContainerRescheduledTransition extends
|
||||
FinishedTransition {
|
||||
|
||||
|
@ -554,19 +544,6 @@ public class RMContainerImpl implements RMContainer, Comparable<RMContainer> {
|
|||
}
|
||||
}
|
||||
|
||||
private static final class ContainerFinishedAtAcquiredState extends
|
||||
FinishedTransition {
|
||||
@Override
|
||||
public void transition(RMContainerImpl container, RMContainerEvent event) {
|
||||
// Unregister from containerAllocationExpirer.
|
||||
container.containerAllocationExpirer.unregister(container
|
||||
.getContainerId());
|
||||
|
||||
// Inform AppAttempt
|
||||
super.transition(container, event);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class KillTransition extends FinishedTransition {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsMana
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppRunningOnNodeEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeResourceUpdateSchedulerEvent;
|
||||
|
@ -107,6 +108,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
private long lastHealthReportTime;
|
||||
private String nodeManagerVersion;
|
||||
|
||||
private final ContainerAllocationExpirer containerAllocationExpirer;
|
||||
/* set of containers that have just launched */
|
||||
private final Set<ContainerId> launchedContainers =
|
||||
new HashSet<ContainerId>();
|
||||
|
@ -265,6 +267,8 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
this.stateMachine = stateMachineFactory.make(this);
|
||||
|
||||
this.nodeUpdateQueue = new ConcurrentLinkedQueue<UpdatedContainerInfo>();
|
||||
|
||||
this.containerAllocationExpirer = context.getContainerAllocationExpirer();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -953,11 +957,15 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
// Just launched container. RM knows about it the first time.
|
||||
launchedContainers.add(containerId);
|
||||
newlyLaunchedContainers.add(remoteContainer);
|
||||
// Unregister from containerAllocationExpirer.
|
||||
containerAllocationExpirer.unregister(containerId);
|
||||
}
|
||||
} else {
|
||||
// A finished container
|
||||
launchedContainers.remove(containerId);
|
||||
completedContainers.add(remoteContainer);
|
||||
// Unregister from containerAllocationExpirer.
|
||||
containerAllocationExpirer.unregister(containerId);
|
||||
}
|
||||
}
|
||||
if (newlyLaunchedContainers.size() != 0 || completedContainers.size() != 0) {
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.util.HostsFileReader;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||
|
@ -43,6 +44,7 @@ import org.apache.hadoop.yarn.event.EventHandler;
|
|||
import org.apache.hadoop.yarn.event.InlineDispatcher;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanAppEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
|
||||
|
@ -105,8 +107,9 @@ public class TestRMNodeTransitions {
|
|||
InlineDispatcher rmDispatcher = new InlineDispatcher();
|
||||
|
||||
rmContext =
|
||||
new RMContextImpl(rmDispatcher, null, null, null,
|
||||
mock(DelegationTokenRenewer.class), null, null, null, null, null);
|
||||
new RMContextImpl(rmDispatcher, mock(ContainerAllocationExpirer.class),
|
||||
null, null, mock(DelegationTokenRenewer.class), null, null, null,
|
||||
null, null);
|
||||
NodesListManager nodesListManager = mock(NodesListManager.class);
|
||||
HostsFileReader reader = mock(HostsFileReader.class);
|
||||
when(nodesListManager.getHostsReader()).thenReturn(reader);
|
||||
|
@ -147,7 +150,8 @@ public class TestRMNodeTransitions {
|
|||
public void tearDown() throws Exception {
|
||||
}
|
||||
|
||||
private RMNodeStatusEvent getMockRMNodeStatusEvent() {
|
||||
private RMNodeStatusEvent getMockRMNodeStatusEvent(
|
||||
List<ContainerStatus> containerStatus) {
|
||||
NodeHeartbeatResponse response = mock(NodeHeartbeatResponse.class);
|
||||
|
||||
NodeHealthStatus healthStatus = mock(NodeHealthStatus.class);
|
||||
|
@ -158,6 +162,9 @@ public class TestRMNodeTransitions {
|
|||
doReturn(healthStatus).when(event).getNodeHealthStatus();
|
||||
doReturn(response).when(event).getLatestResponse();
|
||||
doReturn(RMNodeEventType.STATUS_UPDATE).when(event).getType();
|
||||
if (containerStatus != null) {
|
||||
doReturn(containerStatus).when(event).getContainers();
|
||||
}
|
||||
return event;
|
||||
}
|
||||
|
||||
|
@ -176,7 +183,7 @@ public class TestRMNodeTransitions {
|
|||
|
||||
// Now verify that scheduler isn't notified of an expired container
|
||||
// by checking number of 'completedContainers' it got in the previous event
|
||||
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent(null);
|
||||
ContainerStatus containerStatus = mock(ContainerStatus.class);
|
||||
doReturn(completedContainerId).when(containerStatus).getContainerId();
|
||||
doReturn(Collections.singletonList(containerStatus)).
|
||||
|
@ -207,11 +214,11 @@ public class TestRMNodeTransitions {
|
|||
ContainerId completedContainerIdFromNode2_2 = BuilderUtils.newContainerId(
|
||||
BuilderUtils.newApplicationAttemptId(
|
||||
BuilderUtils.newApplicationId(1, 1), 1), 2);
|
||||
|
||||
RMNodeStatusEvent statusEventFromNode1 = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEventFromNode2_1 = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEventFromNode2_2 = getMockRMNodeStatusEvent();
|
||||
|
||||
|
||||
RMNodeStatusEvent statusEventFromNode1 = getMockRMNodeStatusEvent(null);
|
||||
RMNodeStatusEvent statusEventFromNode2_1 = getMockRMNodeStatusEvent(null);
|
||||
RMNodeStatusEvent statusEventFromNode2_2 = getMockRMNodeStatusEvent(null);
|
||||
|
||||
ContainerStatus containerStatusFromNode1 = mock(ContainerStatus.class);
|
||||
ContainerStatus containerStatusFromNode2_1 = mock(ContainerStatus.class);
|
||||
ContainerStatus containerStatusFromNode2_2 = mock(ContainerStatus.class);
|
||||
|
@ -263,8 +270,8 @@ public class TestRMNodeTransitions {
|
|||
BuilderUtils.newApplicationAttemptId(
|
||||
BuilderUtils.newApplicationId(1, 1), 1), 1);
|
||||
|
||||
RMNodeStatusEvent statusEvent1 = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEvent2 = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEvent1 = getMockRMNodeStatusEvent(null);
|
||||
RMNodeStatusEvent statusEvent2 = getMockRMNodeStatusEvent(null);
|
||||
|
||||
ContainerStatus containerStatus1 = mock(ContainerStatus.class);
|
||||
ContainerStatus containerStatus2 = mock(ContainerStatus.class);
|
||||
|
@ -499,7 +506,7 @@ public class TestRMNodeTransitions {
|
|||
|
||||
// Verify status update does not clear containers/apps to cleanup
|
||||
// but updating heartbeat response for cleanup does
|
||||
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent();
|
||||
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent(null);
|
||||
node.handle(statusEvent);
|
||||
Assert.assertEquals(1, node.getContainersToCleanUp().size());
|
||||
Assert.assertEquals(1, node.getAppsToCleanup().size());
|
||||
|
@ -706,4 +713,35 @@ public class TestRMNodeTransitions {
|
|||
null, null));
|
||||
Assert.assertEquals(nmVersion2, node.getNodeManagerVersion());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainerExpire() throws Exception {
|
||||
ContainerAllocationExpirer mockExpirer =
|
||||
mock(ContainerAllocationExpirer.class);
|
||||
ApplicationId appId =
|
||||
ApplicationId.newInstance(System.currentTimeMillis(), 1);
|
||||
ApplicationAttemptId appAttemptId =
|
||||
ApplicationAttemptId.newInstance(appId, 1);
|
||||
ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 1L);
|
||||
ContainerId containerId2 = ContainerId.newContainerId(appAttemptId, 2L);
|
||||
mockExpirer.register(containerId1);
|
||||
mockExpirer.register(containerId2);
|
||||
verify(mockExpirer).register(containerId1);
|
||||
verify(mockExpirer).register(containerId2);
|
||||
((RMContextImpl) rmContext).setContainerAllocationExpirer(mockExpirer);
|
||||
RMNodeImpl rmNode = getRunningNode();
|
||||
ContainerStatus status1 =
|
||||
ContainerStatus
|
||||
.newInstance(containerId1, ContainerState.RUNNING, "", 0);
|
||||
ContainerStatus status2 =
|
||||
ContainerStatus.newInstance(containerId2, ContainerState.COMPLETE, "",
|
||||
0);
|
||||
List<ContainerStatus> statusList = new ArrayList<ContainerStatus>();
|
||||
statusList.add(status1);
|
||||
statusList.add(status2);
|
||||
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEvent(statusList);
|
||||
rmNode.handle(statusEvent);
|
||||
verify(mockExpirer).unregister(containerId1);
|
||||
verify(mockExpirer).unregister(containerId2);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue