YARN-2260. Fixed ResourceManager's RMNode to correctly remember containers when nodes resync during work-preserving RM restart. Contributed by Jian He.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1610557 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
97b5fbc281
commit
c6cc6a6a8e
|
@ -53,6 +53,9 @@ Release 2.6.0 - UNRELEASED
|
||||||
YARN-2088. Fixed a bug in GetApplicationsRequestPBImpl#mergeLocalToBuilder.
|
YARN-2088. Fixed a bug in GetApplicationsRequestPBImpl#mergeLocalToBuilder.
|
||||||
(Binglin Chang via jianhe)
|
(Binglin Chang via jianhe)
|
||||||
|
|
||||||
|
YARN-2260. Fixed ResourceManager's RMNode to correctly remember containers
|
||||||
|
when nodes resync during work-preserving RM restart. (Jian He via vinodkv)
|
||||||
|
|
||||||
Release 2.5.0 - UNRELEASED
|
Release 2.5.0 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -20,9 +20,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.rmnode;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.HashMap;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||||
|
@ -105,8 +104,8 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
private String nodeManagerVersion;
|
private String nodeManagerVersion;
|
||||||
|
|
||||||
/* set of containers that have just launched */
|
/* set of containers that have just launched */
|
||||||
private final Map<ContainerId, ContainerStatus> justLaunchedContainers =
|
private final Set<ContainerId> launchedContainers =
|
||||||
new HashMap<ContainerId, ContainerStatus>();
|
new HashSet<ContainerId>();
|
||||||
|
|
||||||
/* set of containers that need to be cleaned */
|
/* set of containers that need to be cleaned */
|
||||||
private final Set<ContainerId> containersToClean = new TreeSet<ContainerId>(
|
private final Set<ContainerId> containersToClean = new TreeSet<ContainerId>(
|
||||||
|
@ -476,6 +475,13 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
// Increment activeNodes explicitly because this is a new node.
|
// Increment activeNodes explicitly because this is a new node.
|
||||||
ClusterMetrics.getMetrics().incrNumActiveNodes();
|
ClusterMetrics.getMetrics().incrNumActiveNodes();
|
||||||
containers = startEvent.getNMContainerStatuses();
|
containers = startEvent.getNMContainerStatuses();
|
||||||
|
if (containers != null && !containers.isEmpty()) {
|
||||||
|
for (NMContainerStatus container : containers) {
|
||||||
|
if (container.getContainerState() == ContainerState.RUNNING) {
|
||||||
|
rmNode.launchedContainers.add(container.getContainerId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (null != startEvent.getRunningApplications()) {
|
if (null != startEvent.getRunningApplications()) {
|
||||||
|
@ -664,14 +670,14 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
|
|
||||||
// Process running containers
|
// Process running containers
|
||||||
if (remoteContainer.getState() == ContainerState.RUNNING) {
|
if (remoteContainer.getState() == ContainerState.RUNNING) {
|
||||||
if (!rmNode.justLaunchedContainers.containsKey(containerId)) {
|
if (!rmNode.launchedContainers.contains(containerId)) {
|
||||||
// Just launched container. RM knows about it the first time.
|
// Just launched container. RM knows about it the first time.
|
||||||
rmNode.justLaunchedContainers.put(containerId, remoteContainer);
|
rmNode.launchedContainers.add(containerId);
|
||||||
newlyLaunchedContainers.add(remoteContainer);
|
newlyLaunchedContainers.add(remoteContainer);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// A finished container
|
// A finished container
|
||||||
rmNode.justLaunchedContainers.remove(containerId);
|
rmNode.launchedContainers.remove(containerId);
|
||||||
completedContainers.add(remoteContainer);
|
completedContainers.add(remoteContainer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -748,4 +754,10 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
public int getQueueSize() {
|
public int getQueueSize() {
|
||||||
return nodeUpdateQueue.size();
|
return nodeUpdateQueue.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For test only.
|
||||||
|
@VisibleForTesting
|
||||||
|
public Set<ContainerId> getLaunchedContainers() {
|
||||||
|
return this.launchedContainers;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,11 +29,13 @@ import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerState;
|
import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
@ -44,6 +46,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||||
|
@ -164,6 +167,11 @@ public class TestWorkPreservingRMRestart {
|
||||||
|
|
||||||
// Wait for RM to settle down on recovering containers;
|
// Wait for RM to settle down on recovering containers;
|
||||||
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
|
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
|
||||||
|
Set<ContainerId> launchedContainers =
|
||||||
|
((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId()))
|
||||||
|
.getLaunchedContainers();
|
||||||
|
assertTrue(launchedContainers.contains(amContainer.getContainerId()));
|
||||||
|
assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
|
||||||
|
|
||||||
// check RMContainers are re-recreated and the container state is correct.
|
// check RMContainers are re-recreated and the container state is correct.
|
||||||
rm2.waitForState(nm1, amContainer.getContainerId(),
|
rm2.waitForState(nm1, amContainer.getContainerId(),
|
||||||
|
|
Loading…
Reference in New Issue