YARN-6344. Add parameter for rack locality delay in CapacityScheduler. (kkaranasos)

(cherry picked from commit 3587b46164514e6751d7cebb09c8ab5e988853b4)
This commit is contained in:
Konstantinos Karanasos 2017-04-10 15:25:33 -07:00
parent c1ae753de1
commit 9b3ba25980
6 changed files with 228 additions and 21 deletions

View File

@ -111,9 +111,27 @@
<value>40</value>
<description>
Number of missed scheduling opportunities after which the CapacityScheduler
attempts to schedule rack-local containers.
Typically this should be set to number of nodes in the cluster, By default is setting
approximately number of nodes in one rack which is 40.
attempts to schedule rack-local containers.
When setting this parameter, the size of the cluster should be taken into account.
We use 40 as the default value, which is approximately the number of nodes in one rack.
</description>
</property>
<property>
<name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
<value>-1</value>
<description>
Number of additional missed scheduling opportunities over the node-locality-delay
ones, after which the CapacityScheduler attempts to schedule off-switch containers,
instead of rack-local ones.
Example: with node-locality-delay=40 and rack-locality-additional-delay=20, the scheduler will
attempt rack-local assignments after 40 missed opportunities, and off-switch assignments
after 40+20=60 missed opportunities.
When setting this parameter, the size of the cluster should be taken into account.
We use -1 as the default value, which disables this feature. In this case, the number
of missed opportunities for assigning off-switch containers is calculated based on
the number of containers and unique locations specified in the resource request,
as well as the size of the cluster.
</description>
</property>

View File

@ -1404,7 +1404,6 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
return appSchedulingInfo.getSchedulingPlacementSet(schedulerRequestKey);
}
public void incUnconfirmedRes(Resource res) {
unconfirmedAllocatedMem.addAndGet(res.getMemorySize());
unconfirmedAllocatedVcores.addAndGet(res.getVirtualCores());

View File

@ -197,6 +197,13 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
/** Default used when NODE_LOCALITY_DELAY is not set in the configuration. */
@Private
public static final int DEFAULT_NODE_LOCALITY_DELAY = 40;
/**
 * Configuration key for the number of additional missed scheduling
 * opportunities, on top of the node-locality delay, after which off-switch
 * assignments are attempted.
 */
@Private
public static final String RACK_LOCALITY_ADDITIONAL_DELAY =
PREFIX + "rack-locality-additional-delay";
/**
 * Default for RACK_LOCALITY_ADDITIONAL_DELAY. The value -1 leaves the
 * additional delay disabled, so the off-switch delay is derived from the
 * resource request and the cluster size instead.
 */
@Private
public static final int DEFAULT_RACK_LOCALITY_ADDITIONAL_DELAY = -1;
/**
 * Configuration key for the boolean rack-locality full-reset setting.
 * NOTE(review): the exact effect of this flag is not visible here - confirm
 * against the allocator code that reads it.
 */
@Private
public static final String RACK_LOCALITY_FULL_RESET =
PREFIX + "rack-locality-full-reset";
@ -829,6 +836,11 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
return getInt(NODE_LOCALITY_DELAY, DEFAULT_NODE_LOCALITY_DELAY);
}
/**
 * Looks up the rack-locality additional delay in this configuration.
 *
 * @return the value stored under RACK_LOCALITY_ADDITIONAL_DELAY, or
 *         DEFAULT_RACK_LOCALITY_ADDITIONAL_DELAY when the key is not set
 */
public int getRackLocalityAdditionalDelay() {
  final int additionalDelay = getInt(
      RACK_LOCALITY_ADDITIONAL_DELAY, DEFAULT_RACK_LOCALITY_ADDITIONAL_DELAY);
  return additionalDelay;
}
public boolean getRackLocalityFullReset() {
return getBoolean(RACK_LOCALITY_FULL_RESET,
DEFAULT_RACK_LOCALITY_FULL_RESET);

View File

@ -110,6 +110,7 @@ public class LeafQueue extends AbstractCSQueue {
private float maxAMResourcePerQueuePercent;
// Locality-delay settings for this queue. Each one is assigned from the
// CapacitySchedulerConfiguration getter of the same name and is exposed
// through a getter annotated with @Lock(NoLock.class), hence volatile.
private volatile int nodeLocalityDelay;
// Additional missed opportunities, on top of nodeLocalityDelay, before
// off-switch assignments are attempted.
private volatile int rackLocalityAdditionalDelay;
// NOTE(review): semantics of the full-reset flag are not visible here -
// confirm against the allocator that consumes it.
private volatile boolean rackLocalityFullReset;
Map<ApplicationAttemptId, FiCaSchedulerApp> applicationAttemptMap =
@ -236,6 +237,7 @@ public class LeafQueue extends AbstractCSQueue {
}
nodeLocalityDelay = conf.getNodeLocalityDelay();
rackLocalityAdditionalDelay = conf.getRackLocalityAdditionalDelay();
rackLocalityFullReset = conf.getRackLocalityFullReset();
// re-init this since max allocation could have changed
@ -291,9 +293,12 @@ public class LeafQueue extends AbstractCSQueue {
+ "numContainers = " + numContainers
+ " [= currentNumContainers ]" + "\n" + "state = " + getState()
+ " [= configuredState ]" + "\n" + "acls = " + aclsString
+ " [= configuredAcls ]" + "\n" + "nodeLocalityDelay = "
+ nodeLocalityDelay + "\n" + "labels=" + labelStrBuilder
.toString() + "\n" + "reservationsContinueLooking = "
+ " [= configuredAcls ]" + "\n"
+ "nodeLocalityDelay = " + nodeLocalityDelay + "\n"
+ "rackLocalityAdditionalDelay = "
+ rackLocalityAdditionalDelay + "\n"
+ "labels=" + labelStrBuilder.toString() + "\n"
+ "reservationsContinueLooking = "
+ reservationsContinueLooking + "\n" + "preemptionDisabled = "
+ getPreemptionDisabled() + "\n" + "defaultAppPriorityPerQueue = "
+ defaultAppPriorityPerQueue + "\npriority = " + priority);
@ -1397,6 +1402,11 @@ public class LeafQueue extends AbstractCSQueue {
return nodeLocalityDelay;
}
/**
 * Returns the rack-locality additional delay currently configured for this
 * queue. No lock is taken; the backing field is volatile.
 *
 * @return the configured rack-locality additional delay
 */
@Lock(NoLock.class)
public int getRackLocalityAdditionalDelay() {
  return this.rackLocalityAdditionalDelay;
}
@Lock(NoLock.class)
public boolean getRackLocalityFullReset() {
return rackLocalityFullReset;

View File

@ -45,7 +45,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.Activi
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesLogger;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivitiesManager;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities.ActivityState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAMContainerLaunchDiagnosticsConstants;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSAssignment;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.SchedulingMode;
@ -277,6 +276,12 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
.getCSLeafQueue().getNodeLocalityDelay());
}
/**
 * Computes the number of missed scheduling opportunities that must be
 * exceeded before an off-switch assignment is allowed when the rack-locality
 * additional delay is enabled.
 *
 * The delay is the queue's node-locality delay plus its rack-locality
 * additional delay, capped by the number of nodes in the cluster. The sum is
 * computed in long arithmetic: with plain int addition, large configured
 * values (for example Integer.MAX_VALUE) wrap around to a negative number,
 * which would make the cap ineffective and allow off-switch assignments
 * immediately.
 *
 * @return the effective off-switch delay, never larger than the number of
 *         cluster nodes
 */
private int getActualRackLocalityDelay() {
  final int numClusterNodes = rmContext.getScheduler().getNumClusterNodes();
  final long configuredDelay =
      (long) application.getCSLeafQueue().getNodeLocalityDelay()
          + application.getCSLeafQueue().getRackLocalityAdditionalDelay();
  final long cappedDelay = Math.min((long) numClusterNodes, configuredDelay);
  // Bounded above by numClusterNodes; the lower clamp keeps the narrowing
  // cast safe even if both configured values are very negative.
  return (int) Math.max((long) Integer.MIN_VALUE, cappedDelay);
}
private boolean canAssign(SchedulerRequestKey schedulerKey,
FiCaSchedulerNode node, NodeType type, RMContainer reservedContainer) {
@ -285,22 +290,34 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
if (reservedContainer != null) {
return true;
}
// If there are no nodes in the cluster, return false.
if (rmContext.getScheduler().getNumClusterNodes() == 0) {
return false;
}
// If we have only ANY requests for this schedulerKey, we should not
// delay its scheduling.
if (application.getResourceRequests(schedulerKey).size() == 1) {
return true;
}
// 'Delay' off-switch
ResourceRequest offSwitchRequest =
application.getResourceRequest(schedulerKey, ResourceRequest.ANY);
long missedOpportunities =
application.getSchedulingOpportunities(schedulerKey);
long requiredContainers = offSwitchRequest.getNumContainers();
float localityWaitFactor =
getLocalityWaitFactor(schedulerKey, rmContext.getScheduler()
.getNumClusterNodes());
// Cap the delay by the number of nodes in the cluster. Under most
// conditions this means we will consider each node in the cluster before
// accepting an off-switch assignment.
return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
(requiredContainers * localityWaitFactor)) < missedOpportunities);
// If rack locality additional delay parameter is enabled.
if (application.getCSLeafQueue().getRackLocalityAdditionalDelay() > -1) {
return missedOpportunities > getActualRackLocalityDelay();
} else {
long requiredContainers =
application.getTotalRequiredResources(schedulerKey);
float localityWaitFactor = getLocalityWaitFactor(schedulerKey,
rmContext.getScheduler().getNumClusterNodes());
// Cap the delay by the number of nodes in the cluster.
return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
(requiredContainers * localityWaitFactor)) < missedOpportunities);
}
}
// Check if we need containers on this rack

View File

@ -106,7 +106,7 @@ import org.mockito.Mockito;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils.toSchedulerKey;
public class TestLeafQueue {
public class TestLeafQueue {
private final RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null);
private static final Log LOG = LogFactory.getLog(TestLeafQueue.class);
@ -2098,6 +2098,154 @@ public class TestLeafQueue {
assertEquals(numNodes+1, app_0.getSchedulingOpportunities(schedulerKey));
}
/**
 * Exercises delay scheduling with the rack-locality additional delay enabled.
 *
 * The queue is reconfigured with NODE_LOCALITY_DELAY=2 and
 * RACK_LOCALITY_ADDITIONAL_DELAY=1, and the application asks for containers
 * on host1/rack1 and host2/rack2. Only two nodes are offered: node2 (on
 * rack2, so it can satisfy a rack-local request) and node3 (on rack3, which
 * matches none of the requested hosts or racks, so only an off-switch
 * assignment is possible there). The test checks, call by call, the
 * scheduling-opportunity counter, the outstanding container count and the
 * resulting assignment type, and finally that the delay is capped by the
 * number of cluster nodes.
 */
@Test
public void testRackLocalityDelayScheduling() throws Exception {
// Change parameter values for node locality and rack locality delay.
csConf.setInt(CapacitySchedulerConfiguration.NODE_LOCALITY_DELAY, 2);
csConf.setInt(
CapacitySchedulerConfiguration.RACK_LOCALITY_ADDITIONAL_DELAY, 1);
// Re-parse the queue hierarchy so the new values reach the leaf queues.
Map<String, CSQueue> newQueues = new HashMap<String, CSQueue>();
CSQueue newRoot = CapacitySchedulerQueueManager.parseQueue(csContext,
csConf, null, CapacitySchedulerConfiguration.ROOT, newQueues, queues,
TestUtils.spyHook);
queues = newQueues;
root.reinitialize(newRoot, cs.getClusterResource());
// Manipulate queue 'b'
// (the local variable is named 'a' but it holds the queue looked up by B)
LeafQueue a = stubLeafQueue((LeafQueue) queues.get(B));
// Check locality parameters.
assertEquals(2, a.getNodeLocalityDelay());
assertEquals(1, a.getRackLocalityAdditionalDelay());
// User
String user1 = "user_1";
// Submit applications
final ApplicationAttemptId appAttemptId1 =
TestUtils.getMockApplicationAttemptId(0, 0);
FiCaSchedulerApp app1 = new FiCaSchedulerApp(appAttemptId1, user1, a,
mock(ActiveUsersManager.class), spyRMContext);
a.submitApplicationAttempt(app1, user1);
// Setup some nodes and racks
// host1/host2/rack1 only appear in resource requests; no node is created
// for them, so a node-local assignment can never happen in this test.
String host1 = "127.0.0.1";
String host2 = "127.0.0.2";
String host3 = "127.0.0.3";
String host4 = "127.0.0.4";
String rack1 = "rack_1";
String rack2 = "rack_2";
String rack3 = "rack_3";
FiCaSchedulerNode node2 = TestUtils.getMockNode(host3, rack2, 0, 8 * GB);
FiCaSchedulerNode node3 = TestUtils.getMockNode(host4, rack3, 0, 8 * GB);
Map<ApplicationAttemptId, FiCaSchedulerApp> apps =
ImmutableMap.of(app1.getApplicationAttemptId(), app1);
Map<NodeId, FiCaSchedulerNode> nodes =
ImmutableMap.of(node2.getNodeID(), node2, node3.getNodeID(), node3);
// The scheduler is told the cluster has 5 nodes even though only two mock
// nodes exist; this value is what caps the delay at the end of the test.
final int numNodes = 5;
Resource clusterResource =
Resources.createResource(numNodes * (8 * GB), numNodes * 16);
when(spyRMContext.getScheduler().getNumClusterNodes()).thenReturn(numNodes);
// Setup resource-requests and submit
Priority priority = TestUtils.createMockPriority(1);
List<ResourceRequest> app1Requests1 = new ArrayList<ResourceRequest>();
app1Requests1.add(TestUtils.createResourceRequest(host1, 1 * GB, 1,
true, priority, recordFactory));
app1Requests1.add(TestUtils.createResourceRequest(rack1, 1 * GB, 1,
true, priority, recordFactory));
app1Requests1.add(TestUtils.createResourceRequest(host2, 1 * GB, 1,
true, priority, recordFactory));
app1Requests1.add(TestUtils.createResourceRequest(rack2, 1 * GB, 1,
true, priority, recordFactory));
// Adding one extra in the ANY.
// (two host-level containers are requested, but ANY asks for three)
app1Requests1.add(TestUtils.createResourceRequest(ResourceRequest.ANY,
1 * GB, 3, true, priority, recordFactory));
app1.updateResourceRequests(app1Requests1);
// Start testing...
CSAssignment assignment = null;
SchedulerRequestKey schedulerKey = toSchedulerKey(priority);
assertEquals(3, app1.getTotalRequiredResources(schedulerKey));
// No rack-local yet.
assignment = a.assignContainers(clusterResource, node2,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
verifyNoContainerAllocated(assignment);
assertEquals(1, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(3, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.NODE_LOCAL, assignment.getType()); // None->NODE_LOCAL
// Still no rack-local.
assignment = a.assignContainers(clusterResource, node2,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(2, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(3, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.NODE_LOCAL, assignment.getType()); // None->NODE_LOCAL
// Rack local now.
// The counter is expected to be back at 0 after this assignment.
assignment = a.assignContainers(clusterResource, node2,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(0, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(2, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.RACK_LOCAL, assignment.getType());
// No off-switch until 3 missed opportunities.
// NOTE(review): the results of the next two assignContainers calls are
// discarded, so applyCSAssignment is handed the previous (RACK_LOCAL)
// 'assignment' again. Confirm whether 'assignment = ...' was intended.
a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assignment = a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(3, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(2, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.NODE_LOCAL, assignment.getType()); // None->NODE_LOCAL
// Now off-switch should succeed.
// Configured delay is 2 + 1 = 3; the call that takes the counter to 4 is
// expected to produce the off-switch assignment.
assignment = a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(4, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(1, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.OFF_SWITCH, assignment.getType());
// Check capping by number of cluster nodes.
// Stub the queue so the configured delay (2 + 10 = 12) exceeds numNodes.
doReturn(10).when(a).getRackLocalityAdditionalDelay();
// Off-switch will happen at 6 missed opportunities now, since cluster size
// is 5.
assignment = a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(5, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(1, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.NODE_LOCAL, assignment.getType()); // None->NODE_LOCAL
assignment = a.assignContainers(clusterResource, node3,
new ResourceLimits(clusterResource),
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
applyCSAssignment(clusterResource, assignment, a, nodes, apps);
assertEquals(6, app1.getSchedulingOpportunities(schedulerKey));
assertEquals(0, app1.getTotalRequiredResources(schedulerKey));
assertEquals(NodeType.OFF_SWITCH, assignment.getType());
}
@Test
public void testApplicationPriorityScheduling() throws Exception {
// Manipulate queue 'a'
@ -2403,16 +2551,18 @@ public class TestLeafQueue {
}
@Test (timeout = 30000)
public void testNodeLocalityAfterQueueRefresh() throws Exception {
public void testLocalityDelaysAfterQueueRefresh() throws Exception {
// Manipulate queue 'e'
LeafQueue e = stubLeafQueue((LeafQueue)queues.get(E));
// before reinitialization
assertEquals(40, e.getNodeLocalityDelay());
assertEquals(-1, e.getRackLocalityAdditionalDelay());
csConf.setInt(CapacitySchedulerConfiguration
.NODE_LOCALITY_DELAY, 60);
csConf.setInt(CapacitySchedulerConfiguration.NODE_LOCALITY_DELAY, 60);
csConf.setInt(
CapacitySchedulerConfiguration.RACK_LOCALITY_ADDITIONAL_DELAY, 600);
Map<String, CSQueue> newQueues = new HashMap<String, CSQueue>();
CSQueue newRoot =
CapacitySchedulerQueueManager.parseQueue(csContext, csConf, null,
@ -2424,6 +2574,7 @@ public class TestLeafQueue {
// after reinitialization
assertEquals(60, e.getNodeLocalityDelay());
assertEquals(600, e.getRackLocalityAdditionalDelay());
}
@Test (timeout = 30000)