YARN-2975. FSLeafQueue app lists are accessed without required locks. (kasha)

(cherry picked from commit 2abec14ec6)
This commit is contained in:
Karthik Kambatla 2014-12-20 12:17:50 -08:00 committed by Sangjin Lee
parent 80037436c6
commit 843dac5353
8 changed files with 200 additions and 104 deletions

View File

@ -12,6 +12,8 @@ Release 2.6.4 - UNRELEASED
BUG FIXES
YARN-2975. FSLeafQueue app lists are accessed without required locks. (kasha)
Release 2.6.3 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -107,34 +107,94 @@ public class FSLeafQueue extends FSQueue {
*/
public boolean removeApp(FSAppAttempt app) {
boolean runnable = false;
// Remove app from runnable/nonRunnable list while holding the write lock
writeLock.lock();
try {
if (runnableApps.remove(app)) {
runnable = true;
} else if (nonRunnableApps.remove(app)) {
runnable = false; //nop, runnable is initialised to false already
} else {
throw new IllegalStateException("Given app to remove " + app +
" does not exist in queue " + this);
runnable = runnableApps.remove(app);
if (!runnable) {
// removeNonRunnableApp acquires the write lock again, which is fine
if (!removeNonRunnableApp(app)) {
throw new IllegalStateException("Given app to remove " + app +
" does not exist in queue " + this);
}
}
} finally {
writeLock.unlock();
}
// Update AM resource usage if needed
if (runnable && app.isAmRunning() && app.getAMResource() != null) {
Resources.subtractFrom(amResourceUsage, app.getAMResource());
}
return runnable;
}
public Collection<FSAppAttempt> getRunnableAppSchedulables() {
return runnableApps;
/**
 * Removes the given app from this queue's {@code nonRunnableApps} list.
 * The removal is performed while holding {@code writeLock}.
 *
 * @param app the app attempt to remove
 * @return true if the app was present in {@code nonRunnableApps} and has
 *         been removed, false otherwise
 */
public boolean removeNonRunnableApp(FSAppAttempt app) {
  boolean removed;
  writeLock.lock();
  try {
    removed = nonRunnableApps.remove(app);
  } finally {
    // Always release the lock, even if remove() throws
    writeLock.unlock();
  }
  return removed;
}
public List<FSAppAttempt> getNonRunnableAppSchedulables() {
return nonRunnableApps;
/**
 * Checks whether the given attempt is in this queue's {@code runnableApps}
 * collection. The lookup is performed while holding {@code readLock}.
 *
 * @param attempt the app attempt to look for
 * @return true if {@code runnableApps} contains the attempt
 */
public boolean isRunnableApp(FSAppAttempt attempt) {
  boolean present;
  readLock.lock();
  try {
    present = runnableApps.contains(attempt);
  } finally {
    readLock.unlock();
  }
  return present;
}
/**
 * Checks whether the given attempt is in this queue's
 * {@code nonRunnableApps} list. The lookup is performed while holding
 * {@code readLock}.
 *
 * @param attempt the app attempt to look for
 * @return true if {@code nonRunnableApps} contains the attempt
 */
public boolean isNonRunnableApp(FSAppAttempt attempt) {
  boolean present;
  readLock.lock();
  try {
    present = nonRunnableApps.contains(attempt);
  } finally {
    readLock.unlock();
  }
  return present;
}
/**
 * Calls {@code resetPreemptedResources()} on every attempt in
 * {@code runnableApps}. The iteration happens while holding
 * {@code readLock}, so callers do not need direct access to the list.
 */
public void resetPreemptedResources() {
  readLock.lock();
  try {
    for (FSAppAttempt runnableApp : runnableApps) {
      runnableApp.resetPreemptedResources();
    }
  } finally {
    readLock.unlock();
  }
}
/**
 * Calls {@code clearPreemptedResources()} on every attempt in
 * {@code runnableApps}. The iteration happens while holding
 * {@code readLock}, so callers do not need direct access to the list.
 */
public void clearPreemptedResources() {
  readLock.lock();
  try {
    for (FSAppAttempt runnableApp : runnableApps) {
      runnableApp.clearPreemptedResources();
    }
  } finally {
    readLock.unlock();
  }
}
/**
 * Returns a snapshot of this queue's {@code nonRunnableApps} list.
 * The copy is taken while holding {@code readLock}; the returned list is a
 * new {@link ArrayList}, so changes to it do not affect the queue.
 *
 * @return a new list containing the elements of {@code nonRunnableApps}
 */
public List<FSAppAttempt> getCopyOfNonRunnableAppSchedulables() {
  readLock.lock();
  try {
    // Copy under the lock so the snapshot is taken atomically
    return new ArrayList<FSAppAttempt>(nonRunnableApps);
  } finally {
    readLock.unlock();
  }
}
@Override
public void collectSchedulerApplications(
Collection<ApplicationAttemptId> apps) {
@ -162,7 +222,12 @@ public class FSLeafQueue extends FSQueue {
@Override
public void recomputeShares() {
policy.computeShares(getRunnableAppSchedulables(), getFairShare());
readLock.lock();
try {
policy.computeShares(runnableApps, getFairShare());
} finally {
readLock.unlock();
}
}
@Override
@ -346,9 +411,58 @@ public class FSLeafQueue extends FSQueue {
@Override
public int getNumRunnableApps() {
return runnableApps.size();
readLock.lock();
try {
return runnableApps.size();
} finally {
readLock.unlock();
}
}
/**
 * Returns the size of {@code nonRunnableApps}, read while holding
 * {@code readLock}.
 *
 * @return the number of entries in {@code nonRunnableApps}
 */
public int getNumNonRunnableApps() {
  int count;
  readLock.lock();
  try {
    count = nonRunnableApps.size();
  } finally {
    readLock.unlock();
  }
  return count;
}
/**
 * Counts the pending apps in this queue: every entry of
 * {@code nonRunnableApps}, plus each attempt in {@code runnableApps} whose
 * {@code isPending()} returns true. Both lists are read under a single
 * acquisition of {@code readLock}.
 *
 * @return the number of pending apps
 */
public int getNumPendingApps() {
  readLock.lock();
  try {
    // Every non-runnable app counts as pending
    int pending = nonRunnableApps.size();
    for (FSAppAttempt runnableApp : runnableApps) {
      if (runnableApp.isPending()) {
        pending++;
      }
    }
    return pending;
  } finally {
    readLock.unlock();
  }
}
/**
 * Counts the attempts in {@code runnableApps} whose {@code isPending()}
 * returns false. The list is read while holding {@code readLock}.
 *
 * TODO: Depending on how frequently this is called, we might want to
 * combine counting pending and active apps into a single method.
 *
 * @return the number of runnable apps that are not pending
 */
public int getNumActiveApps() {
  readLock.lock();
  try {
    int active = 0;
    for (FSAppAttempt runnableApp : runnableApps) {
      if (runnableApp.isPending()) {
        continue;
      }
      active++;
    }
    return active;
  } finally {
    readLock.unlock();
  }
}
@Override
public ActiveUsersManager getActiveUsersManager() {
return activeUsersManager;

View File

@ -399,9 +399,7 @@ public class FairScheduler extends
try {
// Reset preemptedResource for each app
for (FSLeafQueue queue : getQueueManager().getLeafQueues()) {
for (FSAppAttempt app : queue.getRunnableAppSchedulables()) {
app.resetPreemptedResources();
}
queue.resetPreemptedResources();
}
while (Resources.greaterThan(RESOURCE_CALCULATOR, clusterResource,
@ -420,9 +418,7 @@ public class FairScheduler extends
} finally {
// Clear preemptedResources for each app
for (FSLeafQueue queue : getQueueManager().getLeafQueues()) {
for (FSAppAttempt app : queue.getRunnableAppSchedulables()) {
app.clearPreemptedResources();
}
queue.clearPreemptedResources();
}
}
@ -1424,7 +1420,7 @@ public class FairScheduler extends
return oldQueue.getQueueName();
}
if (oldQueue.getRunnableAppSchedulables().contains(attempt)) {
if (oldQueue.isRunnableApp(attempt)) {
verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue);
}

View File

@ -203,7 +203,7 @@ public class MaxRunningAppsEnforcer {
if (canAppBeRunnable(next.getQueue(), next.getUser())) {
trackRunnableApp(next);
FSAppAttempt appSched = next;
next.getQueue().getRunnableAppSchedulables().add(appSched);
next.getQueue().addApp(appSched, true);
noLongerPendingApps.add(appSched);
if (noLongerPendingApps.size() >= maxRunnableApps) {
@ -218,8 +218,7 @@ public class MaxRunningAppsEnforcer {
// pull them out from under the iterator. If they are not in these lists
// in the first place, there is a bug.
for (FSAppAttempt appSched : noLongerPendingApps) {
if (!appSched.getQueue().getNonRunnableAppSchedulables()
.remove(appSched)) {
if (!appSched.getQueue().removeNonRunnableApp(appSched)) {
LOG.error("Can't make app runnable that does not already exist in queue"
+ " as non-runnable: " + appSched + ". This should never happen.");
}
@ -269,7 +268,8 @@ public class MaxRunningAppsEnforcer {
if (queue.getNumRunnableApps() < scheduler.getAllocationConfiguration()
.getQueueMaxApps(queue.getName())) {
if (queue instanceof FSLeafQueue) {
appLists.add(((FSLeafQueue)queue).getNonRunnableAppSchedulables());
appLists.add(
((FSLeafQueue)queue).getCopyOfNonRunnableAppSchedulables());
} else {
for (FSQueue child : queue.getChildQueues()) {
gatherPossiblyRunnableAppLists(child, appLists);

View File

@ -297,7 +297,7 @@ public class QueueManager {
if (queue instanceof FSLeafQueue) {
FSLeafQueue leafQueue = (FSLeafQueue)queue;
return queue.getNumRunnableApps() == 0 &&
leafQueue.getNonRunnableAppSchedulables().isEmpty();
leafQueue.getNumNonRunnableApps() == 0;
} else {
for (FSQueue child : queue.getChildQueues()) {
if (!isEmpty(child)) {

View File

@ -18,14 +18,11 @@
package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
import java.util.Collection;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlRootElement;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair
.FSAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSLeafQueue;
@ -40,15 +37,8 @@ public class FairSchedulerLeafQueueInfo extends FairSchedulerQueueInfo {
public FairSchedulerLeafQueueInfo(FSLeafQueue queue, FairScheduler scheduler) {
super(queue, scheduler);
Collection<FSAppAttempt> apps = queue.getRunnableAppSchedulables();
for (FSAppAttempt app : apps) {
if (app.isPending()) {
numPendingApps++;
} else {
numActiveApps++;
}
}
numPendingApps += queue.getNonRunnableAppSchedulables().size();
numPendingApps = queue.getNumPendingApps();
numActiveApps = queue.getNumActiveApps();
}
public int getNumActiveApplications() {

View File

@ -731,9 +731,9 @@ public class TestFairScheduler extends FairSchedulerTestBase {
ApplicationAttemptId appAttemptId = createAppAttemptId(1, 1);
createApplicationWithAMResource(appAttemptId, "default", "user1", null);
assertEquals(1, scheduler.getQueueManager().getLeafQueue("user1", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
assertEquals(0, scheduler.getQueueManager().getLeafQueue("default", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
assertEquals("root.user1", resourceManager.getRMContext().getRMApps()
.get(appAttemptId.getApplicationId()).getQueue());
}
@ -747,11 +747,11 @@ public class TestFairScheduler extends FairSchedulerTestBase {
ApplicationAttemptId appAttemptId = createAppAttemptId(1, 1);
createApplicationWithAMResource(appAttemptId, "default", "user2", null);
assertEquals(0, scheduler.getQueueManager().getLeafQueue("user1", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
assertEquals(1, scheduler.getQueueManager().getLeafQueue("default", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
assertEquals(0, scheduler.getQueueManager().getLeafQueue("user2", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
}
@Test
@ -1189,7 +1189,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
// That queue should have one app
assertEquals(1, scheduler.getQueueManager().getLeafQueue("user1", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
AppAttemptRemovedSchedulerEvent appRemovedEvent1 = new AppAttemptRemovedSchedulerEvent(
createAppAttemptId(1, 1), RMAppAttemptState.FINISHED, false);
@ -1199,7 +1199,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
// Queue should have no apps
assertEquals(0, scheduler.getQueueManager().getLeafQueue("user1", true)
.getRunnableAppSchedulables().size());
.getNumRunnableApps());
}
@Test
@ -1943,7 +1943,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
// The user1 queue should inherit the configurations from the root queue
FSLeafQueue userQueue =
scheduler.getQueueManager().getLeafQueue("user1", true);
assertEquals(1, userQueue.getRunnableAppSchedulables().size());
assertEquals(1, userQueue.getNumRunnableApps());
assertEquals(10000, userQueue.getMinSharePreemptionTimeout());
assertEquals(15000, userQueue.getFairSharePreemptionTimeout());
assertEquals(.6f, userQueue.getFairSharePreemptionThreshold(), 0.001);
@ -3179,21 +3179,15 @@ public class TestFairScheduler extends FairSchedulerTestBase {
private void verifyAppRunnable(ApplicationAttemptId attId, boolean runnable) {
FSAppAttempt app = scheduler.getSchedulerApp(attId);
FSLeafQueue queue = app.getQueue();
Collection<FSAppAttempt> runnableApps =
queue.getRunnableAppSchedulables();
Collection<FSAppAttempt> nonRunnableApps =
queue.getNonRunnableAppSchedulables();
assertEquals(runnable, runnableApps.contains(app));
assertEquals(!runnable, nonRunnableApps.contains(app));
assertEquals(runnable, queue.isRunnableApp(app));
assertEquals(!runnable, queue.isNonRunnableApp(app));
}
private void verifyQueueNumRunnable(String queueName, int numRunnableInQueue,
int numNonRunnableInQueue) {
FSLeafQueue queue = scheduler.getQueueManager().getLeafQueue(queueName, false);
assertEquals(numRunnableInQueue,
queue.getRunnableAppSchedulables().size());
assertEquals(numNonRunnableInQueue,
queue.getNonRunnableAppSchedulables().size());
assertEquals(numRunnableInQueue, queue.getNumRunnableApps());
assertEquals(numNonRunnableInQueue, queue.getNumNonRunnableApps());
}
@Test
@ -3740,23 +3734,23 @@ public class TestFairScheduler extends FairSchedulerTestBase {
// Should get put into jerry
createSchedulingRequest(1024, "jerry", "someuser");
assertEquals(1, jerryQueue.getRunnableAppSchedulables().size());
assertEquals(1, jerryQueue.getNumRunnableApps());
// Should get forced into default
createSchedulingRequest(1024, "newqueue", "someuser");
assertEquals(1, jerryQueue.getRunnableAppSchedulables().size());
assertEquals(1, defaultQueue.getRunnableAppSchedulables().size());
assertEquals(1, jerryQueue.getNumRunnableApps());
assertEquals(1, defaultQueue.getNumRunnableApps());
// Would get put into someuser because of user-as-default-queue, but should
// be forced into default
createSchedulingRequest(1024, "default", "someuser");
assertEquals(1, jerryQueue.getRunnableAppSchedulables().size());
assertEquals(2, defaultQueue.getRunnableAppSchedulables().size());
assertEquals(1, jerryQueue.getNumRunnableApps());
assertEquals(2, defaultQueue.getNumRunnableApps());
// Should get put into jerry because of user-as-default-queue
createSchedulingRequest(1024, "default", "jerry");
assertEquals(2, jerryQueue.getRunnableAppSchedulables().size());
assertEquals(2, defaultQueue.getRunnableAppSchedulables().size());
assertEquals(2, jerryQueue.getNumRunnableApps());
assertEquals(2, defaultQueue.getNumRunnableApps());
}
@Test
@ -3999,8 +3993,8 @@ public class TestFairScheduler extends FairSchedulerTestBase {
scheduler.moveApplication(appId, "queue2");
FSAppAttempt app = scheduler.getSchedulerApp(appAttId);
assertSame(targetQueue, app.getQueue());
assertFalse(oldQueue.getRunnableAppSchedulables().contains(app));
assertTrue(targetQueue.getRunnableAppSchedulables().contains(app));
assertFalse(oldQueue.isRunnableApp(app));
assertTrue(targetQueue.isRunnableApp(app));
assertEquals(Resource.newInstance(0, 0), oldQueue.getResourceUsage());
assertEquals(Resource.newInstance(1024, 1), targetQueue.getResourceUsage());
assertEquals(0, oldQueue.getNumRunnableApps());
@ -4049,12 +4043,12 @@ public class TestFairScheduler extends FairSchedulerTestBase {
createSchedulingRequest(1024, 1, "queue1", "user1", 3);
FSAppAttempt app = scheduler.getSchedulerApp(appAttId);
assertTrue(oldQueue.getNonRunnableAppSchedulables().contains(app));
assertTrue(oldQueue.isNonRunnableApp(app));
scheduler.moveApplication(appAttId.getApplicationId(), "queue2");
assertFalse(oldQueue.getNonRunnableAppSchedulables().contains(app));
assertFalse(targetQueue.getNonRunnableAppSchedulables().contains(app));
assertTrue(targetQueue.getRunnableAppSchedulables().contains(app));
assertFalse(oldQueue.isNonRunnableApp(app));
assertFalse(targetQueue.isNonRunnableApp(app));
assertTrue(targetQueue.isRunnableApp(app));
assertEquals(1, targetQueue.getNumRunnableApps());
assertEquals(1, queueMgr.getRootQueue().getNumRunnableApps());
}

View File

@ -97,13 +97,13 @@ public class TestMaxRunningAppsEnforcer {
FSAppAttempt app1 = addApp(leaf1, "user");
addApp(leaf2, "user");
addApp(leaf2, "user");
assertEquals(1, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
removeApp(app1);
assertEquals(0, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(0, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
}
@Test
@ -114,13 +114,13 @@ public class TestMaxRunningAppsEnforcer {
FSAppAttempt app1 = addApp(leaf1, "user");
addApp(leaf2, "user");
addApp(leaf2, "user");
assertEquals(1, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
removeApp(app1);
assertEquals(0, leaf1.getRunnableAppSchedulables().size());
assertEquals(2, leaf2.getRunnableAppSchedulables().size());
assertEquals(0, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(0, leaf1.getNumRunnableApps());
assertEquals(2, leaf2.getNumRunnableApps());
assertEquals(0, leaf2.getNumNonRunnableApps());
}
@Test
@ -133,14 +133,14 @@ public class TestMaxRunningAppsEnforcer {
addApp(leaf1, "user2");
addApp(leaf1, "user3");
addApp(leaf2, "user1");
assertEquals(2, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(2, leaf1.getNumRunnableApps());
assertEquals(1, leaf1.getNumNonRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
removeApp(app1);
assertEquals(2, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(0, leaf1.getNonRunnableAppSchedulables().size());
assertEquals(0, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(2, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(0, leaf1.getNumNonRunnableApps());
assertEquals(0, leaf2.getNumNonRunnableApps());
}
@Test
@ -153,14 +153,14 @@ public class TestMaxRunningAppsEnforcer {
addApp(leaf2, "user");
clock.tick(20);
addApp(leaf1, "user");
assertEquals(1, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(1, leaf1.getNumNonRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
removeApp(app1);
assertEquals(0, leaf1.getRunnableAppSchedulables().size());
assertEquals(2, leaf2.getRunnableAppSchedulables().size());
assertEquals(0, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(0, leaf1.getNumRunnableApps());
assertEquals(2, leaf2.getNumRunnableApps());
assertEquals(0, leaf2.getNumNonRunnableApps());
}
@Test
@ -172,13 +172,13 @@ public class TestMaxRunningAppsEnforcer {
addApp(leaf2, "user");
addApp(leaf2, "user");
addApp(leaf2, "user");
assertEquals(1, leaf1.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getRunnableAppSchedulables().size());
assertEquals(2, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(1, leaf1.getNumRunnableApps());
assertEquals(1, leaf2.getNumRunnableApps());
assertEquals(2, leaf2.getNumNonRunnableApps());
removeApp(app1);
assertEquals(0, leaf1.getRunnableAppSchedulables().size());
assertEquals(2, leaf2.getRunnableAppSchedulables().size());
assertEquals(1, leaf2.getNonRunnableAppSchedulables().size());
assertEquals(0, leaf1.getNumRunnableApps());
assertEquals(2, leaf2.getNumRunnableApps());
assertEquals(1, leaf2.getNumNonRunnableApps());
}
@Test