YARN-10996. Fix race condition of User object acquisitions. Contributed by Andras Gyori
This commit is contained in:
parent
db89a9411e
commit
e220e88eca
|
@ -30,6 +30,7 @@ import java.util.PriorityQueue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.UsersManager.User;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
|
@ -419,8 +420,10 @@ public class FifoIntraQueuePreemptionPlugin
|
||||||
String userName = app.getUser();
|
String userName = app.getUser();
|
||||||
TempUserPerPartition tmpUser = usersPerPartition.get(userName);
|
TempUserPerPartition tmpUser = usersPerPartition.get(userName);
|
||||||
if (tmpUser == null) {
|
if (tmpUser == null) {
|
||||||
ResourceUsage userResourceUsage = tq.leafQueue.getUser(userName)
|
// User might have already been removed, but preemption still accounts for this app,
|
||||||
.getResourceUsage();
|
// therefore reinserting the user will not cause a memory leak
|
||||||
|
User user = tq.leafQueue.getOrCreateUser(userName);
|
||||||
|
ResourceUsage userResourceUsage = user.getResourceUsage();
|
||||||
|
|
||||||
// perUserAMUsed was populated with running apps, now we are looping
|
// perUserAMUsed was populated with running apps, now we are looping
|
||||||
// through both running and pending apps.
|
// through both running and pending apps.
|
||||||
|
@ -428,8 +431,7 @@ public class FifoIntraQueuePreemptionPlugin
|
||||||
amUsed = (userSpecificAmUsed == null)
|
amUsed = (userSpecificAmUsed == null)
|
||||||
? Resources.none() : userSpecificAmUsed;
|
? Resources.none() : userSpecificAmUsed;
|
||||||
|
|
||||||
tmpUser = new TempUserPerPartition(
|
tmpUser = new TempUserPerPartition(user, tq.queueName,
|
||||||
tq.leafQueue.getUser(userName), tq.queueName,
|
|
||||||
Resources.clone(userResourceUsage.getUsed(partition)),
|
Resources.clone(userResourceUsage.getUsed(partition)),
|
||||||
Resources.clone(amUsed),
|
Resources.clone(amUsed),
|
||||||
Resources.clone(userResourceUsage.getReserved(partition)),
|
Resources.clone(userResourceUsage.getReserved(partition)),
|
||||||
|
|
|
@ -519,6 +519,11 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
return usersManager.getUser(userName);
|
return usersManager.getUser(userName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public User getOrCreateUser(String userName) {
|
||||||
|
return usersManager.getUserAndAddIfAbsent(userName);
|
||||||
|
}
|
||||||
|
|
||||||
@Private
|
@Private
|
||||||
public List<AppPriorityACLGroup> getPriorityACLs() {
|
public List<AppPriorityACLGroup> getPriorityACLs() {
|
||||||
readLock.lock();
|
readLock.lock();
|
||||||
|
@ -2007,7 +2012,12 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
|
|
||||||
public void incAMUsedResource(String nodeLabel, Resource resourceToInc,
|
public void incAMUsedResource(String nodeLabel, Resource resourceToInc,
|
||||||
SchedulerApplicationAttempt application) {
|
SchedulerApplicationAttempt application) {
|
||||||
getUser(application.getUser()).getResourceUsage().incAMUsed(nodeLabel,
|
User user = getUser(application.getUser());
|
||||||
|
if (user == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
user.getResourceUsage().incAMUsed(nodeLabel,
|
||||||
resourceToInc);
|
resourceToInc);
|
||||||
// ResourceUsage has its own lock, no addition lock needs here.
|
// ResourceUsage has its own lock, no addition lock needs here.
|
||||||
usageTracker.getQueueUsage().incAMUsed(nodeLabel, resourceToInc);
|
usageTracker.getQueueUsage().incAMUsed(nodeLabel, resourceToInc);
|
||||||
|
@ -2015,7 +2025,12 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
|
|
||||||
public void decAMUsedResource(String nodeLabel, Resource resourceToDec,
|
public void decAMUsedResource(String nodeLabel, Resource resourceToDec,
|
||||||
SchedulerApplicationAttempt application) {
|
SchedulerApplicationAttempt application) {
|
||||||
getUser(application.getUser()).getResourceUsage().decAMUsed(nodeLabel,
|
User user = getUser(application.getUser());
|
||||||
|
if (user == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
user.getResourceUsage().decAMUsed(nodeLabel,
|
||||||
resourceToDec);
|
resourceToDec);
|
||||||
// ResourceUsage has its own lock, no addition lock needs here.
|
// ResourceUsage has its own lock, no addition lock needs here.
|
||||||
usageTracker.getQueueUsage().decAMUsed(nodeLabel, resourceToDec);
|
usageTracker.getQueueUsage().decAMUsed(nodeLabel, resourceToDec);
|
||||||
|
@ -2103,7 +2118,7 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
for (FiCaSchedulerApp app : getApplications()) {
|
for (FiCaSchedulerApp app : getApplications()) {
|
||||||
String userName = app.getUser();
|
String userName = app.getUser();
|
||||||
if (!userNameToHeadroom.containsKey(userName)) {
|
if (!userNameToHeadroom.containsKey(userName)) {
|
||||||
User user = getUser(userName);
|
User user = getUsersManager().getUserAndAddIfAbsent(userName);
|
||||||
Resource headroom = Resources.subtract(
|
Resource headroom = Resources.subtract(
|
||||||
getResourceLimitForActiveUsers(app.getUser(), clusterResources,
|
getResourceLimitForActiveUsers(app.getUser(), clusterResources,
|
||||||
partition, SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY),
|
partition, SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY),
|
||||||
|
|
|
@ -817,6 +817,7 @@ public class UsersManager implements AbstractUsersManager {
|
||||||
lQueue.getMinimumAllocation());
|
lQueue.getMinimumAllocation());
|
||||||
|
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
|
float weight = lQueue.getUserWeights().getByUser(userName);
|
||||||
LOG.debug("User limit computation for " + userName
|
LOG.debug("User limit computation for " + userName
|
||||||
+ ", in queue: " + lQueue.getQueuePath()
|
+ ", in queue: " + lQueue.getQueuePath()
|
||||||
+ ", userLimitPercent=" + lQueue.getUserLimit()
|
+ ", userLimitPercent=" + lQueue.getUserLimit()
|
||||||
|
@ -834,7 +835,7 @@ public class UsersManager implements AbstractUsersManager {
|
||||||
+ ", Partition=" + nodePartition
|
+ ", Partition=" + nodePartition
|
||||||
+ ", resourceUsed=" + resourceUsed
|
+ ", resourceUsed=" + resourceUsed
|
||||||
+ ", maxUserLimit=" + maxUserLimit
|
+ ", maxUserLimit=" + maxUserLimit
|
||||||
+ ", userWeight=" + getUser(userName).getWeight()
|
+ ", userWeight=" + weight
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return userLimitResource;
|
return userLimitResource;
|
||||||
|
|
|
@ -165,6 +165,7 @@ class MockApplications {
|
||||||
user.setResourceUsage(userResourceUsage.get(userName));
|
user.setResourceUsage(userResourceUsage.get(userName));
|
||||||
}
|
}
|
||||||
when(queue.getUser(eq(userName))).thenReturn(user);
|
when(queue.getUser(eq(userName))).thenReturn(user);
|
||||||
|
when(queue.getOrCreateUser(eq(userName))).thenReturn(user);
|
||||||
when(queue.getResourceLimitForAllUsers(eq(userName),
|
when(queue.getResourceLimitForAllUsers(eq(userName),
|
||||||
any(Resource.class), anyString(), any(SchedulingMode.class)))
|
any(Resource.class), anyString(), any(SchedulingMode.class)))
|
||||||
.thenReturn(userLimit);
|
.thenReturn(userLimit);
|
||||||
|
|
Loading…
Reference in New Issue