YARN-4546. ResourceManager crash due to scheduling opportunity overflow. Contributed by Jason Lowe.

(cherry picked from commit c1462a67ff)
(cherry picked from commit 1cc001db4c)
This commit is contained in:
Junping Du 2016-01-06 05:49:24 -08:00
parent 6dcf3cb6a1
commit 49ba43e05b
3 changed files with 40 additions and 2 deletions

View File

@ -1110,6 +1110,9 @@ Release 2.8.0 - UNRELEASED
YARN-1382. Remove unusableRMNodesConcurrentSet (never used) in NodeListManager
to get rid of memory leak. (Rohith Sharma K S via junping_du)
YARN-4546. ResourceManager crash due to scheduling opportunity overflow.
(Jason Lowe via junping_du)
Release 2.7.3 - UNRELEASED
INCOMPATIBLE CHANGES
@ -1161,6 +1164,9 @@ Release 2.7.3 - UNRELEASED
YARN-4510. Fix SLS startup failure caused by NPE. (Bibin A Chundatt via wangda)
YARN-4546. ResourceManager crash due to scheduling opportunity overflow.
(Jason Lowe via junping_du)
Release 2.7.2 - UNRELEASED
INCOMPATIBLE CHANGES
@ -2018,6 +2024,9 @@ Release 2.6.4 - UNRELEASED
YARN-4452. NPE when submit Unmanaged application. (Naganarasimha G R
via junping_du)
YARN-4546. ResourceManager crash due to scheduling opportunity overflow.
(Jason Lowe via junping_du)
Release 2.6.3 - 2015-12-17
INCOMPATIBLE CHANGES

View File

@ -625,8 +625,10 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
/**
 * Records one more scheduling opportunity for the given priority.
 *
 * <p>The counter saturates at {@link Integer#MAX_VALUE}: once it has reached
 * that value, further calls leave it unchanged instead of letting
 * {@code count + 1} wrap around to a negative number.
 *
 * @param priority the priority whose opportunity count is incremented
 */
public synchronized void addSchedulingOpportunity(Priority priority) {
  int count = schedulingOpportunities.count(priority);
  // Increment exactly once per call, and only while there is headroom, so
  // the stored count can never overflow.
  if (count < Integer.MAX_VALUE) {
    schedulingOpportunities.setCount(priority, count + 1);
  }
}
public synchronized void subtractSchedulingOpportunity(Priority priority) {
@ -661,6 +663,11 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
schedulingOpportunities.setCount(priority, 0);
}
/**
 * Overwrites the scheduling-opportunity count stored for the given priority.
 *
 * <p>Test-only hook: lets a test start from an arbitrary count. It is
 * synchronized for consistency with {@code addSchedulingOpportunity} and
 * {@code subtractSchedulingOpportunity}, which mutate the same counter under
 * this object's monitor.
 *
 * @param priority the priority whose count is replaced
 * @param count the new count to store for {@code priority}
 */
@VisibleForTesting
synchronized void setSchedulingOpportunities(Priority priority, int count) {
  schedulingOpportunities.setCount(priority, count);
}
synchronized AggregateAppResourceUsage getRunningAggregateAppResourceUsage() {
long currentTimeMillis = System.currentTimeMillis();
// Don't walk the whole container list if the resources were computed

View File

@ -249,4 +249,26 @@ public class TestSchedulerApplicationAttempt {
assertEquals(0.0f, app.getResourceUsageReport().getClusterUsagePercentage(),
0.0f);
}
/**
 * Checks that the per-priority scheduling-opportunity counter increments by
 * one per call and stops at Integer.MAX_VALUE rather than overflowing.
 */
@Test
public void testSchedulingOpportunityOverflow() throws Exception {
  final int cap = Integer.MAX_VALUE;

  ApplicationAttemptId appAttemptId = createAppAttemptId(0, 0);
  Queue testQueue = createQueue("test", null);
  RMContext context = mock(RMContext.class);
  when(context.getEpoch()).thenReturn(3L);
  SchedulerApplicationAttempt attempt = new SchedulerApplicationAttempt(
      appAttemptId, "user", testQueue, testQueue.getActiveUsersManager(),
      context);
  Priority testPriority = Priority.newInstance(1);

  // A fresh attempt starts at zero and a single add yields exactly one.
  assertEquals(0, attempt.getSchedulingOpportunities(testPriority));
  attempt.addSchedulingOpportunity(testPriority);
  assertEquals(1, attempt.getSchedulingOpportunities(testPriority));

  // Jump to one below the cap so the boundary can be exercised directly.
  attempt.setSchedulingOpportunities(testPriority, cap - 1);
  assertEquals(cap - 1, attempt.getSchedulingOpportunities(testPriority));

  // The first add reaches the cap; the second must leave it there.
  for (int i = 0; i < 2; i++) {
    attempt.addSchedulingOpportunity(testPriority);
    assertEquals(cap, attempt.getSchedulingOpportunities(testPriority));
  }
}
}