YARN-9539.Improve cleanup process of app activities and make some conditions configurable. Contributed by Tao Yang.
This commit is contained in:
parent
ff27e8eabd
commit
1a47c2b7ae
|
@ -4005,6 +4005,41 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String DEFAULT_NM_NUMA_AWARENESS_NUMACTL_CMD =
|
public static final String DEFAULT_NM_NUMA_AWARENESS_NUMACTL_CMD =
|
||||||
"/usr/bin/numactl";
|
"/usr/bin/numactl";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Settings for activities manager.
|
||||||
|
*/
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_PREFIX =
|
||||||
|
RM_PREFIX + "activities-manager.";
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_PREFIX =
|
||||||
|
RM_ACTIVITIES_MANAGER_PREFIX + "scheduler-activities.";
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX =
|
||||||
|
RM_ACTIVITIES_MANAGER_PREFIX + "app-activities.";
|
||||||
|
|
||||||
|
/** The cleanup interval for activities in milliseconds. **/
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS =
|
||||||
|
RM_ACTIVITIES_MANAGER_PREFIX + "cleanup-interval-ms";
|
||||||
|
public static final long DEFAULT_RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS =
|
||||||
|
5000L;
|
||||||
|
|
||||||
|
/** Time to live for scheduler activities in milliseconds. **/
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS =
|
||||||
|
RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_PREFIX + "ttl-ms";
|
||||||
|
public static final long
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS = 600000L;
|
||||||
|
|
||||||
|
/** Time to live for app activities in milliseconds. **/
|
||||||
|
public static final String RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS =
|
||||||
|
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX + "ttl-ms";
|
||||||
|
public static final long DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS =
|
||||||
|
600000L;
|
||||||
|
|
||||||
|
/** Max queue length for app activities. **/
|
||||||
|
public static final String
|
||||||
|
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH =
|
||||||
|
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_PREFIX + "max-queue-length";
|
||||||
|
public static final int
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH = 1000;
|
||||||
|
|
||||||
public YarnConfiguration() {
|
public YarnConfiguration() {
|
||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
|
@ -4187,4 +4187,28 @@
|
||||||
<name>yarn.nodemanager.csi-driver.names</name>
|
<name>yarn.nodemanager.csi-driver.names</name>
|
||||||
<value></value>
|
<value></value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The cleanup interval for activities in milliseconds.</description>
|
||||||
|
<name>yarn.resourcemanager.activities-manager.cleanup-interval-ms</name>
|
||||||
|
<value>5000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Time to live for scheduler activities in milliseconds.</description>
|
||||||
|
<name>yarn.resourcemanager.activities-manager.scheduler-activities.ttl-ms</name>
|
||||||
|
<value>600000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Time to live for app activities in milliseconds.</description>
|
||||||
|
<name>yarn.resourcemanager.activities-manager.app-activities.ttl-ms</name>
|
||||||
|
<value>600000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Max queue length for app activities.</description>
|
||||||
|
<name>yarn.resourcemanager.activities-manager.app-activities.max-queue-length</name>
|
||||||
|
<value>1000</value>
|
||||||
|
</property>
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
|
@ -19,9 +19,11 @@
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities;
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.activities;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
|
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
|
@ -72,7 +74,10 @@ public class ActivitiesManager extends AbstractService {
|
||||||
private boolean recordNextAvailableNode = false;
|
private boolean recordNextAvailableNode = false;
|
||||||
private List<NodeAllocation> lastAvailableNodeActivities = null;
|
private List<NodeAllocation> lastAvailableNodeActivities = null;
|
||||||
private Thread cleanUpThread;
|
private Thread cleanUpThread;
|
||||||
private int timeThreshold = 600 * 1000;
|
private long activitiesCleanupIntervalMs;
|
||||||
|
private long schedulerActivitiesTTL;
|
||||||
|
private long appActivitiesTTL;
|
||||||
|
private int appActivitiesMaxQueueLength;
|
||||||
private final RMContext rmContext;
|
private final RMContext rmContext;
|
||||||
private volatile boolean stopped;
|
private volatile boolean stopped;
|
||||||
private ThreadLocal<DiagnosticsCollectorManager> diagnosticCollectorManager;
|
private ThreadLocal<DiagnosticsCollectorManager> diagnosticCollectorManager;
|
||||||
|
@ -89,6 +94,28 @@ public class ActivitiesManager extends AbstractService {
|
||||||
() -> new DiagnosticsCollectorManager(
|
() -> new DiagnosticsCollectorManager(
|
||||||
new GenericDiagnosticsCollector()));
|
new GenericDiagnosticsCollector()));
|
||||||
this.rmContext = rmContext;
|
this.rmContext = rmContext;
|
||||||
|
if (rmContext.getYarnConfiguration() != null) {
|
||||||
|
setupConfForCleanup(rmContext.getYarnConfiguration());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setupConfForCleanup(Configuration conf) {
|
||||||
|
activitiesCleanupIntervalMs = conf.getLong(
|
||||||
|
YarnConfiguration.RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS,
|
||||||
|
YarnConfiguration.
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS);
|
||||||
|
schedulerActivitiesTTL = conf.getLong(
|
||||||
|
YarnConfiguration.RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS,
|
||||||
|
YarnConfiguration.
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_SCHEDULER_ACTIVITIES_TTL_MS);
|
||||||
|
appActivitiesTTL = conf.getLong(
|
||||||
|
YarnConfiguration.RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS,
|
||||||
|
YarnConfiguration.
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS);
|
||||||
|
appActivitiesMaxQueueLength = conf.getInt(YarnConfiguration.
|
||||||
|
RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH,
|
||||||
|
YarnConfiguration.
|
||||||
|
DEFAULT_RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_MAX_QUEUE_LENGTH);
|
||||||
}
|
}
|
||||||
|
|
||||||
public AppActivitiesInfo getAppActivitiesInfo(ApplicationId applicationId,
|
public AppActivitiesInfo getAppActivitiesInfo(ApplicationId applicationId,
|
||||||
|
@ -152,12 +179,13 @@ public class ActivitiesManager extends AbstractService {
|
||||||
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
||||||
Iterator<Map.Entry<NodeId, List<NodeAllocation>>> ite =
|
Iterator<Map.Entry<NodeId, List<NodeAllocation>>> ite =
|
||||||
completedNodeAllocations.entrySet().iterator();
|
completedNodeAllocations.entrySet().iterator();
|
||||||
|
long curTS = SystemClock.getInstance().getTime();
|
||||||
while (ite.hasNext()) {
|
while (ite.hasNext()) {
|
||||||
Map.Entry<NodeId, List<NodeAllocation>> nodeAllocation = ite.next();
|
Map.Entry<NodeId, List<NodeAllocation>> nodeAllocation = ite.next();
|
||||||
List<NodeAllocation> allocations = nodeAllocation.getValue();
|
List<NodeAllocation> allocations = nodeAllocation.getValue();
|
||||||
long currTS = SystemClock.getInstance().getTime();
|
if (allocations.size() > 0
|
||||||
if (allocations.size() > 0 && allocations.get(0).getTimeStamp()
|
&& curTS - allocations.get(0).getTimeStamp()
|
||||||
- currTS > timeThreshold) {
|
> schedulerActivitiesTTL) {
|
||||||
ite.remove();
|
ite.remove();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -171,11 +199,29 @@ public class ActivitiesManager extends AbstractService {
|
||||||
if (rmApp == null || rmApp.getFinalApplicationStatus()
|
if (rmApp == null || rmApp.getFinalApplicationStatus()
|
||||||
!= FinalApplicationStatus.UNDEFINED) {
|
!= FinalApplicationStatus.UNDEFINED) {
|
||||||
iteApp.remove();
|
iteApp.remove();
|
||||||
|
} else {
|
||||||
|
Iterator<AppAllocation> appActivitiesIt =
|
||||||
|
appAllocation.getValue().iterator();
|
||||||
|
while (appActivitiesIt.hasNext()) {
|
||||||
|
if (curTS - appActivitiesIt.next().getTime()
|
||||||
|
> appActivitiesTTL) {
|
||||||
|
appActivitiesIt.remove();
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (appAllocation.getValue().isEmpty()) {
|
||||||
|
iteApp.remove();
|
||||||
|
LOG.debug("Removed all expired activities from cache for {}.",
|
||||||
|
rmApp.getApplicationId());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG.debug("Remaining apps in app activities cache: {}",
|
||||||
|
completedAppAllocations.keySet());
|
||||||
try {
|
try {
|
||||||
Thread.sleep(5000);
|
Thread.sleep(activitiesCleanupIntervalMs);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
LOG.info(getName() + " thread interrupted");
|
LOG.info(getName() + " thread interrupted");
|
||||||
break;
|
break;
|
||||||
|
@ -290,7 +336,7 @@ public class ActivitiesManager extends AbstractService {
|
||||||
appAllocations = curAppAllocations;
|
appAllocations = curAppAllocations;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (appAllocations.size() == 1000) {
|
if (appAllocations.size() == appActivitiesMaxQueueLength) {
|
||||||
appAllocations.poll();
|
appAllocations.poll();
|
||||||
}
|
}
|
||||||
appAllocations.add(appAllocation);
|
appAllocations.add(appAllocation);
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
|
package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
@ -77,4 +78,9 @@ public class AppActivitiesInfo {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public List<AppAllocationInfo> getAllocations() {
|
||||||
|
return allocations;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,10 +30,13 @@ import java.util.concurrent.LinkedBlockingQueue;
|
||||||
import java.util.concurrent.ThreadPoolExecutor;
|
import java.util.concurrent.ThreadPoolExecutor;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||||
import org.apache.hadoop.yarn.api.records.Priority;
|
import org.apache.hadoop.yarn.api.records.Priority;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
|
||||||
|
@ -43,6 +46,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppActivitiesInfo;
|
||||||
import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
|
import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
|
||||||
import org.apache.hadoop.yarn.util.SystemClock;
|
import org.apache.hadoop.yarn.util.SystemClock;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
|
@ -81,6 +85,8 @@ public class TestActivitiesManager {
|
||||||
@Before
|
@Before
|
||||||
public void setup() {
|
public void setup() {
|
||||||
rmContext = Mockito.mock(RMContext.class);
|
rmContext = Mockito.mock(RMContext.class);
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
Mockito.when(rmContext.getYarnConfiguration()).thenReturn(conf);
|
||||||
ResourceScheduler scheduler = Mockito.mock(ResourceScheduler.class);
|
ResourceScheduler scheduler = Mockito.mock(ResourceScheduler.class);
|
||||||
Mockito.when(scheduler.getMinimumResourceCapability())
|
Mockito.when(scheduler.getMinimumResourceCapability())
|
||||||
.thenReturn(Resources.none());
|
.thenReturn(Resources.none());
|
||||||
|
@ -95,6 +101,8 @@ public class TestActivitiesManager {
|
||||||
RMApp mockApp = Mockito.mock(RMApp.class);
|
RMApp mockApp = Mockito.mock(RMApp.class);
|
||||||
Mockito.doReturn(appAttemptId.getApplicationId()).when(mockApp)
|
Mockito.doReturn(appAttemptId.getApplicationId()).when(mockApp)
|
||||||
.getApplicationId();
|
.getApplicationId();
|
||||||
|
Mockito.doReturn(FinalApplicationStatus.UNDEFINED).when(mockApp)
|
||||||
|
.getFinalApplicationStatus();
|
||||||
rmApps.put(appAttemptId.getApplicationId(), mockApp);
|
rmApps.put(appAttemptId.getApplicationId(), mockApp);
|
||||||
FiCaSchedulerApp app =
|
FiCaSchedulerApp app =
|
||||||
new FiCaSchedulerApp(appAttemptId, "user", mockQueue,
|
new FiCaSchedulerApp(appAttemptId, "user", mockQueue,
|
||||||
|
@ -245,6 +253,51 @@ public class TestActivitiesManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test (timeout = 30000)
|
||||||
|
public void testAppActivitiesTTL() throws Exception {
|
||||||
|
long cleanupIntervalMs = 100;
|
||||||
|
long appActivitiesTTL = 1000;
|
||||||
|
rmContext.getYarnConfiguration()
|
||||||
|
.setLong(YarnConfiguration.RM_ACTIVITIES_MANAGER_CLEANUP_INTERVAL_MS,
|
||||||
|
cleanupIntervalMs);
|
||||||
|
rmContext.getYarnConfiguration()
|
||||||
|
.setLong(YarnConfiguration.RM_ACTIVITIES_MANAGER_APP_ACTIVITIES_TTL_MS,
|
||||||
|
appActivitiesTTL);
|
||||||
|
ActivitiesManager newActivitiesManager = new ActivitiesManager(rmContext);
|
||||||
|
newActivitiesManager.serviceStart();
|
||||||
|
// start recording activities for first app and first node
|
||||||
|
SchedulerApplicationAttempt app = apps.get(0);
|
||||||
|
FiCaSchedulerNode node = (FiCaSchedulerNode) nodes.get(0);
|
||||||
|
newActivitiesManager
|
||||||
|
.turnOnAppActivitiesRecording(app.getApplicationId(), 3);
|
||||||
|
int numActivities = 10;
|
||||||
|
for (int i = 0; i < numActivities; i++) {
|
||||||
|
ActivitiesLogger.APP
|
||||||
|
.startAppAllocationRecording(newActivitiesManager, node,
|
||||||
|
SystemClock.getInstance().getTime(), app);
|
||||||
|
ActivitiesLogger.APP
|
||||||
|
.recordAppActivityWithoutAllocation(newActivitiesManager, node, app,
|
||||||
|
new SchedulerRequestKey(Priority.newInstance(0), 0, null),
|
||||||
|
ActivityDiagnosticConstant.FAIL_TO_ALLOCATE,
|
||||||
|
ActivityState.REJECTED);
|
||||||
|
ActivitiesLogger.APP
|
||||||
|
.finishAllocatedAppAllocationRecording(newActivitiesManager,
|
||||||
|
app.getApplicationId(), null, ActivityState.SKIPPED,
|
||||||
|
ActivityDiagnosticConstant.SKIPPED_ALL_PRIORITIES);
|
||||||
|
}
|
||||||
|
AppActivitiesInfo appActivitiesInfo = newActivitiesManager
|
||||||
|
.getAppActivitiesInfo(app.getApplicationId(), null, null);
|
||||||
|
Assert.assertEquals(numActivities,
|
||||||
|
appActivitiesInfo.getAllocations().size());
|
||||||
|
// sleep until all app activities expired
|
||||||
|
Thread.sleep(cleanupIntervalMs + appActivitiesTTL);
|
||||||
|
// there should be no remaining app activities
|
||||||
|
appActivitiesInfo = newActivitiesManager
|
||||||
|
.getAppActivitiesInfo(app.getApplicationId(), null, null);
|
||||||
|
Assert.assertEquals(0,
|
||||||
|
appActivitiesInfo.getAllocations().size());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Testing activities manager which can record all history information about
|
* Testing activities manager which can record all history information about
|
||||||
* node allocations.
|
* node allocations.
|
||||||
|
|
Loading…
Reference in New Issue