YARN-10471. Prevent logs for any container from becoming larger than a configurable size. Contributed by Eric Payne
This commit is contained in:
parent
f17e067d52
commit
8ee6bc2518
|
@ -78,4 +78,9 @@ public class ContainerExitStatus {
|
||||||
*/
|
*/
|
||||||
public static final int KILLED_BY_CONTAINER_SCHEDULER = -108;
|
public static final int KILLED_BY_CONTAINER_SCHEDULER = -108;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Container was terminated for generating excess log data.
|
||||||
|
*/
|
||||||
|
public static final int KILLED_FOR_EXCESS_LOGS = -109;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1912,6 +1912,25 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String APPLICATION_TAG_BASED_PLACEMENT_USER_WHITELIST =
|
public static final String APPLICATION_TAG_BASED_PLACEMENT_USER_WHITELIST =
|
||||||
APPLICATION_TAG_BASED_PLACEMENT_PREFIX + ".username.whitelist";
|
APPLICATION_TAG_BASED_PLACEMENT_PREFIX + ".username.whitelist";
|
||||||
|
|
||||||
|
/** Enable switch for container log monitoring. */
|
||||||
|
public static final String NM_CONTAINER_LOG_MONITOR_ENABLED =
|
||||||
|
NM_PREFIX + "container-log-monitor.enable";
|
||||||
|
public static final boolean DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED = false;
|
||||||
|
/** How often to monitor logs generated by containers. */
|
||||||
|
public static final String NM_CONTAINER_LOG_MON_INTERVAL_MS =
|
||||||
|
NM_PREFIX + "container-log-monitor.interval-ms";
|
||||||
|
public static final int DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS = 60000;
|
||||||
|
/** The disk space limit for a single container log directory. */
|
||||||
|
public static final String NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES =
|
||||||
|
NM_PREFIX + "container-log-monitor.dir-size-limit-bytes";
|
||||||
|
public static final long DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES =
|
||||||
|
1000000000L;
|
||||||
|
/** The disk space limit for all of a container's logs. */
|
||||||
|
public static final String NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES =
|
||||||
|
NM_PREFIX + "container-log-monitor.total-size-limit-bytes";
|
||||||
|
public static final long DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES =
|
||||||
|
10000000000L;
|
||||||
|
|
||||||
/** Enable/disable container metrics. */
|
/** Enable/disable container metrics. */
|
||||||
@Private
|
@Private
|
||||||
public static final String NM_CONTAINER_METRICS_ENABLE =
|
public static final String NM_CONTAINER_METRICS_ENABLE =
|
||||||
|
|
|
@ -1665,6 +1665,34 @@
|
||||||
<name>yarn.nodemanager.container-monitor.interval-ms</name>
|
<name>yarn.nodemanager.container-monitor.interval-ms</name>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Flag to enable the container log monitor which enforces
|
||||||
|
container log directory size limits.</description>
|
||||||
|
<name>yarn.nodemanager.container-log-monitor.enable</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>How often to check the usage of a container's log directories
|
||||||
|
in milliseconds</description>
|
||||||
|
<name>yarn.nodemanager.container-log-monitor.interval-ms</name>
|
||||||
|
<value>60000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The disk space limit, in bytes, for a single
|
||||||
|
container log directory</description>
|
||||||
|
<name>yarn.nodemanager.container-log-monitor.dir-size-limit-bytes</name>
|
||||||
|
<value>1000000000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The disk space limit, in bytes, for all of a container's
|
||||||
|
logs</description>
|
||||||
|
<name>yarn.nodemanager.container-log-monitor.total-size-limit-bytes</name>
|
||||||
|
<value>10000000000</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>Class that calculates containers current resource utilization.
|
<description>Class that calculates containers current resource utilization.
|
||||||
If not set, the value for yarn.nodemanager.resource-calculator.class will
|
If not set, the value for yarn.nodemanager.resource-calculator.class will
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileUtil;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
|
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||||
|
@ -45,11 +46,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
|
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
|
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsUtils;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
||||||
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.io.File;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
@ -67,6 +71,10 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
|
|
||||||
private long monitoringInterval;
|
private long monitoringInterval;
|
||||||
private MonitoringThread monitoringThread;
|
private MonitoringThread monitoringThread;
|
||||||
|
private int logCheckInterval;
|
||||||
|
private LogMonitorThread logMonitorThread;
|
||||||
|
private long logDirSizeLimit;
|
||||||
|
private long logTotalSizeLimit;
|
||||||
private CGroupElasticMemoryController oomListenerThread;
|
private CGroupElasticMemoryController oomListenerThread;
|
||||||
private boolean containerMetricsEnabled;
|
private boolean containerMetricsEnabled;
|
||||||
private long containerMetricsPeriodMs;
|
private long containerMetricsPeriodMs;
|
||||||
|
@ -94,6 +102,7 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
private boolean elasticMemoryEnforcement;
|
private boolean elasticMemoryEnforcement;
|
||||||
private boolean strictMemoryEnforcement;
|
private boolean strictMemoryEnforcement;
|
||||||
private boolean containersMonitorEnabled;
|
private boolean containersMonitorEnabled;
|
||||||
|
private boolean logMonitorEnabled;
|
||||||
|
|
||||||
private long maxVCoresAllottedForContainers;
|
private long maxVCoresAllottedForContainers;
|
||||||
|
|
||||||
|
@ -122,6 +131,8 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
|
|
||||||
this.monitoringThread = new MonitoringThread();
|
this.monitoringThread = new MonitoringThread();
|
||||||
|
|
||||||
|
this.logMonitorThread = new LogMonitorThread();
|
||||||
|
|
||||||
this.containersUtilization = ResourceUtilization.newInstance(0, 0, 0.0f);
|
this.containersUtilization = ResourceUtilization.newInstance(0, 0, 0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -133,6 +144,16 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
this.conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS,
|
this.conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS,
|
||||||
YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS));
|
YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS));
|
||||||
|
|
||||||
|
this.logCheckInterval =
|
||||||
|
conf.getInt(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS);
|
||||||
|
this.logDirSizeLimit =
|
||||||
|
conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES);
|
||||||
|
this.logTotalSizeLimit =
|
||||||
|
conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES);
|
||||||
|
|
||||||
this.resourceCalculatorPlugin =
|
this.resourceCalculatorPlugin =
|
||||||
ResourceCalculatorPlugin.getContainersMonitorPlugin(this.conf);
|
ResourceCalculatorPlugin.getContainersMonitorPlugin(this.conf);
|
||||||
LOG.info("Using ResourceCalculatorPlugin: {}",
|
LOG.info("Using ResourceCalculatorPlugin: {}",
|
||||||
|
@ -214,6 +235,11 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
isContainerMonitorEnabled() && monitoringInterval > 0;
|
isContainerMonitorEnabled() && monitoringInterval > 0;
|
||||||
LOG.info("ContainersMonitor enabled: {}", containersMonitorEnabled);
|
LOG.info("ContainersMonitor enabled: {}", containersMonitorEnabled);
|
||||||
|
|
||||||
|
logMonitorEnabled =
|
||||||
|
conf.getBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED,
|
||||||
|
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED);
|
||||||
|
LOG.info("Container Log Monitor Enabled: "+ logMonitorEnabled);
|
||||||
|
|
||||||
nodeCpuPercentageForYARN =
|
nodeCpuPercentageForYARN =
|
||||||
NodeManagerHardwareUtils.getNodeCpuPercentage(this.conf);
|
NodeManagerHardwareUtils.getNodeCpuPercentage(this.conf);
|
||||||
|
|
||||||
|
@ -284,13 +310,16 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
if (oomListenerThread != null) {
|
if (oomListenerThread != null) {
|
||||||
oomListenerThread.start();
|
oomListenerThread.start();
|
||||||
}
|
}
|
||||||
|
if (logMonitorEnabled) {
|
||||||
|
this.logMonitorThread.start();
|
||||||
|
}
|
||||||
super.serviceStart();
|
super.serviceStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void serviceStop() throws Exception {
|
protected void serviceStop() throws Exception {
|
||||||
|
stopped = true;
|
||||||
if (containersMonitorEnabled) {
|
if (containersMonitorEnabled) {
|
||||||
stopped = true;
|
|
||||||
this.monitoringThread.interrupt();
|
this.monitoringThread.interrupt();
|
||||||
try {
|
try {
|
||||||
this.monitoringThread.join();
|
this.monitoringThread.join();
|
||||||
|
@ -306,6 +335,13 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (logMonitorEnabled) {
|
||||||
|
this.logMonitorThread.interrupt();
|
||||||
|
try {
|
||||||
|
this.logMonitorThread.join();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
}
|
||||||
|
}
|
||||||
super.serviceStop();
|
super.serviceStop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -752,7 +788,8 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isMemoryOverLimit) {
|
if (isMemoryOverLimit
|
||||||
|
&& trackingContainers.remove(containerId) != null) {
|
||||||
// Virtual or physical memory over limit. Fail the container and
|
// Virtual or physical memory over limit. Fail the container and
|
||||||
// remove
|
// remove
|
||||||
// the corresponding process tree
|
// the corresponding process tree
|
||||||
|
@ -766,7 +803,6 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
eventDispatcher.getEventHandler().handle(
|
eventDispatcher.getEventHandler().handle(
|
||||||
new ContainerKillEvent(containerId,
|
new ContainerKillEvent(containerId,
|
||||||
containerExitStatus, msg));
|
containerExitStatus, msg));
|
||||||
trackingContainers.remove(containerId);
|
|
||||||
LOG.info("Removed ProcessTree with root {}", pId);
|
LOG.info("Removed ProcessTree with root {}", pId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -834,6 +870,72 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class LogMonitorThread extends Thread {
|
||||||
|
LogMonitorThread() {
|
||||||
|
super("Container Log Monitor");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
||||||
|
for (Entry<ContainerId, ProcessTreeInfo> entry :
|
||||||
|
trackingContainers.entrySet()) {
|
||||||
|
ContainerId containerId = entry.getKey();
|
||||||
|
ProcessTreeInfo ptInfo = entry.getValue();
|
||||||
|
Container container = context.getContainers().get(containerId);
|
||||||
|
if (container == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
List<File> logDirs = ContainerLogsUtils.getContainerLogDirs(
|
||||||
|
containerId, container.getUser(), context);
|
||||||
|
long totalLogDataBytes = 0;
|
||||||
|
for (File dir : logDirs) {
|
||||||
|
long currentDirSizeBytes = FileUtil.getDU(dir);
|
||||||
|
totalLogDataBytes += currentDirSizeBytes;
|
||||||
|
String killMsg = null;
|
||||||
|
if (currentDirSizeBytes > logDirSizeLimit) {
|
||||||
|
killMsg = String.format(
|
||||||
|
"Container [pid=%s,containerID=%s] is logging beyond "
|
||||||
|
+ "the container single log directory limit.%n"
|
||||||
|
+ "Limit: %d Log Directory Size: %d Log Directory: %s"
|
||||||
|
+ "%nKilling container.%n",
|
||||||
|
ptInfo.getPID(), containerId, logDirSizeLimit,
|
||||||
|
currentDirSizeBytes, dir);
|
||||||
|
} else if (totalLogDataBytes > logTotalSizeLimit) {
|
||||||
|
killMsg = String.format(
|
||||||
|
"Container [pid=%s,containerID=%s] is logging beyond "
|
||||||
|
+ "the container total log limit.%n"
|
||||||
|
+ "Limit: %d Total Size: >=%d"
|
||||||
|
+ "%nKilling container.%n",
|
||||||
|
ptInfo.getPID(), containerId, logTotalSizeLimit,
|
||||||
|
totalLogDataBytes);
|
||||||
|
}
|
||||||
|
if (killMsg != null
|
||||||
|
&& trackingContainers.remove(containerId) != null) {
|
||||||
|
LOG.warn(killMsg);
|
||||||
|
eventDispatcher.getEventHandler().handle(
|
||||||
|
new ContainerKillEvent(containerId,
|
||||||
|
ContainerExitStatus.KILLED_FOR_EXCESS_LOGS, killMsg));
|
||||||
|
LOG.info("Removed ProcessTree with root " + ptInfo.getPID());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.warn("Uncaught exception in ContainerMemoryManager "
|
||||||
|
+ "while monitoring log usage for " + containerId, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
Thread.sleep(logCheckInterval);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
LOG.info("Log monitor thread was interrupted. "
|
||||||
|
+ "Stopping container log monitoring.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void updateContainerMetrics(ContainersMonitorEvent monitoringEvent) {
|
private void updateContainerMetrics(ContainersMonitorEvent monitoringEvent) {
|
||||||
if (!containerMetricsEnabled || monitoringEvent == null) {
|
if (!containerMetricsEnabled || monitoringEvent == null) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -21,9 +21,13 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.Mockito.doReturn;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.never;
|
||||||
import static org.mockito.Mockito.spy;
|
import static org.mockito.Mockito.spy;
|
||||||
import static org.mockito.Mockito.timeout;
|
import static org.mockito.Mockito.timeout;
|
||||||
|
import static org.mockito.Mockito.verify;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -32,12 +36,14 @@ import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileUtil;
|
import org.apache.hadoop.fs.FileUtil;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
|
@ -61,12 +67,19 @@ import org.apache.hadoop.yarn.api.records.Token;
|
||||||
import org.apache.hadoop.yarn.api.records.URL;
|
import org.apache.hadoop.yarn.api.records.URL;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
|
import org.apache.hadoop.yarn.event.Event;
|
||||||
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
|
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
|
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
|
||||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||||
import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
|
import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
|
||||||
|
@ -76,6 +89,7 @@ import org.apache.hadoop.yarn.util.TestProcfsBasedProcessTree;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.mockito.ArgumentCaptor;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -88,6 +102,7 @@ public class TestContainersMonitor extends BaseContainerManagerTest {
|
||||||
static {
|
static {
|
||||||
LOG = LoggerFactory.getLogger(TestContainersMonitor.class);
|
LOG = LoggerFactory.getLogger(TestContainersMonitor.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws IOException {
|
public void setup() throws IOException {
|
||||||
conf.setClass(
|
conf.setClass(
|
||||||
|
@ -353,6 +368,164 @@ public class TestContainersMonitor extends BaseContainerManagerTest {
|
||||||
.build()));
|
.build()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
public void testContainerKillOnExcessLogDirectory() throws Exception {
|
||||||
|
final String user = "someuser";
|
||||||
|
ApplicationId appId = ApplicationId.newInstance(1, 1);
|
||||||
|
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
|
||||||
|
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
|
||||||
|
Application app = mock(Application.class);
|
||||||
|
doReturn(user).when(app).getUser();
|
||||||
|
doReturn(appId).when(app).getAppId();
|
||||||
|
Container container = mock(Container.class);
|
||||||
|
doReturn(cid).when(container).getContainerId();
|
||||||
|
doReturn(user).when(container).getUser();
|
||||||
|
File containerLogDir = new File(new File(localLogDir, appId.toString()),
|
||||||
|
cid.toString());
|
||||||
|
containerLogDir.mkdirs();
|
||||||
|
LocalDirsHandlerService mockDirsHandler =
|
||||||
|
mock(LocalDirsHandlerService.class);
|
||||||
|
doReturn(Collections.singletonList(localLogDir.getAbsolutePath()))
|
||||||
|
.when(mockDirsHandler).getLogDirsForRead();
|
||||||
|
Context ctx = new NMContext(context.getContainerTokenSecretManager(),
|
||||||
|
context.getNMTokenSecretManager(), mockDirsHandler,
|
||||||
|
context.getApplicationACLsManager(), context.getNMStateStore(),
|
||||||
|
false, conf);
|
||||||
|
|
||||||
|
Configuration monitorConf = new Configuration(conf);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE,
|
||||||
|
false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED,
|
||||||
|
true);
|
||||||
|
monitorConf.setLong(
|
||||||
|
YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES, 10);
|
||||||
|
monitorConf.setLong(
|
||||||
|
YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES, 10000000);
|
||||||
|
monitorConf.setLong(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS,
|
||||||
|
10);
|
||||||
|
|
||||||
|
EventHandler mockHandler = mock(EventHandler.class);
|
||||||
|
AsyncDispatcher mockDispatcher = mock(AsyncDispatcher.class);
|
||||||
|
doReturn(mockHandler).when(mockDispatcher).getEventHandler();
|
||||||
|
ContainersMonitor monitor = new ContainersMonitorImpl(
|
||||||
|
mock(ContainerExecutor.class), mockDispatcher, ctx);
|
||||||
|
monitor.init(monitorConf);
|
||||||
|
monitor.start();
|
||||||
|
Event event;
|
||||||
|
try {
|
||||||
|
ctx.getApplications().put(appId, app);
|
||||||
|
ctx.getContainers().put(cid, container);
|
||||||
|
monitor.handle(new ContainerStartMonitoringEvent(cid, 1, 1, 1, 0, 0));
|
||||||
|
|
||||||
|
PrintWriter fileWriter = new PrintWriter(new File(containerLogDir,
|
||||||
|
"log"));
|
||||||
|
fileWriter.write("This container is logging too much.");
|
||||||
|
fileWriter.close();
|
||||||
|
|
||||||
|
ArgumentCaptor<Event> captor = ArgumentCaptor.forClass(Event.class);
|
||||||
|
verify(mockHandler, timeout(10000)).handle(captor.capture());
|
||||||
|
event = captor.getValue();
|
||||||
|
} finally {
|
||||||
|
monitor.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
assertTrue("Expected a kill event", event instanceof ContainerKillEvent);
|
||||||
|
ContainerKillEvent cke = (ContainerKillEvent) event;
|
||||||
|
assertEquals("Unexpected container exit status",
|
||||||
|
ContainerExitStatus.KILLED_FOR_EXCESS_LOGS,
|
||||||
|
cke.getContainerExitStatus());
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
public void testContainerKillOnExcessTotalLogs() throws Exception {
|
||||||
|
final String user = "someuser";
|
||||||
|
ApplicationId appId = ApplicationId.newInstance(1, 1);
|
||||||
|
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
|
||||||
|
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
|
||||||
|
Application app = mock(Application.class);
|
||||||
|
doReturn(user).when(app).getUser();
|
||||||
|
doReturn(appId).when(app).getAppId();
|
||||||
|
Container container = mock(Container.class);
|
||||||
|
doReturn(cid).when(container).getContainerId();
|
||||||
|
doReturn(user).when(container).getUser();
|
||||||
|
File logDir1 = new File(localLogDir, "dir1");
|
||||||
|
File logDir2 = new File(localLogDir, "dir2");
|
||||||
|
List<String> logDirs = new ArrayList<>();
|
||||||
|
logDirs.add(logDir1.getAbsolutePath());
|
||||||
|
logDirs.add(logDir2.getAbsolutePath());
|
||||||
|
LocalDirsHandlerService mockDirsHandler =
|
||||||
|
mock(LocalDirsHandlerService.class);
|
||||||
|
doReturn(logDirs).when(mockDirsHandler).getLogDirsForRead();
|
||||||
|
Context ctx = new NMContext(context.getContainerTokenSecretManager(),
|
||||||
|
context.getNMTokenSecretManager(), mockDirsHandler,
|
||||||
|
context.getApplicationACLsManager(), context.getNMStateStore(),
|
||||||
|
false, conf);
|
||||||
|
|
||||||
|
File clogDir1 = new File(new File(logDir1, appId.toString()),
|
||||||
|
cid.toString());
|
||||||
|
clogDir1.mkdirs();
|
||||||
|
File clogDir2 = new File(new File(logDir2, appId.toString()),
|
||||||
|
cid.toString());
|
||||||
|
clogDir2.mkdirs();
|
||||||
|
|
||||||
|
Configuration monitorConf = new Configuration(conf);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE,
|
||||||
|
false);
|
||||||
|
monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED,
|
||||||
|
true);
|
||||||
|
monitorConf.setLong(
|
||||||
|
YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES, 100000);
|
||||||
|
monitorConf.setLong(
|
||||||
|
YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES, 15);
|
||||||
|
monitorConf.setLong(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS,
|
||||||
|
10);
|
||||||
|
monitorConf.set(YarnConfiguration.NM_LOG_DIRS, logDir1.getAbsolutePath()
|
||||||
|
+ "," + logDir2.getAbsolutePath());
|
||||||
|
|
||||||
|
EventHandler mockHandler = mock(EventHandler.class);
|
||||||
|
AsyncDispatcher mockDispatcher = mock(AsyncDispatcher.class);
|
||||||
|
doReturn(mockHandler).when(mockDispatcher).getEventHandler();
|
||||||
|
ContainersMonitor monitor = new ContainersMonitorImpl(
|
||||||
|
mock(ContainerExecutor.class), mockDispatcher, ctx);
|
||||||
|
monitor.init(monitorConf);
|
||||||
|
monitor.start();
|
||||||
|
Event event;
|
||||||
|
try {
|
||||||
|
ctx.getApplications().put(appId, app);
|
||||||
|
ctx.getContainers().put(cid, container);
|
||||||
|
monitor.handle(new ContainerStartMonitoringEvent(cid, 1, 1, 1, 0, 0));
|
||||||
|
|
||||||
|
PrintWriter fileWriter = new PrintWriter(new File(clogDir1, "log"));
|
||||||
|
fileWriter.write("0123456789");
|
||||||
|
fileWriter.close();
|
||||||
|
|
||||||
|
Thread.sleep(1000);
|
||||||
|
verify(mockHandler, never()).handle(any(Event.class));
|
||||||
|
|
||||||
|
fileWriter = new PrintWriter(new File(clogDir2, "log"));
|
||||||
|
fileWriter.write("0123456789");
|
||||||
|
fileWriter.close();
|
||||||
|
|
||||||
|
ArgumentCaptor<Event> captor = ArgumentCaptor.forClass(Event.class);
|
||||||
|
verify(mockHandler, timeout(10000)).handle(captor.capture());
|
||||||
|
event = captor.getValue();
|
||||||
|
} finally {
|
||||||
|
monitor.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
assertTrue("Expected a kill event", event instanceof ContainerKillEvent);
|
||||||
|
ContainerKillEvent cke = (ContainerKillEvent) event;
|
||||||
|
assertEquals("Unexpected container exit status",
|
||||||
|
ContainerExitStatus.KILLED_FOR_EXCESS_LOGS,
|
||||||
|
cke.getContainerExitStatus());
|
||||||
|
}
|
||||||
|
|
||||||
@Test(timeout = 20000)
|
@Test(timeout = 20000)
|
||||||
public void testContainerMonitorMemFlags() {
|
public void testContainerMonitorMemFlags() {
|
||||||
ContainersMonitor cm = null;
|
ContainersMonitor cm = null;
|
||||||
|
|
|
@ -202,4 +202,20 @@ The following settings need to be set in *yarn-site.xml*.
|
||||||
<property>
|
<property>
|
||||||
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
|
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
|
||||||
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
|
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
Prevent Container Logs From Getting Too Big
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
This allows a cluster admin to configure a cluster such that a task attempt will be killed if any container log exceeds a configured size. This helps prevent logs from filling disks and also prevent the need to aggregate enormous logs.
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
The following parameters can be used to configure the container log dir sizes.
|
||||||
|
|
||||||
|
| Configuration Name | Allowed Values | Description |
|
||||||
|
|:---- |:---- |:---- |
|
||||||
|
| `yarn.nodemanager.container-log-monitor.enable` | true, false | Flag to enable the container log monitor which enforces container log directory size limits. Default is false. |
|
||||||
|
| `yarn.nodemanager.container-log-monitor.interval-ms` | Positive integer | How often to check the usage of a container's log directories in milliseconds. Default is 60000 ms. |
|
||||||
|
| `yarn.nodemanager.container-log-monitor.dir-size-limit-bytes` | Long | The disk space limit, in bytes, for a single container log directory. Default is 1000000000. |
|
||||||
|
| `yarn.nodemanager.container-log-monitor.total-size-limit-bytes` | Long | The disk space limit, in bytes, for all of a container's logs. The default is 10000000000. |
|
||||||
|
|
Loading…
Reference in New Issue