diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java index 02070100e3b..26a204ea392 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java @@ -78,4 +78,9 @@ public class ContainerExitStatus { */ public static final int KILLED_BY_CONTAINER_SCHEDULER = -108; + /** + * Container was terminated for generating excess log data. + */ + public static final int KILLED_FOR_EXCESS_LOGS = -109; + } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index ba440e6d960..f11ebca9a67 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1820,6 +1820,25 @@ public class YarnConfiguration extends Configuration { public static final boolean DEFAULT_PROCFS_USE_SMAPS_BASED_RSS_ENABLED = false; + /** Enable switch for container log monitoring. */ + public static final String NM_CONTAINER_LOG_MONITOR_ENABLED = + NM_PREFIX + "container-log-monitor.enable"; + public static final boolean DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED = false; + /** How often to monitor logs generated by containers. */ + public static final String NM_CONTAINER_LOG_MON_INTERVAL_MS = + NM_PREFIX + "container-log-monitor.interval-ms"; + public static final int DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS = 60000; + /** The disk space limit for a single container log directory. */ + public static final String NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES = + NM_PREFIX + "container-log-monitor.dir-size-limit-bytes"; + public static final long DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES = + 1000000000L; + /** The disk space limit for all of a container's logs. */ + public static final String NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES = + NM_PREFIX + "container-log-monitor.total-size-limit-bytes"; + public static final long DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES = + 10000000000L; + /** Enable/disable container metrics. */ @Private public static final String NM_CONTAINER_METRICS_ENABLE = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 7236e4f835c..910c1001868 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1576,6 +1576,34 @@ yarn.nodemanager.container-monitor.interval-ms + + Flag to enable the container log monitor which enforces + container log directory size limits. + yarn.nodemanager.container-log-monitor.enable + false + + + + How often to check the usage of a container's log directories + in milliseconds + yarn.nodemanager.container-log-monitor.interval-ms + 60000 + + + + The disk space limit, in bytes, for a single + container log directory + yarn.nodemanager.container-log-monitor.dir-size-limit-bytes + 1000000000 + + + + The disk space limit, in bytes, for all of a container's + logs + yarn.nodemanager.container-log-monitor.total-size-limit-bytes + 10000000000 + + Class that calculates containers current resource utilization. If not set, the value for yarn.nodemanager.resource-calculator.class will diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java index d56ca650290..a7bf73fdfa1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java @@ -29,6 +29,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; @@ -46,11 +47,14 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils; +import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsUtils; import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin; import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree; import java.util.Arrays; +import java.io.File; import java.util.Map; +import java.util.List; import java.util.Map.Entry; import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; @@ -69,6 +73,10 @@ public class ContainersMonitorImpl extends AbstractService implements private long monitoringInterval; private MonitoringThread monitoringThread; + private int logCheckInterval; + private LogMonitorThread logMonitorThread; + private long logDirSizeLimit; + private long logTotalSizeLimit; private CGroupElasticMemoryController oomListenerThread; private boolean containerMetricsEnabled; private long containerMetricsPeriodMs; @@ -94,6 +102,7 @@ public class ContainersMonitorImpl extends AbstractService implements private boolean elasticMemoryEnforcement; private boolean strictMemoryEnforcement; private boolean containersMonitorEnabled; + private boolean logMonitorEnabled; private long maxVCoresAllottedForContainers; @@ -122,6 +131,8 @@ public class ContainersMonitorImpl extends AbstractService implements this.monitoringThread = new MonitoringThread(); + this.logMonitorThread = new LogMonitorThread(); + this.containersUtilization = ResourceUtilization.newInstance(0, 0, 0.0f); } @@ -133,6 +144,16 @@ public class ContainersMonitorImpl extends AbstractService implements this.conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS, YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS)); + this.logCheckInterval = + conf.getInt(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS, + YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS); + this.logDirSizeLimit = + conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES, + YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES); + this.logTotalSizeLimit = + conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES, + YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES); + this.resourceCalculatorPlugin = ResourceCalculatorPlugin.getContainersMonitorPlugin(this.conf); LOG.info(" Using ResourceCalculatorPlugin : " @@ -217,6 +238,11 @@ public class ContainersMonitorImpl extends AbstractService implements isContainerMonitorEnabled() && monitoringInterval > 0; LOG.info("ContainersMonitor enabled: " + containersMonitorEnabled); + logMonitorEnabled = + conf.getBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED, + YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED); + LOG.info("Container Log Monitor Enabled: "+ logMonitorEnabled); + nodeCpuPercentageForYARN = NodeManagerHardwareUtils.getNodeCpuPercentage(this.conf); @@ -286,13 +312,16 @@ public class ContainersMonitorImpl extends AbstractService implements if (oomListenerThread != null) { oomListenerThread.start(); } + if (logMonitorEnabled) { + this.logMonitorThread.start(); + } super.serviceStart(); } @Override protected void serviceStop() throws Exception { + stopped = true; if (containersMonitorEnabled) { - stopped = true; this.monitoringThread.interrupt(); try { this.monitoringThread.join(); @@ -308,6 +337,13 @@ public class ContainersMonitorImpl extends AbstractService implements } } } + if (logMonitorEnabled) { + this.logMonitorThread.interrupt(); + try { + this.logMonitorThread.join(); + } catch (InterruptedException e) { + } + } super.serviceStop(); } @@ -771,7 +807,8 @@ public class ContainersMonitorImpl extends AbstractService implements } } - if (isMemoryOverLimit.isPresent() && isMemoryOverLimit.get()) { + if (isMemoryOverLimit.isPresent() && isMemoryOverLimit.get() + && trackingContainers.remove(containerId) != null) { // Virtual or physical memory over limit. Fail the container and // remove // the corresponding process tree @@ -785,7 +822,6 @@ public class ContainersMonitorImpl extends AbstractService implements eventDispatcher.getEventHandler().handle( new ContainerKillEvent(containerId, containerExitStatus, msg)); - trackingContainers.remove(containerId); LOG.info("Removed ProcessTree with root " + pId); } } @@ -853,6 +889,72 @@ public class ContainersMonitorImpl extends AbstractService implements } } + private class LogMonitorThread extends Thread { + LogMonitorThread() { + super("Container Log Monitor"); + } + + @Override + public void run() { + while (!stopped && !Thread.currentThread().isInterrupted()) { + for (Entry entry : + trackingContainers.entrySet()) { + ContainerId containerId = entry.getKey(); + ProcessTreeInfo ptInfo = entry.getValue(); + Container container = context.getContainers().get(containerId); + if (container == null) { + continue; + } + try { + List logDirs = ContainerLogsUtils.getContainerLogDirs( + containerId, container.getUser(), context); + long totalLogDataBytes = 0; + for (File dir : logDirs) { + long currentDirSizeBytes = FileUtil.getDU(dir); + totalLogDataBytes += currentDirSizeBytes; + String killMsg = null; + if (currentDirSizeBytes > logDirSizeLimit) { + killMsg = String.format( + "Container [pid=%s,containerID=%s] is logging beyond " + + "the container single log directory limit.%n" + + "Limit: %d Log Directory Size: %d Log Directory: %s" + + "%nKilling container.%n", + ptInfo.getPID(), containerId, logDirSizeLimit, + currentDirSizeBytes, dir); + } else if (totalLogDataBytes > logTotalSizeLimit) { + killMsg = String.format( + "Container [pid=%s,containerID=%s] is logging beyond " + + "the container total log limit.%n" + + "Limit: %d Total Size: >=%d" + + "%nKilling container.%n", + ptInfo.getPID(), containerId, logTotalSizeLimit, + totalLogDataBytes); + } + if (killMsg != null + && trackingContainers.remove(containerId) != null) { + LOG.warn(killMsg); + eventDispatcher.getEventHandler().handle( + new ContainerKillEvent(containerId, + ContainerExitStatus.KILLED_FOR_EXCESS_LOGS, killMsg)); + LOG.info("Removed ProcessTree with root " + ptInfo.getPID()); + break; + } + } + } catch (Exception e) { + LOG.warn("Uncaught exception in ContainerMemoryManager " + + "while monitoring log usage for " + containerId, e); + } + } + try { + Thread.sleep(logCheckInterval); + } catch (InterruptedException e) { + LOG.info("Log monitor thread was interrupted. " + + "Stopping container log monitoring."); + } + } + } + } + private void updateContainerMetrics(ContainersMonitorEvent monitoringEvent) { if (!containerMetricsEnabled || monitoringEvent == null) { return; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java index 02f6cea769b..5bd9eafac88 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java @@ -21,9 +21,13 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.timeout; +import static org.mockito.Mockito.verify; import java.io.BufferedReader; import java.io.File; @@ -32,12 +36,14 @@ import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import java.util.function.Supplier; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.UnsupportedFileSystemException; @@ -61,12 +67,19 @@ import org.apache.hadoop.yarn.api.records.Token; import org.apache.hadoop.yarn.api.records.URL; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; +import org.apache.hadoop.yarn.event.Event; +import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal; import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin; @@ -76,6 +89,7 @@ import org.apache.hadoop.yarn.util.TestProcfsBasedProcessTree; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import org.slf4j.LoggerFactory; @@ -88,6 +102,7 @@ public class TestContainersMonitor extends BaseContainerManagerTest { static { LOG = LoggerFactory.getLogger(TestContainersMonitor.class); } + @Before public void setup() throws IOException { conf.setClass( @@ -353,6 +368,164 @@ public class TestContainersMonitor extends BaseContainerManagerTest { .build())); } + @SuppressWarnings("unchecked") + @Test + public void testContainerKillOnExcessLogDirectory() throws Exception { + final String user = "someuser"; + ApplicationId appId = ApplicationId.newInstance(1, 1); + ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1); + ContainerId cid = ContainerId.newContainerId(attemptId, 1); + Application app = mock(Application.class); + doReturn(user).when(app).getUser(); + doReturn(appId).when(app).getAppId(); + Container container = mock(Container.class); + doReturn(cid).when(container).getContainerId(); + doReturn(user).when(container).getUser(); + File containerLogDir = new File(new File(localLogDir, appId.toString()), + cid.toString()); + containerLogDir.mkdirs(); + LocalDirsHandlerService mockDirsHandler = + mock(LocalDirsHandlerService.class); + doReturn(Collections.singletonList(localLogDir.getAbsolutePath())) + .when(mockDirsHandler).getLogDirsForRead(); + Context ctx = new NMContext(context.getContainerTokenSecretManager(), + context.getNMTokenSecretManager(), mockDirsHandler, + context.getApplicationACLsManager(), context.getNMStateStore(), + false, conf); + + Configuration monitorConf = new Configuration(conf); + monitorConf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + monitorConf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE, + false); + monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED, + true); + monitorConf.setLong( + YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES, 10); + monitorConf.setLong( + YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES, 10000000); + monitorConf.setLong(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS, + 10); + + EventHandler mockHandler = mock(EventHandler.class); + AsyncDispatcher mockDispatcher = mock(AsyncDispatcher.class); + doReturn(mockHandler).when(mockDispatcher).getEventHandler(); + ContainersMonitor monitor = new ContainersMonitorImpl( + mock(ContainerExecutor.class), mockDispatcher, ctx); + monitor.init(monitorConf); + monitor.start(); + Event event; + try { + ctx.getApplications().put(appId, app); + ctx.getContainers().put(cid, container); + monitor.handle(new ContainerStartMonitoringEvent(cid, 1, 1, 1, 0, 0)); + + PrintWriter fileWriter = new PrintWriter(new File(containerLogDir, + "log")); + fileWriter.write("This container is logging too much."); + fileWriter.close(); + + ArgumentCaptor captor = ArgumentCaptor.forClass(Event.class); + verify(mockHandler, timeout(10000)).handle(captor.capture()); + event = captor.getValue(); + } finally { + monitor.stop(); + } + + assertTrue("Expected a kill event", event instanceof ContainerKillEvent); + ContainerKillEvent cke = (ContainerKillEvent) event; + assertEquals("Unexpected container exit status", + ContainerExitStatus.KILLED_FOR_EXCESS_LOGS, + cke.getContainerExitStatus()); + } + + @SuppressWarnings("unchecked") + @Test + public void testContainerKillOnExcessTotalLogs() throws Exception { + final String user = "someuser"; + ApplicationId appId = ApplicationId.newInstance(1, 1); + ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1); + ContainerId cid = ContainerId.newContainerId(attemptId, 1); + Application app = mock(Application.class); + doReturn(user).when(app).getUser(); + doReturn(appId).when(app).getAppId(); + Container container = mock(Container.class); + doReturn(cid).when(container).getContainerId(); + doReturn(user).when(container).getUser(); + File logDir1 = new File(localLogDir, "dir1"); + File logDir2 = new File(localLogDir, "dir2"); + List logDirs = new ArrayList<>(); + logDirs.add(logDir1.getAbsolutePath()); + logDirs.add(logDir2.getAbsolutePath()); + LocalDirsHandlerService mockDirsHandler = + mock(LocalDirsHandlerService.class); + doReturn(logDirs).when(mockDirsHandler).getLogDirsForRead(); + Context ctx = new NMContext(context.getContainerTokenSecretManager(), + context.getNMTokenSecretManager(), mockDirsHandler, + context.getApplicationACLsManager(), context.getNMStateStore(), + false, conf); + + File clogDir1 = new File(new File(logDir1, appId.toString()), + cid.toString()); + clogDir1.mkdirs(); + File clogDir2 = new File(new File(logDir2, appId.toString()), + cid.toString()); + clogDir2.mkdirs(); + + Configuration monitorConf = new Configuration(conf); + monitorConf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + monitorConf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_METRICS_ENABLE, + false); + monitorConf.setBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED, + true); + monitorConf.setLong( + YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES, 100000); + monitorConf.setLong( + YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES, 15); + monitorConf.setLong(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS, + 10); + monitorConf.set(YarnConfiguration.NM_LOG_DIRS, logDir1.getAbsolutePath() + + "," + logDir2.getAbsolutePath()); + + EventHandler mockHandler = mock(EventHandler.class); + AsyncDispatcher mockDispatcher = mock(AsyncDispatcher.class); + doReturn(mockHandler).when(mockDispatcher).getEventHandler(); + ContainersMonitor monitor = new ContainersMonitorImpl( + mock(ContainerExecutor.class), mockDispatcher, ctx); + monitor.init(monitorConf); + monitor.start(); + Event event; + try { + ctx.getApplications().put(appId, app); + ctx.getContainers().put(cid, container); + monitor.handle(new ContainerStartMonitoringEvent(cid, 1, 1, 1, 0, 0)); + + PrintWriter fileWriter = new PrintWriter(new File(clogDir1, "log")); + fileWriter.write("0123456789"); + fileWriter.close(); + + Thread.sleep(1000); + verify(mockHandler, never()).handle(any(Event.class)); + + fileWriter = new PrintWriter(new File(clogDir2, "log")); + fileWriter.write("0123456789"); + fileWriter.close(); + + ArgumentCaptor captor = ArgumentCaptor.forClass(Event.class); + verify(mockHandler, timeout(10000)).handle(captor.capture()); + event = captor.getValue(); + } finally { + monitor.stop(); + } + + assertTrue("Expected a kill event", event instanceof ContainerKillEvent); + ContainerKillEvent cke = (ContainerKillEvent) event; + assertEquals("Unexpected container exit status", + ContainerExitStatus.KILLED_FOR_EXCESS_LOGS, + cke.getContainerExitStatus()); + } + @Test(timeout = 20000) public void testContainerMonitorMemFlags() { ContainersMonitor cm = null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md index dd7a1957f04..3e23b63ded2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/NodeManager.md @@ -146,4 +146,20 @@ The following settings need to be set in *yarn-site.xml*. yarn.nodemanager.aux-services.mapreduce_shuffle.class org.apache.hadoop.mapred.ShuffleHandler - \ No newline at end of file + + +Prevent Container Logs From Getting Too Big +------------------------------------------- + +This allows a cluster admin to configure a cluster such that a task attempt will be killed if any container log exceeds a configured size. This helps prevent logs from filling disks and also prevent the need to aggregate enormous logs. + +### Configuration + +The following parameters can be used to configure the container log dir sizes. + +| Configuration Name | Allowed Values | Description | +|:---- |:---- |:---- | +| `yarn.nodemanager.container-log-monitor.enable` | true, false | Flag to enable the container log monitor which enforces container log directory size limits. Default is false. | +| `yarn.nodemanager.container-log-monitor.interval-ms` | Positive integer | How often to check the usage of a container's log directories in milliseconds. Default is 60000 ms. | +| `yarn.nodemanager.container-log-monitor.dir-size-limit-bytes` | Long | The disk space limit, in bytes, for a single container log directory. Default is 1000000000. | +| `yarn.nodemanager.container-log-monitor.total-size-limit-bytes` | Long | The disk space limit, in bytes, for all of a container's logs. The default is 10000000000. |