hadoop/hadoop-yarn/hadoop-yarn-site/NodeManager.html

919 lines
50 KiB
HTML
Raw Normal View History

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
| Generated by Apache Maven Doxia at 2023-02-15
| Rendered using Apache Maven Stylus Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Apache Hadoop 3.4.0-SNAPSHOT &#x2013; NodeManager</title>
<style type="text/css" media="all">
@import url("./css/maven-base.css");
@import url("./css/maven-theme.css");
@import url("./css/site.css");
</style>
<link rel="stylesheet" href="./css/print.css" type="text/css" media="print" />
<meta name="Date-Revision-yyyymmdd" content="20230215" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body class="composite">
<div id="banner">
<a href="http://hadoop.apache.org/" id="bannerLeft">
<img src="http://hadoop.apache.org/images/hadoop-logo.jpg" alt="" />
</a>
<a href="http://www.apache.org/" id="bannerRight">
<img src="http://www.apache.org/images/asf_logo_wide.png" alt="" />
</a>
<div class="clear">
<hr/>
</div>
</div>
<div id="breadcrumbs">
<div class="xright"> <a href="http://wiki.apache.org/hadoop" class="externalLink">Wiki</a>
|
<a href="https://gitbox.apache.org/repos/asf/hadoop.git" class="externalLink">git</a>
|
<a href="http://hadoop.apache.org/" class="externalLink">Apache Hadoop</a>
&nbsp;| Last Published: 2023-02-15
&nbsp;| Version: 3.4.0-SNAPSHOT
</div>
<div class="clear">
<hr/>
</div>
</div>
<div id="leftColumn">
<div id="navcolumn">
<h5>General</h5>
<ul>
<li class="none">
<a href="../../index.html">Overview</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/SingleCluster.html">Single Node Setup</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/ClusterSetup.html">Cluster Setup</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/CommandsManual.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/FileSystemShell.html">FileSystem Shell</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/Compatibility.html">Compatibility Specification</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/DownstreamDev.html">Downstream Developer's Guide</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/AdminCompatibilityGuide.html">Admin Compatibility Guide</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/InterfaceClassification.html">Interface Classification</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/filesystem/index.html">FileSystem Specification</a>
</li>
</ul>
<h5>Common</h5>
<ul>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/CLIMiniCluster.html">CLI Mini Cluster</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/FairCallQueue.html">Fair Call Queue</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/NativeLibraries.html">Native Libraries</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/Superusers.html">Proxy User</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/RackAwareness.html">Rack Awareness</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/SecureMode.html">Secure Mode</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/ServiceLevelAuth.html">Service Level Authorization</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/HttpAuthentication.html">HTTP Authentication</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/CredentialProviderAPI.html">Credential Provider API</a>
</li>
<li class="none">
<a href="../../hadoop-kms/index.html">Hadoop KMS</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/Tracing.html">Tracing</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/UnixShellGuide.html">Unix Shell Guide</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/registry/index.html">Registry</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/AsyncProfilerServlet.html">Async Profiler</a>
</li>
</ul>
<h5>HDFS</h5>
<ul>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsDesign.html">Architecture</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html">User Guide</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HDFSCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html">NameNode HA With QJM</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithNFS.html">NameNode HA With NFS</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ObserverNameNode.html">Observer NameNode</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/Federation.html">Federation</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ViewFs.html">ViewFs</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ViewFsOverloadScheme.html">ViewFsOverloadScheme</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsSnapshots.html">Snapshots</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsEditsViewer.html">Edits Viewer</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsImageViewer.html">Image Viewer</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsPermissionsGuide.html">Permissions and HDFS</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsQuotaAdminGuide.html">Quotas and HDFS</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/LibHdfs.html">libhdfs (C API)</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/WebHDFS.html">WebHDFS (REST API)</a>
</li>
<li class="none">
<a href="../../hadoop-hdfs-httpfs/index.html">HttpFS</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html">Short Circuit Local Reads</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/CentralizedCacheManagement.html">Centralized Cache Management</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsNfsGateway.html">NFS Gateway</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsRollingUpgrade.html">Rolling Upgrade</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html">Extended Attributes</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html">Transparent Encryption</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html">Multihoming</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html">Storage Policies</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/MemoryStorage.html">Memory Storage Support</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/SLGUserGuide.html">Synthetic Load Generator</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html">Erasure Coding</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HDFSDiskbalancer.html">Disk Balancer</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsUpgradeDomain.html">Upgrade Domain</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsDataNodeAdminGuide.html">DataNode Admin</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs-rbf/HDFSRouterFederation.html">Router Federation</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/HdfsProvidedStorage.html">Provided Storage</a>
</li>
</ul>
<h5>MapReduce</h5>
<ul>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html">Tutorial</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduce_Compatibility_Hadoop1_Hadoop2.html">Compatibility with 1.x</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/EncryptedShuffle.html">Encrypted Shuffle</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/PluggableShuffleAndPluggableSort.html">Pluggable Shuffle/Sort</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/DistributedCacheDeploy.html">Distributed Cache Deploy</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/SharedCacheSupport.html">Support for YARN Shared Cache</a>
</li>
</ul>
<h5>MapReduce REST APIs</h5>
<ul>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredAppMasterRest.html">MR Application Master</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-hs/HistoryServerRest.html">MR History Server</a>
</li>
</ul>
<h5>YARN</h5>
<ul>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/YARN.html">Architecture</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/YarnCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html">Capacity Scheduler</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/FairScheduler.html">Fair Scheduler</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRestart.html">ResourceManager Restart</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/ResourceManagerHA.html">ResourceManager HA</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/ResourceModel.html">Resource Model</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/NodeLabel.html">Node Labels</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/NodeAttributes.html">Node Attributes</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html">Web Application Proxy</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html">Timeline Server</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html">Timeline Service V.2</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html">Writing YARN Applications</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/YarnApplicationSecurity.html">YARN Application Security</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/NodeManager.html">NodeManager</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/DockerContainers.html">Running Applications in Docker Containers</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/RuncContainers.html">Running Applications in runC Containers</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/NodeManagerCgroups.html">Using CGroups</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/SecureContainer.html">Secure Containers</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/ReservationSystem.html">Reservation System</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/GracefulDecommission.html">Graceful Decommission</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/OpportunisticContainers.html">Opportunistic Containers</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/Federation.html">YARN Federation</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/SharedCache.html">Shared Cache</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/UsingGpus.html">Using GPU</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/UsingFPGA.html">Using FPGA</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/PlacementConstraints.html">Placement Constraints</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/YarnUI2.html">YARN UI2</a>
</li>
</ul>
<h5>YARN REST APIs</h5>
<ul>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html">Introduction</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html">Resource Manager</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/NodeManagerRest.html">Node Manager</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html#Timeline_Server_REST_API_v1">Timeline Server</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html#Timeline_Service_v.2_REST_API">Timeline Service V.2</a>
</li>
</ul>
<h5>YARN Service</h5>
<ul>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/Overview.html">Overview</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/QuickStart.html">QuickStart</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/Concepts.html">Concepts</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/YarnServiceAPI.html">Yarn Service API</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/ServiceDiscovery.html">Service Discovery</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html">System Services</a>
</li>
</ul>
<h5>Hadoop Compatible File Systems</h5>
<ul>
<li class="none">
<a href="../../hadoop-aliyun/tools/hadoop-aliyun/index.html">Aliyun OSS</a>
</li>
<li class="none">
<a href="../../hadoop-aws/tools/hadoop-aws/index.html">Amazon S3</a>
</li>
<li class="none">
<a href="../../hadoop-azure/index.html">Azure Blob Storage</a>
</li>
<li class="none">
<a href="../../hadoop-azure-datalake/index.html">Azure Data Lake Storage</a>
</li>
<li class="none">
<a href="../../hadoop-cos/cloud-storage/index.html">Tencent COS</a>
</li>
<li class="none">
<a href="../../hadoop-huaweicloud/cloud-storage/index.html">Huaweicloud OBS</a>
</li>
</ul>
<h5>Auth</h5>
<ul>
<li class="none">
<a href="../../hadoop-auth/index.html">Overview</a>
</li>
<li class="none">
<a href="../../hadoop-auth/Examples.html">Examples</a>
</li>
<li class="none">
<a href="../../hadoop-auth/Configuration.html">Configuration</a>
</li>
<li class="none">
<a href="../../hadoop-auth/BuildingIt.html">Building</a>
</li>
</ul>
<h5>Tools</h5>
<ul>
<li class="none">
<a href="../../hadoop-streaming/HadoopStreaming.html">Hadoop Streaming</a>
</li>
<li class="none">
<a href="../../hadoop-archives/HadoopArchives.html">Hadoop Archives</a>
</li>
<li class="none">
<a href="../../hadoop-archive-logs/HadoopArchiveLogs.html">Hadoop Archive Logs</a>
</li>
<li class="none">
<a href="../../hadoop-distcp/DistCp.html">DistCp</a>
</li>
<li class="none">
<a href="../../hadoop-federation-balance/HDFSFederationBalance.html">HDFS Federation Balance</a>
</li>
<li class="none">
<a href="../../hadoop-gridmix/GridMix.html">GridMix</a>
</li>
<li class="none">
<a href="../../hadoop-rumen/Rumen.html">Rumen</a>
</li>
<li class="none">
<a href="../../hadoop-resourceestimator/ResourceEstimator.html">Resource Estimator Service</a>
</li>
<li class="none">
<a href="../../hadoop-sls/SchedulerLoadSimulator.html">Scheduler Load Simulator</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/Benchmarking.html">Hadoop Benchmarking</a>
</li>
<li class="none">
<a href="../../hadoop-dynamometer/Dynamometer.html">Dynamometer</a>
</li>
</ul>
<h5>Reference</h5>
<ul>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/release/">Changelog and Release Notes</a>
</li>
<li class="none">
<a href="../../api/index.html">Java API docs</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/UnixShellAPI.html">Unix Shell API</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/Metrics.html">Metrics</a>
</li>
</ul>
<h5>Configuration</h5>
<ul>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/core-default.xml">core-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs/hdfs-default.xml">hdfs-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-hdfs-rbf/hdfs-rbf-default.xml">hdfs-rbf-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml">mapred-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-yarn/hadoop-yarn-common/yarn-default.xml">yarn-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-kms/kms-default.html">kms-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-hdfs-httpfs/httpfs-default.html">httpfs-default.xml</a>
</li>
<li class="none">
<a href="../../hadoop-project-dist/hadoop-common/DeprecatedProperties.html">Deprecated Properties</a>
</li>
</ul>
<a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
<img alt="Built by Maven" src="./images/logos/maven-feather.png"/>
</a>
</div>
</div>
<div id="bodyColumn">
<div id="contentBox">
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<h1>NodeManager</h1>
<ul>
<li><a href="#Overview">Overview</a></li>
<li><a href="#Health_Checker_Service">Health Checker Service</a>
<ul>
<li><a href="#Disk_Checker">Disk Checker</a></li>
<li><a href="#External_Health_Script">External Health Script</a></li></ul></li>
<li><a href="#NodeManager_Restart">NodeManager Restart</a>
<ul>
<li><a href="#Introduction">Introduction</a></li>
<li><a href="#Enabling_NM_Restart">Enabling NM Restart</a></li></ul></li>
<li><a href="#Auxiliary_Service_Classpath_Isolation">Auxiliary Service Classpath Isolation</a>
<ul>
<li><a href="#Introduction">Introduction</a></li>
<li><a href="#Manifest">Manifest</a></li>
<li><a href="#Configuration">Configuration</a></li>
<li><a href="#Configuration_Examples">Configuration Examples</a></li></ul></li>
<li><a href="#Prevent_Container_Logs_From_Getting_Too_Big">Prevent Container Logs From Getting Too Big</a>
<ul>
<li><a href="#Configuration">Configuration</a></li></ul></li>
<li><a href="#Scale_Heart-beat_Interval_Based_on_CPU_Utilization">Scale Heart-beat Interval Based on CPU Utilization</a>
<ul>
<li><a href="#Configuration">Configuration</a></li></ul></li></ul>
<section>
<h2><a name="Overview"></a>Overview</h2>
<p>The NodeManager is responsible for launching and managing containers on a node. Containers execute tasks as specified by the AppMaster.</p></section><section>
<h2><a name="Health_Checker_Service"></a>Health Checker Service</h2>
<p>The NodeManager runs services to determine the health of the node it is executing on. The services perform checks on the disk as well as any user-specified tests. If any health check fails, the NodeManager marks the node as unhealthy and communicates this to the ResourceManager, which then stops assigning containers to the node. Communication of the node status is done as part of the heartbeat between the NodeManager and the ResourceManager. The intervals at which the disk checker and health monitor (described below) run don&#x2019;t affect the heartbeat intervals. When the heartbeat takes place, the status of both checks is used to determine the health of the node.</p><section>
<h3><a name="Disk_Checker"></a>Disk Checker</h3>
<p>The disk checker checks the state of the disks that the NodeManager is configured to use (local-dirs and log-dirs, configured using yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs respectively). The checks include permissions and free disk space. It also checks that the filesystem isn&#x2019;t in a read-only state. The checks are run at 2-minute intervals by default but can be configured to run as often as the user desires. If a disk fails the check, the NodeManager stops using that particular disk but still reports the node status as healthy. However, if a number of disks fail the check (the number can be configured, as explained below), then the node is reported as unhealthy to the ResourceManager and new containers will not be assigned to the node.</p>
<p>The following configuration parameters can be used to modify the disk checks:</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Configuration Name </th>
<th align="left"> Allowed Values </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.disk-health-checker.enable</code> </td>
<td align="left"> true, false </td>
<td align="left"> Enable or disable the disk health checker service </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.disk-health-checker.interval-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> The interval, in milliseconds, at which the disk checker should run; the default value is 2 minutes </td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.disk-health-checker.min-healthy-disks</code> </td>
<td align="left"> Float between 0-1 </td>
<td align="left"> The minimum fraction of disks that must pass the check for the NodeManager to mark the node as healthy; the default is 0.25 </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</code> </td>
<td align="left"> Float between 0-100 </td>
<td align="left"> The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb</code> </td>
<td align="left"> Integer </td>
<td align="left"> The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. </td></tr>
</tbody>
</table></section><section>
<h3><a name="External_Health_Script"></a>External Health Script</h3>
<p>Users may specify their own health checker scripts that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that:</p>
<ul>
<li>
<p>Exit code other than 0 is <b>not</b> considered to be a failure because it might have been caused by a syntax error. Therefore the node will <b>not</b> be marked as unhealthy.</p>
</li>
<li>
<p>If the script cannot be executed due to permissions or an incorrect path, etc., then it counts as a failure and the node will be reported as unhealthy.</p>
</li>
<li>
<p>Specifying a health check script is not mandatory. If no script is specified, only the disk checker status will be used to determine the health of the node.</p>
</li>
</ul>
<p>Users can specify up to 4 scripts to run individually with the <code>yarn.nodemanager.health-checker.scripts</code> configuration. Also these options can be configured for all scripts (global configurations):</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Configuration Name </th>
<th align="left"> Allowed Values </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"><code>yarn.nodemanager.health-checker.scripts</code></td>
<td align="left"> String </td>
<td align="left"> The keywords for the health checker scripts separated by a comma. The default is &#x201c;script&#x201d;. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.health-checker.interval-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> The interval, in milliseconds, at which health checker service runs; the default value is 10 minutes. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.health-checker.timeout-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> The timeout for the health script that&#x2019;s executed; the default value is 20 minutes. </td></tr>
</tbody>
</table>
<p>The following options can be set for every health checker script. The %s symbol is substituted with each keyword provided in <code>yarn.nodemanager.health-checker.scripts</code>.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Configuration Name </th>
<th align="left"> Allowed Values </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.health-checker.%s.path</code> </td>
<td align="left"> String </td>
<td align="left"> Absolute path to the health check script to be run. Mandatory argument for each script. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.health-checker.%s.opts</code> </td>
<td align="left"> String </td>
<td align="left"> Arguments to be passed to the script when the script is executed. Mandatory argument for each script. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.health-checker.%s.interval-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> The interval, in milliseconds, at which health checker service runs. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.health-checker.%s.timeout-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> The timeout for the health script that&#x2019;s executed. </td></tr>
</tbody>
</table>
<p>The interval and timeout options are not required to be specified. In that case the global configurations will be used.</p></section></section><section>
<h2><a name="NodeManager_Restart"></a>NodeManager Restart</h2><section>
<h3><a name="Introduction"></a>Introduction</h3>
<p>This document gives an overview of NodeManager (NM) restart, a feature that enables the NodeManager to be restarted without losing the active containers running on the node. At a high level, the NM stores any necessary state to a local state-store as it processes container-management requests. When the NM restarts, it recovers by first loading state for various subsystems and then letting those subsystems perform recovery using the loaded state.</p></section><section>
<h3><a name="Enabling_NM_Restart"></a>Enabling NM Restart</h3>
<p>Step 1. To enable NM Restart functionality, set the following property in <b>conf/yarn-site.xml</b> to <i>true</i>.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Property </th>
<th align="left"> Value </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.recovery.enabled</code> </td>
<td align="left"> <code>true</code> (the default value is <code>false</code>) </td></tr>
</tbody>
</table>
<p>Step 2. Configure a path to the local file-system directory where the NodeManager can save its run state.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Property </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.recovery.dir</code> </td>
<td align="left"> The local filesystem directory in which the node manager will store state when recovery is enabled. The default value is set to <code>$hadoop.tmp.dir/yarn-nm-recovery</code>. </td></tr>
</tbody>
</table>
<p>Step 3. Enable NM supervision under recovery to prevent running containers from getting cleaned up when NM exits.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Property </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.recovery.supervised</code> </td>
<td align="left"> If enabled, a running NodeManager will not try to clean up containers as it exits, on the assumption that it will be immediately restarted and will recover the containers. The default value is set to &#x2018;false&#x2019;. </td></tr>
</tbody>
</table>
<p>Step 4. Configure a valid RPC address for the NodeManager.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Property </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.address</code> </td>
<td align="left"> Ephemeral ports (port 0, which is the default) cannot be used for the NodeManager&#x2019;s RPC server specified via yarn.nodemanager.address as it can make NM use different ports before and after a restart. This will break any previously running clients that were communicating with the NM before restart. Explicitly setting yarn.nodemanager.address to an address with a specific port number (e.g. 0.0.0.0:45454) is a precondition for enabling NM restart. </td></tr>
</tbody>
</table>
<p>Step 5. Auxiliary services.</p>
<ul>
<li>
<p>NodeManagers in a YARN cluster can be configured to run auxiliary services. For a completely functional NM restart, YARN relies on any auxiliary service configured to also support recovery. This usually includes (1) avoiding usage of ephemeral ports so that previously running clients (in this case, usually containers) are not disrupted after restart and (2) having the auxiliary service itself support recoverability by reloading any previous state when NodeManager restarts and reinitializes the auxiliary service.</p>
</li>
<li>
<p>A simple example for the above is the auxiliary service &#x2018;ShuffleHandler&#x2019; for MapReduce (MR). ShuffleHandler respects the above two requirements already, so users/admins don&#x2019;t have to do anything for it to support NM restart: (1) The configuration property <b>mapreduce.shuffle.port</b> controls which port the ShuffleHandler on a NodeManager host binds to, and it defaults to a non-ephemeral port. (2) The ShuffleHandler service also already supports recovery of previous state after NM restarts.</p>
</li>
<li>
<p>There are two ways to configure auxiliary services, through a manifest or through the Configuration. Auxiliary services will only be loaded via the prior method of using Configuration properties when an auxiliary services manifest is not enabled. One advantage of using a manifest is that NMs can dynamically reload auxiliary services based on changes to the manifest. To support reloading, AuxiliaryService implementations must perform any cleanup that is needed during the service stop phase for the NM to be able to create a new instance of the auxiliary service.</p>
</li>
</ul></section></section><section>
<h2><a name="Auxiliary_Service_Classpath_Isolation"></a>Auxiliary Service Classpath Isolation</h2><section>
<h3><a name="Introduction"></a>Introduction</h3>
<p>To launch auxiliary services on a NodeManager, users have to add their jar to the NodeManager&#x2019;s classpath directly, thus putting them on the system classloader. But if multiple versions of the plugin are present on the classpath, there is no control over which version actually gets loaded. Or if there are any conflicts between the dependencies introduced by the auxiliary services and the NodeManager itself, they can break the NodeManager, the auxiliary services, or both. To solve this issue, we can instantiate auxiliary services using a classloader that is different from the system classloader.</p></section><section>
<h3><a name="Manifest"></a>Manifest</h3>
<p>This section describes the auxiliary service manifest for aux-service classpath isolation. To use a manifest, the property <code>yarn.nodemanager.aux-services.manifest.enabled</code> must be set to true in <i>yarn-site.xml</i>.</p>
<p>To load the manifest file from a filesystem, set the file path in <i>yarn-site.xml</i> under the property <code>yarn.nodemanager.aux-services.manifest</code>. The NMs will check this file for new modifications at an interval specified by <code>yarn.nodemanager.aux-services.manifest.reload-ms</code> (defaults to 0; setting interval &lt;= 0 means it will not be reloaded automatically). Alternatively, the manifest file may be sent to an NM via REST API by making a PUT call to the endpoint <code>http://nm-http-address:port/ws/v1/node/auxiliaryservices</code>. Note this only updates the manifest on one NM. When it reads a new manifest, the NM will add, remove, or reload auxiliary services as needed based on the service names and versions found in the manifest.</p>
<p>An example manifest that configures classpath isolation for a CustomAuxService follows. One or more files may be specified to make up the classpath of a service, with jar or archive formats being supported.</p>
<div class="source">
<div class="source">
<pre>{
&quot;services&quot;: [
{
&quot;name&quot;: &quot;mapreduce_shuffle&quot;,
&quot;version&quot;: &quot;2&quot;,
&quot;configuration&quot;: {
&quot;properties&quot;: {
&quot;class.name&quot;: &quot;org.apache.hadoop.mapred.ShuffleHandler&quot;,
&quot;mapreduce.shuffle.transfer.buffer.size&quot;: &quot;102400&quot;,
&quot;mapreduce.shuffle.port&quot;: &quot;13562&quot;
}
}
},
{
&quot;name&quot;: &quot;CustomAuxService&quot;,
&quot;version&quot;: &quot;1&quot;,
&quot;configuration&quot;: {
&quot;properties&quot;: {
&quot;class.name&quot;: &quot;org.aux.CustomAuxService&quot;
},
&quot;files&quot;: [
{
&quot;src_file&quot;: &quot;${remote-dir}/CustomAuxService.jar&quot;,
&quot;type&quot;: &quot;STATIC&quot;
},
{
&quot;src_file&quot;: &quot;${remote-dir}/CustomAuxService.tgz&quot;,
&quot;type&quot;: &quot;ARCHIVE&quot;
}
]
}
}
]
}
</pre></div></div>
</section><section>
<h3><a name="Configuration"></a>Configuration</h3>
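<p>This section describes the configuration variables for aux-service classpath isolation. Aux services will only be loaded from the configuration if the auxiliary services manifest is not enabled (that is, <code>yarn.nodemanager.aux-services.manifest.enabled</code> is not set to true).</p>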
<p>The following settings need to be set in <i>yarn-site.xml</i>.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left">Configuration Name </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.aux-services.%s.classpath</code> </td>
<td align="left"> Provide a local directory which includes the related jar file as well as the jar files of all its dependencies. We can specify a single jar file or use ${local_dir_to_jar}/* to load all jars under that directory. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.aux-services.%s.remote-classpath</code> </td>
<td align="left"> Provide a remote absolute or relative path to the jar file (zip, tar.gz, tgz, tar and gz files are also supported). For the same aux-service class, only one of the configurations yarn.nodemanager.aux-services.%s.classpath or yarn.nodemanager.aux-services.%s.remote-classpath can be specified; otherwise a YarnRuntimeException will be thrown. Please also make sure that the owner of the jar file is the same as the NodeManager user and that the permbits satisfy (permbits &amp; 0022)==0 (such as 600, i.e. it&#x2019;s not writable by group or other).</td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.aux-services.%s.system-classes</code> </td>
<td align="left"> Normally, we do not need to set this configuration. The class will be loaded from the customized classpath if it does not belong to system-classes. For example, by default, the package org.apache.hadoop is in the system-classes, so if your class CustomAuxService is in the package org.apache.hadoop, it will not be loaded from the customized classpath. To solve this, we can either change the package for CustomAuxService or configure our own system-classes which exclude org.apache.hadoop. </td></tr>
</tbody>
</table></section><section>
<h3><a name="Configuration_Examples"></a>Configuration Examples</h3>
<div class="source">
<div class="source">
<pre>&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services&lt;/name&gt;
&lt;value&gt;mapreduce_shuffle,CustomAuxService&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services.CustomAuxService.classpath&lt;/name&gt;
&lt;value&gt;${local_dir_to_jar}/CustomAuxService.jar&lt;/value&gt;
&lt;/property&gt;
&lt;!--
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services.CustomAuxService.remote-classpath&lt;/name&gt;
&lt;value&gt;${remote-dir_to_jar}/CustomAuxService.jar&lt;/value&gt;
&lt;/property&gt;
--&gt;
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services.CustomAuxService.class&lt;/name&gt;
&lt;value&gt;org.aux.CustomAuxService&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;yarn.nodemanager.aux-services.mapreduce_shuffle.class&lt;/name&gt;
&lt;value&gt;org.apache.hadoop.mapred.ShuffleHandler&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
</section></section><section>
<h2><a name="Prevent_Container_Logs_From_Getting_Too_Big"></a>Prevent Container Logs From Getting Too Big</h2>
<p>This allows a cluster admin to configure a cluster such that a task attempt will be killed if any container log exceeds a configured size. This helps prevent logs from filling disks and also prevent the need to aggregate enormous logs.</p><section>
<h3><a name="Configuration"></a>Configuration</h3>
<p>The following parameters can be used to configure the container log dir sizes.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Configuration Name </th>
<th align="left"> Allowed Values </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.container-log-monitor.enable</code> </td>
<td align="left"> true, false </td>
<td align="left"> Flag to enable the container log monitor which enforces container log directory size limits. Default is false. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.container-log-monitor.interval-ms</code> </td>
<td align="left"> Positive integer </td>
<td align="left"> How often to check the usage of a container&#x2019;s log directories in milliseconds. Default is 60000 ms. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.nodemanager.container-log-monitor.dir-size-limit-bytes</code> </td>
<td align="left"> Long </td>
<td align="left"> The disk space limit, in bytes, for a single container log directory. Default is 1000000000. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.nodemanager.container-log-monitor.total-size-limit-bytes</code> </td>
<td align="left"> Long </td>
<td align="left"> The disk space limit, in bytes, for all of a container&#x2019;s logs. The default is 10000000000. </td></tr>
</tbody>
</table></section></section><section>
<h2><a name="Scale_Heart-beat_Interval_Based_on_CPU_Utilization"></a>Scale Heart-beat Interval Based on CPU Utilization</h2>
<p>This allows a cluster admin to configure a cluster to allow the heart-beat between the Resource Manager and each NodeManager to be scaled based on the CPU utilization of the node compared to the overall CPU utilization of the cluster.</p><section>
<h3><a name="Configuration"></a>Configuration</h3>
<p>The following parameters can be used to configure the heart-beat interval and whether and how it scales.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th align="left"> Configuration Name </th>
<th align="left"> Allowed Values </th>
<th align="left"> Description </th></tr>
</thead><tbody>
<tr class="b">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-ms</code> </td>
<td align="left"> Long </td>
<td align="left"> Specifies the default heart-beat interval in milliseconds for every NodeManager in the cluster. Default is 1000 ms. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-scaling-enable</code> </td>
<td align="left"> true, false </td>
<td align="left"> Enables heart-beat interval scaling. If true, the NodeManager heart-beat interval will scale based on the difference between the CPU utilization on the node and the cluster-wide average CPU utilization. Default is false. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-min-ms</code> </td>
<td align="left"> Positive Long </td>
<td align="left"> If heart-beat interval scaling is enabled, this is the minimum heart-beat interval in milliseconds. Default is 1000 ms. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-max-ms</code> </td>
<td align="left"> Positive Long </td>
<td align="left"> If heart-beat interval scaling is enabled, this is the maximum heart-beat interval in milliseconds. Default is 1000 ms. </td></tr>
<tr class="b">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-speedup-factor</code> </td>
<td align="left"> Positive Float </td>
<td align="left"> If heart-beat interval scaling is enabled, this controls the degree of adjustment when speeding up heartbeat intervals. At 1.0, 20% less than the average cluster-wide CPU utilization will result in a 20% decrease in the heartbeat interval. Default is 1.0. </td></tr>
<tr class="a">
<td align="left"> <code>yarn.resourcemanager.nodemanagers.heartbeat-interval-slowdown-factor</code> </td>
<td align="left"> Positive Float </td>
<td align="left"> If heart-beat interval scaling is enabled, this controls the degree of adjustment when slowing down heartbeat intervals. At 1.0, 20% greater than the average cluster-wide CPU utilization will result in a 20% increase in the heartbeat interval. Default is 1.0. </td></tr>
</tbody>
</table></section></section>
</div>
</div>
<div class="clear">
<hr/>
</div>
<div id="footer">
<div class="xright">
&#169; 2008-2023
Apache Software Foundation
- <a href="http://maven.apache.org/privacy-policy.html">Privacy Policy</a>.
Apache Maven, Maven, Apache, the Apache feather logo, and the Apache Maven project logos are trademarks of The Apache Software Foundation.
</div>
<div class="clear">
<hr/>
</div>
</div>
</body>
</html>