YARN-3503. Expose disk utilization percentage and bad local and log dir counts in NM metrics. Contributed by Varun Vasudev
(cherry picked from commit 674c7ef649)
Conflicts:
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
This commit is contained in:
parent
dc4698bb33
commit
613a783380
|
@ -106,6 +106,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
|
YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
|
||||||
Sharmaks via jianhe)
|
Sharmaks via jianhe)
|
||||||
|
|
||||||
|
YARN-3503. Expose disk utilization percentage and bad local and log dir
|
||||||
|
counts in NM metrics. (Varun Vasudev via jianhe)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||||
|
|
|
@ -82,6 +82,8 @@ class DirectoryCollection {
|
||||||
private float diskUtilizationPercentageCutoff;
|
private float diskUtilizationPercentageCutoff;
|
||||||
private long diskUtilizationSpaceCutoff;
|
private long diskUtilizationSpaceCutoff;
|
||||||
|
|
||||||
|
private int goodDirsDiskUtilizationPercentage;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create collection for the directories specified. No check for free space.
|
* Create collection for the directories specified. No check for free space.
|
||||||
*
|
*
|
||||||
|
@ -277,6 +279,7 @@ class DirectoryCollection {
|
||||||
+ dirsFailedCheck.get(dir).message);
|
+ dirsFailedCheck.get(dir).message);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
setGoodDirsDiskUtilizationPercentage();
|
||||||
return setChanged;
|
return setChanged;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -390,4 +393,32 @@ class DirectoryCollection {
|
||||||
diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
|
diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
|
||||||
this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff;
|
this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void setGoodDirsDiskUtilizationPercentage() {
|
||||||
|
|
||||||
|
long totalSpace = 0;
|
||||||
|
long usableSpace = 0;
|
||||||
|
|
||||||
|
for (String dir : localDirs) {
|
||||||
|
File f = new File(dir);
|
||||||
|
if (!f.isDirectory()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
totalSpace += f.getTotalSpace();
|
||||||
|
usableSpace += f.getUsableSpace();
|
||||||
|
}
|
||||||
|
if (totalSpace != 0) {
|
||||||
|
long tmp = ((totalSpace - usableSpace) * 100) / totalSpace;
|
||||||
|
if (Integer.MIN_VALUE < tmp && Integer.MAX_VALUE > tmp) {
|
||||||
|
goodDirsDiskUtilizationPercentage = (int) tmp;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// got no good dirs
|
||||||
|
goodDirsDiskUtilizationPercentage = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getGoodDirsDiskUtilizationPercentage() {
|
||||||
|
return goodDirsDiskUtilizationPercentage;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.hadoop.service.AbstractService;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The class which provides functionality of checking the health of the local
|
* The class which provides functionality of checking the health of the local
|
||||||
|
@ -84,6 +85,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||||
|
|
||||||
private static String FILE_SCHEME = "file";
|
private static String FILE_SCHEME = "file";
|
||||||
|
|
||||||
|
private NodeManagerMetrics nodeManagerMetrics = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class which is used by the {@link Timer} class to periodically execute the
|
* Class which is used by the {@link Timer} class to periodically execute the
|
||||||
* disks' health checker code.
|
* disks' health checker code.
|
||||||
|
@ -119,7 +122,12 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||||
}
|
}
|
||||||
|
|
||||||
public LocalDirsHandlerService() {
|
public LocalDirsHandlerService() {
|
||||||
|
this(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LocalDirsHandlerService(NodeManagerMetrics nodeManagerMetrics) {
|
||||||
super(LocalDirsHandlerService.class.getName());
|
super(LocalDirsHandlerService.class.getName());
|
||||||
|
this.nodeManagerMetrics = nodeManagerMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -389,6 +397,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||||
updateDirsAfterTest();
|
updateDirsAfterTest();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
updateMetrics();
|
||||||
|
|
||||||
lastDisksCheckTime = System.currentTimeMillis();
|
lastDisksCheckTime = System.currentTimeMillis();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -462,4 +472,15 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||||
validPaths.toArray(arrValidPaths);
|
validPaths.toArray(arrValidPaths);
|
||||||
return arrValidPaths;
|
return arrValidPaths;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void updateMetrics() {
|
||||||
|
if (nodeManagerMetrics != null) {
|
||||||
|
nodeManagerMetrics.setBadLocalDirs(localDirs.getFailedDirs().size());
|
||||||
|
nodeManagerMetrics.setBadLogDirs(logDirs.getFailedDirs().size());
|
||||||
|
nodeManagerMetrics.setGoodLocalDirsDiskUtilizationPerc(
|
||||||
|
localDirs.getGoodDirsDiskUtilizationPercentage());
|
||||||
|
nodeManagerMetrics.setGoodLogDirsDiskUtilizationPerc(
|
||||||
|
logDirs.getGoodDirsDiskUtilizationPercentage());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -239,6 +239,8 @@ public class NodeManager extends CompositeService
|
||||||
this.dispatcher = new AsyncDispatcher();
|
this.dispatcher = new AsyncDispatcher();
|
||||||
|
|
||||||
nodeHealthChecker = new NodeHealthCheckerService();
|
nodeHealthChecker = new NodeHealthCheckerService();
|
||||||
|
dirsHandler = new LocalDirsHandlerService(metrics);
|
||||||
|
|
||||||
addService(nodeHealthChecker);
|
addService(nodeHealthChecker);
|
||||||
dirsHandler = nodeHealthChecker.getDiskHandler();
|
dirsHandler = nodeHealthChecker.getDiskHandler();
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,15 @@ public class NodeManagerMetrics {
|
||||||
@Metric MutableGaugeInt availableVCores;
|
@Metric MutableGaugeInt availableVCores;
|
||||||
@Metric("Container launch duration")
|
@Metric("Container launch duration")
|
||||||
MutableRate containerLaunchDuration;
|
MutableRate containerLaunchDuration;
|
||||||
|
@Metric("# of bad local dirs")
|
||||||
|
MutableGaugeInt badLocalDirs;
|
||||||
|
@Metric("# of bad log dirs")
|
||||||
|
MutableGaugeInt badLogDirs;
|
||||||
|
@Metric("Disk utilization % on good local dirs")
|
||||||
|
MutableGaugeInt goodLocalDirsDiskUtilizationPerc;
|
||||||
|
@Metric("Disk utilization % on good log dirs")
|
||||||
|
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
|
||||||
|
|
||||||
|
|
||||||
private long allocatedMB;
|
private long allocatedMB;
|
||||||
private long availableMB;
|
private long availableMB;
|
||||||
|
@ -125,6 +134,24 @@ public class NodeManagerMetrics {
|
||||||
containerLaunchDuration.add(value);
|
containerLaunchDuration.add(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setBadLocalDirs(int badLocalDirs) {
|
||||||
|
this.badLocalDirs.set(badLocalDirs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBadLogDirs(int badLogDirs) {
|
||||||
|
this.badLogDirs.set(badLogDirs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setGoodLocalDirsDiskUtilizationPerc(
|
||||||
|
int goodLocalDirsDiskUtilizationPerc) {
|
||||||
|
this.goodLocalDirsDiskUtilizationPerc.set(goodLocalDirsDiskUtilizationPerc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setGoodLogDirsDiskUtilizationPerc(
|
||||||
|
int goodLogDirsDiskUtilizationPerc) {
|
||||||
|
this.goodLogDirsDiskUtilizationPerc.set(goodLogDirsDiskUtilizationPerc);
|
||||||
|
}
|
||||||
|
|
||||||
public int getRunningContainers() {
|
public int getRunningContainers() {
|
||||||
return containersRunning.value();
|
return containersRunning.value();
|
||||||
}
|
}
|
||||||
|
@ -143,4 +170,25 @@ public class NodeManagerMetrics {
|
||||||
public int getCompletedContainers() {
|
public int getCompletedContainers() {
|
||||||
return containersCompleted.value();
|
return containersCompleted.value();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public int getBadLogDirs() {
|
||||||
|
return badLogDirs.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public int getBadLocalDirs() {
|
||||||
|
return badLocalDirs.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public int getGoodLogDirsDiskUtilizationPerc() {
|
||||||
|
return goodLogDirsDiskUtilizationPerc.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public int getGoodLocalDirsDiskUtilizationPerc() {
|
||||||
|
return goodLocalDirsDiskUtilizationPerc.value();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -129,24 +129,38 @@ public class TestDirectoryCollection {
|
||||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||||
|
// no good dirs
|
||||||
|
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||||
|
|
||||||
dc = new DirectoryCollection(dirs, 100.0F);
|
dc = new DirectoryCollection(dirs, 100.0F);
|
||||||
|
int utilizedSpacePerc =
|
||||||
|
(int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
|
||||||
|
testDir.getTotalSpace());
|
||||||
dc.checkDirs();
|
dc.checkDirs();
|
||||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||||
|
Assert.assertEquals(utilizedSpacePerc,
|
||||||
|
dc.getGoodDirsDiskUtilizationPercentage());
|
||||||
|
|
||||||
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
|
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
|
||||||
dc.checkDirs();
|
dc.checkDirs();
|
||||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||||
|
// no good dirs
|
||||||
|
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||||
|
|
||||||
dc = new DirectoryCollection(dirs, 100.0F, 0);
|
dc = new DirectoryCollection(dirs, 100.0F, 0);
|
||||||
|
utilizedSpacePerc =
|
||||||
|
(int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
|
||||||
|
testDir.getTotalSpace());
|
||||||
dc.checkDirs();
|
dc.checkDirs();
|
||||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||||
|
Assert.assertEquals(utilizedSpacePerc,
|
||||||
|
dc.getGoodDirsDiskUtilizationPercentage());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
|
||||||
import org.apache.hadoop.service.Service.STATE;
|
import org.apache.hadoop.service.Service.STATE;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -106,12 +107,40 @@ public class TestLocalDirsHandlerService {
|
||||||
conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);
|
conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);
|
||||||
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
|
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
|
||||||
0.0f);
|
0.0f);
|
||||||
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
|
NodeManagerMetrics nm = NodeManagerMetrics.create();
|
||||||
|
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(nm);
|
||||||
dirSvc.init(conf);
|
dirSvc.init(conf);
|
||||||
Assert.assertEquals(0, dirSvc.getLocalDirs().size());
|
Assert.assertEquals(0, dirSvc.getLocalDirs().size());
|
||||||
Assert.assertEquals(0, dirSvc.getLogDirs().size());
|
Assert.assertEquals(0, dirSvc.getLogDirs().size());
|
||||||
Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size());
|
Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size());
|
||||||
Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size());
|
Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size());
|
||||||
|
// check the metrics
|
||||||
|
Assert.assertEquals(2, nm.getBadLocalDirs());
|
||||||
|
Assert.assertEquals(2, nm.getBadLogDirs());
|
||||||
|
Assert.assertEquals(0, nm.getGoodLocalDirsDiskUtilizationPerc());
|
||||||
|
Assert.assertEquals(0, nm.getGoodLogDirsDiskUtilizationPerc());
|
||||||
|
|
||||||
|
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
|
||||||
|
100.0f);
|
||||||
|
nm = NodeManagerMetrics.create();
|
||||||
|
dirSvc = new LocalDirsHandlerService(nm);
|
||||||
|
dirSvc.init(conf);
|
||||||
|
Assert.assertEquals(1, dirSvc.getLocalDirs().size());
|
||||||
|
Assert.assertEquals(1, dirSvc.getLogDirs().size());
|
||||||
|
Assert.assertEquals(0, dirSvc.getDiskFullLocalDirs().size());
|
||||||
|
Assert.assertEquals(0, dirSvc.getDiskFullLogDirs().size());
|
||||||
|
// check the metrics
|
||||||
|
File dir = new File(localDir1);
|
||||||
|
int utilizationPerc =
|
||||||
|
(int) ((dir.getTotalSpace() - dir.getUsableSpace()) * 100 /
|
||||||
|
dir.getTotalSpace());
|
||||||
|
Assert.assertEquals(1, nm.getBadLocalDirs());
|
||||||
|
Assert.assertEquals(1, nm.getBadLogDirs());
|
||||||
|
Assert.assertEquals(utilizationPerc,
|
||||||
|
nm.getGoodLocalDirsDiskUtilizationPerc());
|
||||||
|
Assert
|
||||||
|
.assertEquals(utilizationPerc, nm.getGoodLogDirsDiskUtilizationPerc());
|
||||||
|
|
||||||
FileUtils.deleteDirectory(new File(localDir1));
|
FileUtils.deleteDirectory(new File(localDir1));
|
||||||
FileUtils.deleteDirectory(new File(localDir2));
|
FileUtils.deleteDirectory(new File(localDir2));
|
||||||
FileUtils.deleteDirectory(new File(logDir1));
|
FileUtils.deleteDirectory(new File(logDir1));
|
||||||
|
|
Loading…
Reference in New Issue