YARN-3503. Expose disk utilization percentage and bad local and log dir counts in NM metrics. Contributed by Varun Vasudev
This commit is contained in:
parent
bdd90110e6
commit
674c7ef649
|
@ -154,6 +154,9 @@ Release 2.8.0 - UNRELEASED
|
|||
YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
|
||||
Sharmaks via jianhe)
|
||||
|
||||
YARN-3503. Expose disk utilization percentage and bad local and log dir
|
||||
counts in NM metrics. (Varun Vasudev via jianhe)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||
|
|
|
@ -82,6 +82,8 @@ class DirectoryCollection {
|
|||
private float diskUtilizationPercentageCutoff;
|
||||
private long diskUtilizationSpaceCutoff;
|
||||
|
||||
private int goodDirsDiskUtilizationPercentage;
|
||||
|
||||
/**
|
||||
* Create collection for the directories specified. No check for free space.
|
||||
*
|
||||
|
@ -277,6 +279,7 @@ class DirectoryCollection {
|
|||
+ dirsFailedCheck.get(dir).message);
|
||||
}
|
||||
}
|
||||
setGoodDirsDiskUtilizationPercentage();
|
||||
return setChanged;
|
||||
}
|
||||
|
||||
|
@ -390,4 +393,32 @@ class DirectoryCollection {
|
|||
diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
|
||||
this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff;
|
||||
}
|
||||
|
||||
private void setGoodDirsDiskUtilizationPercentage() {
|
||||
|
||||
long totalSpace = 0;
|
||||
long usableSpace = 0;
|
||||
|
||||
for (String dir : localDirs) {
|
||||
File f = new File(dir);
|
||||
if (!f.isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
totalSpace += f.getTotalSpace();
|
||||
usableSpace += f.getUsableSpace();
|
||||
}
|
||||
if (totalSpace != 0) {
|
||||
long tmp = ((totalSpace - usableSpace) * 100) / totalSpace;
|
||||
if (Integer.MIN_VALUE < tmp && Integer.MAX_VALUE > tmp) {
|
||||
goodDirsDiskUtilizationPercentage = (int) tmp;
|
||||
}
|
||||
} else {
|
||||
// got no good dirs
|
||||
goodDirsDiskUtilizationPercentage = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public int getGoodDirsDiskUtilizationPercentage() {
|
||||
return goodDirsDiskUtilizationPercentage;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.hadoop.service.AbstractService;
|
|||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
|
||||
/**
|
||||
* The class which provides functionality of checking the health of the local
|
||||
|
@ -84,6 +85,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
|||
|
||||
private static String FILE_SCHEME = "file";
|
||||
|
||||
private NodeManagerMetrics nodeManagerMetrics = null;
|
||||
|
||||
/**
|
||||
* Class which is used by the {@link Timer} class to periodically execute the
|
||||
* disks' health checker code.
|
||||
|
@ -119,7 +122,12 @@ public class LocalDirsHandlerService extends AbstractService {
|
|||
}
|
||||
|
||||
/**
 * Creates the service without a metrics sink; delegates to the
 * single-argument constructor with a {@code null} argument.
 */
public LocalDirsHandlerService() {
  this(null);
}
|
||||
|
||||
/**
 * Creates the service, named after this class, and remembers the given
 * metrics object.
 *
 * @param nodeManagerMetrics metrics object to store in this service;
 *          may be {@code null}, as the no-argument constructor passes
 *          {@code null} here
 */
public LocalDirsHandlerService(NodeManagerMetrics nodeManagerMetrics) {
  super(LocalDirsHandlerService.class.getName());
  this.nodeManagerMetrics = nodeManagerMetrics;
}
|
||||
|
||||
/**
|
||||
|
@ -389,6 +397,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
|||
updateDirsAfterTest();
|
||||
}
|
||||
|
||||
updateMetrics();
|
||||
|
||||
lastDisksCheckTime = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
|
@ -462,4 +472,15 @@ public class LocalDirsHandlerService extends AbstractService {
|
|||
validPaths.toArray(arrValidPaths);
|
||||
return arrValidPaths;
|
||||
}
|
||||
|
||||
protected void updateMetrics() {
|
||||
if (nodeManagerMetrics != null) {
|
||||
nodeManagerMetrics.setBadLocalDirs(localDirs.getFailedDirs().size());
|
||||
nodeManagerMetrics.setBadLogDirs(logDirs.getFailedDirs().size());
|
||||
nodeManagerMetrics.setGoodLocalDirsDiskUtilizationPerc(
|
||||
localDirs.getGoodDirsDiskUtilizationPercentage());
|
||||
nodeManagerMetrics.setGoodLogDirsDiskUtilizationPerc(
|
||||
logDirs.getGoodDirsDiskUtilizationPercentage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -258,7 +258,7 @@ public class NodeManager extends CompositeService
|
|||
// NodeManager level dispatcher
|
||||
this.dispatcher = new AsyncDispatcher();
|
||||
|
||||
dirsHandler = new LocalDirsHandlerService();
|
||||
dirsHandler = new LocalDirsHandlerService(metrics);
|
||||
nodeHealthChecker =
|
||||
new NodeHealthCheckerService(
|
||||
getNodeHealthScriptRunner(conf), dirsHandler);
|
||||
|
|
|
@ -48,6 +48,15 @@ public class NodeManagerMetrics {
|
|||
@Metric MutableGaugeInt availableVCores;
|
||||
@Metric("Container launch duration")
|
||||
MutableRate containerLaunchDuration;
|
||||
@Metric("# of bad local dirs")
|
||||
MutableGaugeInt badLocalDirs;
|
||||
@Metric("# of bad log dirs")
|
||||
MutableGaugeInt badLogDirs;
|
||||
@Metric("Disk utilization % on good local dirs")
|
||||
MutableGaugeInt goodLocalDirsDiskUtilizationPerc;
|
||||
@Metric("Disk utilization % on good log dirs")
|
||||
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
|
||||
|
||||
|
||||
private long allocatedMB;
|
||||
private long availableMB;
|
||||
|
@ -125,6 +134,24 @@ public class NodeManagerMetrics {
|
|||
containerLaunchDuration.add(value);
|
||||
}
|
||||
|
||||
public void setBadLocalDirs(int badLocalDirs) {
|
||||
this.badLocalDirs.set(badLocalDirs);
|
||||
}
|
||||
|
||||
public void setBadLogDirs(int badLogDirs) {
|
||||
this.badLogDirs.set(badLogDirs);
|
||||
}
|
||||
|
||||
public void setGoodLocalDirsDiskUtilizationPerc(
|
||||
int goodLocalDirsDiskUtilizationPerc) {
|
||||
this.goodLocalDirsDiskUtilizationPerc.set(goodLocalDirsDiskUtilizationPerc);
|
||||
}
|
||||
|
||||
public void setGoodLogDirsDiskUtilizationPerc(
|
||||
int goodLogDirsDiskUtilizationPerc) {
|
||||
this.goodLogDirsDiskUtilizationPerc.set(goodLogDirsDiskUtilizationPerc);
|
||||
}
|
||||
|
||||
public int getRunningContainers() {
|
||||
return containersRunning.value();
|
||||
}
|
||||
|
@ -143,4 +170,25 @@ public class NodeManagerMetrics {
|
|||
public int getCompletedContainers() {
|
||||
return containersCompleted.value();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public int getBadLogDirs() {
|
||||
return badLogDirs.value();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public int getBadLocalDirs() {
|
||||
return badLocalDirs.value();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public int getGoodLogDirsDiskUtilizationPerc() {
|
||||
return goodLogDirsDiskUtilizationPerc.value();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public int getGoodLocalDirsDiskUtilizationPerc() {
|
||||
return goodLocalDirsDiskUtilizationPerc.value();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -129,24 +129,38 @@ public class TestDirectoryCollection {
|
|||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||
// no good dirs
|
||||
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
dc = new DirectoryCollection(dirs, 100.0F);
|
||||
int utilizedSpacePerc =
|
||||
(int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
|
||||
testDir.getTotalSpace());
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertEquals(utilizedSpacePerc,
|
||||
dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(0, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(1, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(1, dc.getFullDirs().size());
|
||||
// no good dirs
|
||||
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
|
||||
|
||||
dc = new DirectoryCollection(dirs, 100.0F, 0);
|
||||
utilizedSpacePerc =
|
||||
(int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
|
||||
testDir.getTotalSpace());
|
||||
dc.checkDirs();
|
||||
Assert.assertEquals(1, dc.getGoodDirs().size());
|
||||
Assert.assertEquals(0, dc.getFailedDirs().size());
|
||||
Assert.assertEquals(0, dc.getFullDirs().size());
|
||||
Assert.assertEquals(utilizedSpacePerc,
|
||||
dc.getGoodDirsDiskUtilizationPercentage());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
|
|||
import org.apache.hadoop.service.Service.STATE;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
|
@ -106,12 +107,40 @@ public class TestLocalDirsHandlerService {
|
|||
conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);
|
||||
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
|
||||
0.0f);
|
||||
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
|
||||
NodeManagerMetrics nm = NodeManagerMetrics.create();
|
||||
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(nm);
|
||||
dirSvc.init(conf);
|
||||
Assert.assertEquals(0, dirSvc.getLocalDirs().size());
|
||||
Assert.assertEquals(0, dirSvc.getLogDirs().size());
|
||||
Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size());
|
||||
Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size());
|
||||
// check the metrics
|
||||
Assert.assertEquals(2, nm.getBadLocalDirs());
|
||||
Assert.assertEquals(2, nm.getBadLogDirs());
|
||||
Assert.assertEquals(0, nm.getGoodLocalDirsDiskUtilizationPerc());
|
||||
Assert.assertEquals(0, nm.getGoodLogDirsDiskUtilizationPerc());
|
||||
|
||||
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
|
||||
100.0f);
|
||||
nm = NodeManagerMetrics.create();
|
||||
dirSvc = new LocalDirsHandlerService(nm);
|
||||
dirSvc.init(conf);
|
||||
Assert.assertEquals(1, dirSvc.getLocalDirs().size());
|
||||
Assert.assertEquals(1, dirSvc.getLogDirs().size());
|
||||
Assert.assertEquals(0, dirSvc.getDiskFullLocalDirs().size());
|
||||
Assert.assertEquals(0, dirSvc.getDiskFullLogDirs().size());
|
||||
// check the metrics
|
||||
File dir = new File(localDir1);
|
||||
int utilizationPerc =
|
||||
(int) ((dir.getTotalSpace() - dir.getUsableSpace()) * 100 /
|
||||
dir.getTotalSpace());
|
||||
Assert.assertEquals(1, nm.getBadLocalDirs());
|
||||
Assert.assertEquals(1, nm.getBadLogDirs());
|
||||
Assert.assertEquals(utilizationPerc,
|
||||
nm.getGoodLocalDirsDiskUtilizationPerc());
|
||||
Assert
|
||||
.assertEquals(utilizationPerc, nm.getGoodLogDirsDiskUtilizationPerc());
|
||||
|
||||
FileUtils.deleteDirectory(new File(localDir1));
|
||||
FileUtils.deleteDirectory(new File(localDir2));
|
||||
FileUtils.deleteDirectory(new File(logDir1));
|
||||
|
|
Loading…
Reference in New Issue