YARN-3503. Expose disk utilization percentage and bad local and log dir counts in NM metrics. Contributed by Varun Vasudev

This commit is contained in:
Jian He 2015-04-21 20:55:59 -07:00
parent bdd90110e6
commit 674c7ef649
7 changed files with 148 additions and 2 deletions

View File

@ -154,6 +154,9 @@ Release 2.8.0 - UNRELEASED
YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
Sharmaks via jianhe)
YARN-3503. Expose disk utilization percentage and bad local and log dir
counts in NM metrics. (Varun Vasudev via jianhe)
OPTIMIZATIONS
YARN-3339. TestDockerContainerExecutor should pull a single image and not

View File

@ -82,6 +82,8 @@ class DirectoryCollection {
private float diskUtilizationPercentageCutoff;
private long diskUtilizationSpaceCutoff;
private int goodDirsDiskUtilizationPercentage;
/**
* Create collection for the directories specified. No check for free space.
*
@ -277,6 +279,7 @@ class DirectoryCollection {
+ dirsFailedCheck.get(dir).message);
}
}
setGoodDirsDiskUtilizationPercentage();
return setChanged;
}
@ -390,4 +393,32 @@ class DirectoryCollection {
diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff;
}
/**
 * Recomputes the combined disk utilization percentage over the entries of
 * {@code localDirs} and stores it in
 * {@code goodDirsDiskUtilizationPercentage}.
 *
 * Entries that are not directories are skipped. Total and usable space are
 * summed over the remaining entries; when the summed total is zero the
 * percentage is reset to 0.
 */
private void setGoodDirsDiskUtilizationPercentage() {
  long capacityBytes = 0;
  long availableBytes = 0;

  for (String path : localDirs) {
    File candidate = new File(path);
    if (candidate.isDirectory()) {
      capacityBytes += candidate.getTotalSpace();
      availableBytes += candidate.getUsableSpace();
    }
  }

  if (capacityBytes == 0) {
    // got no good dirs
    goodDirsDiskUtilizationPercentage = 0;
    return;
  }

  long usedPercent = ((capacityBytes - availableBytes) * 100) / capacityBytes;
  // Narrow to int only when strictly inside the int range; otherwise the
  // previously stored value is left untouched.
  boolean fitsInInt =
      usedPercent > Integer.MIN_VALUE && usedPercent < Integer.MAX_VALUE;
  if (fitsInInt) {
    goodDirsDiskUtilizationPercentage = (int) usedPercent;
  }
}
/**
 * Returns the most recently stored disk utilization percentage.
 *
 * @return the current value of {@code goodDirsDiskUtilizationPercentage}
 */
public int getGoodDirsDiskUtilizationPercentage() {
return goodDirsDiskUtilizationPercentage;
}
}

View File

@ -38,6 +38,7 @@ import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
/**
* The class which provides functionality of checking the health of the local
@ -84,6 +85,8 @@ public class LocalDirsHandlerService extends AbstractService {
private static String FILE_SCHEME = "file";
private NodeManagerMetrics nodeManagerMetrics = null;
/**
* Class which is used by the {@link Timer} class to periodically execute the
* disks' health checker code.
@ -119,7 +122,12 @@ public class LocalDirsHandlerService extends AbstractService {
}
/**
 * Creates the service without a metrics object by delegating to
 * {@link #LocalDirsHandlerService(NodeManagerMetrics)} with {@code null}.
 */
public LocalDirsHandlerService() {
this(null);
}
/**
 * Creates the service and stores the metrics object it should report into.
 *
 * @param nodeManagerMetrics metrics object kept on this instance; may be
 *        {@code null} (the no-argument constructor passes {@code null})
 */
public LocalDirsHandlerService(NodeManagerMetrics nodeManagerMetrics) {
// Pass this class's name to the superclass constructor.
super(LocalDirsHandlerService.class.getName());
this.nodeManagerMetrics = nodeManagerMetrics;
}
/**
@ -389,6 +397,8 @@ public class LocalDirsHandlerService extends AbstractService {
updateDirsAfterTest();
}
updateMetrics();
lastDisksCheckTime = System.currentTimeMillis();
}
@ -462,4 +472,15 @@ public class LocalDirsHandlerService extends AbstractService {
validPaths.toArray(arrValidPaths);
return arrValidPaths;
}
/**
 * Publishes the current directory figures to {@code nodeManagerMetrics}:
 * the failed-directory counts of {@code localDirs} and {@code logDirs}, and
 * each collection's good-dirs disk utilization percentage.
 *
 * Does nothing when no metrics object was supplied.
 */
protected void updateMetrics() {
  if (nodeManagerMetrics == null) {
    // No metrics object to report into (e.g. created via the no-arg
    // constructor, which passes null).
    return;
  }

  int failedLocalDirCount = localDirs.getFailedDirs().size();
  nodeManagerMetrics.setBadLocalDirs(failedLocalDirCount);

  int failedLogDirCount = logDirs.getFailedDirs().size();
  nodeManagerMetrics.setBadLogDirs(failedLogDirCount);

  int localDirsUsedPercent = localDirs.getGoodDirsDiskUtilizationPercentage();
  nodeManagerMetrics.setGoodLocalDirsDiskUtilizationPerc(localDirsUsedPercent);

  int logDirsUsedPercent = logDirs.getGoodDirsDiskUtilizationPercentage();
  nodeManagerMetrics.setGoodLogDirsDiskUtilizationPerc(logDirsUsedPercent);
}
}

View File

@ -258,7 +258,7 @@ public class NodeManager extends CompositeService
// NodeManager level dispatcher
this.dispatcher = new AsyncDispatcher();
dirsHandler = new LocalDirsHandlerService();
dirsHandler = new LocalDirsHandlerService(metrics);
nodeHealthChecker =
new NodeHealthCheckerService(
getNodeHealthScriptRunner(conf), dirsHandler);

View File

@ -48,6 +48,15 @@ public class NodeManagerMetrics {
@Metric MutableGaugeInt availableVCores;
@Metric("Container launch duration")
MutableRate containerLaunchDuration;
@Metric("# of bad local dirs")
MutableGaugeInt badLocalDirs;
@Metric("# of bad log dirs")
MutableGaugeInt badLogDirs;
@Metric("Disk utilization % on good local dirs")
MutableGaugeInt goodLocalDirsDiskUtilizationPerc;
@Metric("Disk utilization % on good log dirs")
MutableGaugeInt goodLogDirsDiskUtilizationPerc;
private long allocatedMB;
private long availableMB;
@ -125,6 +134,24 @@ public class NodeManagerMetrics {
containerLaunchDuration.add(value);
}
/**
 * Sets the {@code badLocalDirs} gauge (metric "# of bad local dirs").
 *
 * @param badLocalDirs value to store in the gauge
 */
public void setBadLocalDirs(int badLocalDirs) {
this.badLocalDirs.set(badLocalDirs);
}
/**
 * Sets the {@code badLogDirs} gauge (metric "# of bad log dirs").
 *
 * @param badLogDirs value to store in the gauge
 */
public void setBadLogDirs(int badLogDirs) {
this.badLogDirs.set(badLogDirs);
}
/**
 * Sets the {@code goodLocalDirsDiskUtilizationPerc} gauge
 * (metric "Disk utilization % on good local dirs").
 *
 * @param goodLocalDirsDiskUtilizationPerc value to store in the gauge
 */
public void setGoodLocalDirsDiskUtilizationPerc(
int goodLocalDirsDiskUtilizationPerc) {
this.goodLocalDirsDiskUtilizationPerc.set(goodLocalDirsDiskUtilizationPerc);
}
/**
 * Sets the {@code goodLogDirsDiskUtilizationPerc} gauge
 * (metric "Disk utilization % on good log dirs").
 *
 * @param goodLogDirsDiskUtilizationPerc value to store in the gauge
 */
public void setGoodLogDirsDiskUtilizationPerc(
int goodLogDirsDiskUtilizationPerc) {
this.goodLogDirsDiskUtilizationPerc.set(goodLogDirsDiskUtilizationPerc);
}
/**
 * Returns the current value of the {@code containersRunning} metric.
 *
 * @return the value reported by {@code containersRunning.value()}
 */
public int getRunningContainers() {
return containersRunning.value();
}
@ -143,4 +170,25 @@ public class NodeManagerMetrics {
/**
 * Returns the current value of the {@code containersCompleted} metric.
 *
 * @return the value reported by {@code containersCompleted.value()}
 */
public int getCompletedContainers() {
return containersCompleted.value();
}
/**
 * Returns the current value of the {@code badLogDirs} gauge
 * (metric "# of bad log dirs"). Marked as exposed for tests.
 *
 * @return the value reported by {@code badLogDirs.value()}
 */
@VisibleForTesting
public int getBadLogDirs() {
return badLogDirs.value();
}
/**
 * Returns the current value of the {@code badLocalDirs} gauge
 * (metric "# of bad local dirs"). Marked as exposed for tests.
 *
 * @return the value reported by {@code badLocalDirs.value()}
 */
@VisibleForTesting
public int getBadLocalDirs() {
return badLocalDirs.value();
}
/**
 * Returns the current value of the {@code goodLogDirsDiskUtilizationPerc}
 * gauge (metric "Disk utilization % on good log dirs"). Marked as exposed
 * for tests.
 *
 * @return the value reported by
 *         {@code goodLogDirsDiskUtilizationPerc.value()}
 */
@VisibleForTesting
public int getGoodLogDirsDiskUtilizationPerc() {
return goodLogDirsDiskUtilizationPerc.value();
}
/**
 * Returns the current value of the {@code goodLocalDirsDiskUtilizationPerc}
 * gauge (metric "Disk utilization % on good local dirs"). Marked as exposed
 * for tests.
 *
 * @return the value reported by
 *         {@code goodLocalDirsDiskUtilizationPerc.value()}
 */
@VisibleForTesting
public int getGoodLocalDirsDiskUtilizationPerc() {
return goodLocalDirsDiskUtilizationPerc.value();
}
}

View File

@ -129,24 +129,38 @@ public class TestDirectoryCollection {
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, 100.0F);
int utilizedSpacePerc =
(int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
dc.checkDirs();
Assert.assertEquals(0, dc.getGoodDirs().size());
Assert.assertEquals(1, dc.getFailedDirs().size());
Assert.assertEquals(1, dc.getFullDirs().size());
// no good dirs
Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
dc = new DirectoryCollection(dirs, 100.0F, 0);
utilizedSpacePerc =
(int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
testDir.getTotalSpace());
dc.checkDirs();
Assert.assertEquals(1, dc.getGoodDirs().size());
Assert.assertEquals(0, dc.getFailedDirs().size());
Assert.assertEquals(0, dc.getFullDirs().size());
Assert.assertEquals(utilizedSpacePerc,
dc.getGoodDirsDiskUtilizationPercentage());
}
@Test

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.service.Service.STATE;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@ -106,12 +107,40 @@ public class TestLocalDirsHandlerService {
conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
0.0f);
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
NodeManagerMetrics nm = NodeManagerMetrics.create();
LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(nm);
dirSvc.init(conf);
Assert.assertEquals(0, dirSvc.getLocalDirs().size());
Assert.assertEquals(0, dirSvc.getLogDirs().size());
Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size());
Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size());
// check the metrics
Assert.assertEquals(2, nm.getBadLocalDirs());
Assert.assertEquals(2, nm.getBadLogDirs());
Assert.assertEquals(0, nm.getGoodLocalDirsDiskUtilizationPerc());
Assert.assertEquals(0, nm.getGoodLogDirsDiskUtilizationPerc());
conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
100.0f);
nm = NodeManagerMetrics.create();
dirSvc = new LocalDirsHandlerService(nm);
dirSvc.init(conf);
Assert.assertEquals(1, dirSvc.getLocalDirs().size());
Assert.assertEquals(1, dirSvc.getLogDirs().size());
Assert.assertEquals(0, dirSvc.getDiskFullLocalDirs().size());
Assert.assertEquals(0, dirSvc.getDiskFullLogDirs().size());
// check the metrics
File dir = new File(localDir1);
int utilizationPerc =
(int) ((dir.getTotalSpace() - dir.getUsableSpace()) * 100 /
dir.getTotalSpace());
Assert.assertEquals(1, nm.getBadLocalDirs());
Assert.assertEquals(1, nm.getBadLogDirs());
Assert.assertEquals(utilizationPerc,
nm.getGoodLocalDirsDiskUtilizationPerc());
Assert
.assertEquals(utilizationPerc, nm.getGoodLogDirsDiskUtilizationPerc());
FileUtils.deleteDirectory(new File(localDir1));
FileUtils.deleteDirectory(new File(localDir2));
FileUtils.deleteDirectory(new File(logDir1));