MAPREDUCE-4444. nodemanager fails to start when one of the local-dirs is bad (Jason Lowe via bobby)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1367783 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Joseph Evans 2012-07-31 21:08:40 +00:00
parent 9d42fb2e8e
commit 123c4f57d3
3 changed files with 59 additions and 21 deletions

View File

@ -463,6 +463,9 @@ Release 2.0.0-alpha - 05-23-2012
MAPREDUCE-4483. 2.0 build does not work (John George via bobby)
MAPREDUCE-4444. nodemanager fails to start when one of the local-dirs is
bad (Jason Lowe via bobby)
Release 0.23.3 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -93,23 +93,7 @@ public class LocalDirsHandlerService extends AbstractService {
@Override @Override
public void run() { public void run() {
boolean newFailure = false; checkDirs();
if (localDirs.checkDirs()) {
newFailure = true;
}
if (logDirs.checkDirs()) {
newFailure = true;
}
if (newFailure) {
LOG.info("Disk(s) failed. " + getDisksHealthReport());
updateDirsInConfiguration();
if (!areDisksHealthy()) {
// Just log.
LOG.error("Most of the disks failed. " + getDisksHealthReport());
}
}
lastDisksCheckTime = System.currentTimeMillis();
} }
} }
@ -135,6 +119,10 @@ public class LocalDirsHandlerService extends AbstractService {
YarnConfiguration.DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION); YarnConfiguration.DEFAULT_NM_MIN_HEALTHY_DISKS_FRACTION);
lastDisksCheckTime = System.currentTimeMillis(); lastDisksCheckTime = System.currentTimeMillis();
super.init(conf); super.init(conf);
// Check the disk health immediately to weed out bad directories
// before other init code attempts to use them.
checkDirs();
} }
/** /**
@ -144,10 +132,8 @@ public class LocalDirsHandlerService extends AbstractService {
public void start() { public void start() {
if (isDiskHealthCheckerEnabled) { if (isDiskHealthCheckerEnabled) {
dirsHandlerScheduler = new Timer("DiskHealthMonitor-Timer", true); dirsHandlerScheduler = new Timer("DiskHealthMonitor-Timer", true);
// Start the timer task for disk health checking immediately and dirsHandlerScheduler.scheduleAtFixedRate(monitoringTimerTask,
// then run periodically at interval time. diskHealthCheckInterval, diskHealthCheckInterval);
dirsHandlerScheduler.scheduleAtFixedRate(monitoringTimerTask, 0,
diskHealthCheckInterval);
} }
super.start(); super.start();
} }
@ -253,6 +239,26 @@ public class LocalDirsHandlerService extends AbstractService {
logDirs.toArray(new String[logDirs.size()])); logDirs.toArray(new String[logDirs.size()]));
} }
/**
 * Runs the directory check on both the local dirs and the log dirs and
 * reacts to the outcome.
 *
 * If either check returns {@code true}, the disk health report is logged
 * and {@code updateDirsInConfiguration()} is invoked; when
 * {@code areDisksHealthy()} then reports {@code false}, an additional
 * error is logged (no other action is taken here). The time of this check
 * is always recorded in {@code lastDisksCheckTime}.
 */
private void checkDirs() {
  // Evaluate both checks unconditionally and in this order; neither call
  // may be skipped based on the other's result.
  final boolean localDirsChanged = localDirs.checkDirs();
  final boolean logDirsChanged = logDirs.checkDirs();

  if (localDirsChanged || logDirsChanged) {
    LOG.info("Disk(s) failed. " + getDisksHealthReport());
    updateDirsInConfiguration();
    final boolean healthy = areDisksHealthy();
    if (!healthy) {
      // Only report the condition; nothing else is done at this point.
      LOG.error("Most of the disks failed. " + getDisksHealthReport());
    }
  }

  lastDisksCheckTime = System.currentTimeMillis();
}
public Path getLocalPathForWrite(String pathStr) throws IOException { public Path getLocalPathForWrite(String pathStr) throws IOException {
return localDirsAllocator.getLocalPathForWrite(pathStr, getConfig()); return localDirsAllocator.getLocalPathForWrite(pathStr, getConfig());
} }

View File

@ -110,6 +110,35 @@ public class TestDiskFailures {
testDirsFailures(false); testDirsFailures(false);
} }
/**
 * Verifies that directories which are already unusable when the service is
 * initialized are dropped from the reported local and log directories.
 *
 * Two local dirs and two log dirs are configured; one of each is passed to
 * {@code prepareDirToFail} before {@code init} is called. After
 * initialization only the untouched directory of each kind is expected.
 *
 * @throws IOException declared by the test; presumably raised by the
 *         directory preparation helper (confirm against its definition)
 */
@Test
public void testDirFailuresOnStartup() throws IOException {
  final String badLocalDir = new File(testDir, "localDir1").getPath();
  final String goodLocalDir = new File(testDir, "localDir2").getPath();
  final String goodLogDir = new File(testDir, "logDir1").getPath();
  final String badLogDir = new File(testDir, "logDir2").getPath();

  Configuration conf = new YarnConfiguration();
  conf.set(YarnConfiguration.NM_LOCAL_DIRS,
      badLocalDir + "," + goodLocalDir);
  conf.set(YarnConfiguration.NM_LOG_DIRS,
      goodLogDir + "," + badLogDir);

  // Break one directory of each kind before the service sees them.
  prepareDirToFail(badLocalDir);
  prepareDirToFail(badLogDir);

  LocalDirsHandlerService handler = new LocalDirsHandlerService();
  handler.init(conf);

  // Only the directory that was left alone should remain in each list.
  List<String> remainingLocalDirs = handler.getLocalDirs();
  Assert.assertEquals(1, remainingLocalDirs.size());
  Assert.assertEquals(goodLocalDir, remainingLocalDirs.get(0));

  List<String> remainingLogDirs = handler.getLogDirs();
  Assert.assertEquals(1, remainingLogDirs.size());
  Assert.assertEquals(goodLogDir, remainingLogDirs.get(0));
}
private void testDirsFailures(boolean localORLogDirs) throws IOException { private void testDirsFailures(boolean localORLogDirs) throws IOException {
String dirType = localORLogDirs ? "local" : "log"; String dirType = localORLogDirs ? "local" : "log";
String dirsProperty = localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS String dirsProperty = localORLogDirs ? YarnConfiguration.NM_LOCAL_DIRS