YARN-8775. TestDiskFailures.testLocalDirsFailures sometimes can fail on concurrent File modifications. (Contributed by Antal Bálint Steinbach)

This commit is contained in:
Haibo Chen 2018-10-15 09:37:20 -07:00
parent fa94d370b6
commit f880ff418c
2 changed files with 19 additions and 24 deletions

View File

@ -27,6 +27,8 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.Timer; import java.util.Timer;
import java.util.TimerTask; import java.util.TimerTask;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskValidator; import org.apache.hadoop.util.DiskValidator;
import org.apache.hadoop.util.DiskValidatorFactory; import org.apache.hadoop.util.DiskValidatorFactory;
@ -493,7 +495,8 @@ public class LocalDirsHandlerService extends AbstractService {
} }
private void checkDirs() { @VisibleForTesting
public void checkDirs() {
boolean disksStatusChange = false; boolean disksStatusChange = false;
Set<String> failedLocalDirsPreCheck = Set<String> failedLocalDirsPreCheck =
new HashSet<String>(localDirs.getFailedDirs()); new HashSet<String>(localDirs.getFailedDirs());

View File

@ -27,7 +27,6 @@ import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.MiniYARNCluster;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
@ -56,7 +55,12 @@ public class TestDiskFailures {
private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class); private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class);
private static final long DISK_HEALTH_CHECK_INTERVAL = 1000;//1 sec /*
* Set disk check interval high enough so that it never runs during the test.
* Checks will be called manually if necessary.
*/
private static final long TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL =
1000 * 60 * 60 * 24;
private static FileContext localFS = null; private static FileContext localFS = null;
private static final File testDir = new File("target", private static final File testDir = new File("target",
@ -146,9 +150,10 @@ public class TestDiskFailures {
: YarnConfiguration.NM_LOG_DIRS; : YarnConfiguration.NM_LOG_DIRS;
Configuration conf = new Configuration(); Configuration conf = new Configuration();
// set disk health check interval to a small value (say 1 sec). // set disk health check interval to a large value to effectively disable
// disk health check done internally in LocalDirsHandlerService
conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS, conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
DISK_HEALTH_CHECK_INTERVAL); TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL);
// If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4 // If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
// log-dirs fail, then the node's health status should become unhealthy. // log-dirs fail, then the node's health status should become unhealthy.
@ -202,22 +207,6 @@ public class TestDiskFailures {
verifyDisksHealth(localORLogDirs, expectedDirs, false); verifyDisksHealth(localORLogDirs, expectedDirs, false);
} }
/**
* Wait for the NodeManager to go for the disk-health-check at least once.
*/
private void waitForDiskHealthCheck() {
long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
long time = lastDisksCheckTime;
for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
try {
Thread.sleep(1000);
} catch(InterruptedException e) {
LOG.error(
"Interrupted while waiting for NodeManager's disk health check.");
}
time = dirsHandler.getLastDisksCheckTime();
}
}
/** /**
* Verify if the NodeManager could identify disk failures. * Verify if the NodeManager could identify disk failures.
@ -228,8 +217,8 @@ public class TestDiskFailures {
*/ */
private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
boolean isHealthy) { boolean isHealthy) {
// Wait for the NodeManager to identify disk failures. // identify disk failures
waitForDiskHealthCheck(); dirsHandler.checkDirs();
List<String> list = localORLogDirs ? dirsHandler.getLocalDirs() List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
: dirsHandler.getLogDirs(); : dirsHandler.getLogDirs();
@ -272,7 +261,10 @@ public class TestDiskFailures {
*/ */
private void prepareDirToFail(String dir) throws IOException { private void prepareDirToFail(String dir) throws IOException {
File file = new File(dir); File file = new File(dir);
FileUtil.fullyDelete(file); if(!FileUtil.fullyDelete(file)) {
throw new IOException("Delete of file was unsuccessful! Path: " +
file.getAbsolutePath());
}
file.createNewFile(); file.createNewFile();
LOG.info("Prepared " + dir + " to fail."); LOG.info("Prepared " + dir + " to fail.");
} }