YARN-8775. TestDiskFailures.testLocalDirsFailures sometimes can fail on concurrent File modifications. (Contributed by Antal Bálint Steinbach)
This commit is contained in:
parent
fa94d370b6
commit
f880ff418c
|
@ -27,6 +27,8 @@ import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.Timer;
|
import java.util.Timer;
|
||||||
import java.util.TimerTask;
|
import java.util.TimerTask;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
|
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
|
||||||
import org.apache.hadoop.util.DiskValidator;
|
import org.apache.hadoop.util.DiskValidator;
|
||||||
import org.apache.hadoop.util.DiskValidatorFactory;
|
import org.apache.hadoop.util.DiskValidatorFactory;
|
||||||
|
@ -493,7 +495,8 @@ public class LocalDirsHandlerService extends AbstractService {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkDirs() {
|
@VisibleForTesting
|
||||||
|
public void checkDirs() {
|
||||||
boolean disksStatusChange = false;
|
boolean disksStatusChange = false;
|
||||||
Set<String> failedLocalDirsPreCheck =
|
Set<String> failedLocalDirsPreCheck =
|
||||||
new HashSet<String>(localDirs.getFailedDirs());
|
new HashSet<String>(localDirs.getFailedDirs());
|
||||||
|
|
|
@ -27,7 +27,6 @@ import org.apache.hadoop.security.AccessControlException;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
import org.apache.hadoop.yarn.api.records.NodeState;
|
import org.apache.hadoop.yarn.api.records.NodeState;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.server.MiniYARNCluster;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
|
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
|
@ -56,7 +55,12 @@ public class TestDiskFailures {
|
||||||
|
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class);
|
private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class);
|
||||||
|
|
||||||
private static final long DISK_HEALTH_CHECK_INTERVAL = 1000;//1 sec
|
/*
|
||||||
|
* Set disk check interval high enough so that it never runs during the test.
|
||||||
|
* Checks will be called manually if necessary.
|
||||||
|
*/
|
||||||
|
private static final long TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL =
|
||||||
|
1000 * 60 * 60 * 24;
|
||||||
|
|
||||||
private static FileContext localFS = null;
|
private static FileContext localFS = null;
|
||||||
private static final File testDir = new File("target",
|
private static final File testDir = new File("target",
|
||||||
|
@ -146,9 +150,10 @@ public class TestDiskFailures {
|
||||||
: YarnConfiguration.NM_LOG_DIRS;
|
: YarnConfiguration.NM_LOG_DIRS;
|
||||||
|
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
// set disk health check interval to a small value (say 1 sec).
|
// set disk health check interval to a large value to effectively disable
|
||||||
|
// disk health check done internally in LocalDirsHandlerService.
|
||||||
conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
|
conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
|
||||||
DISK_HEALTH_CHECK_INTERVAL);
|
TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL);
|
||||||
|
|
||||||
// If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
|
// If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
|
||||||
// log-dirs fail, then the node's health status should become unhealthy.
|
// log-dirs fail, then the node's health status should become unhealthy.
|
||||||
|
@ -202,22 +207,6 @@ public class TestDiskFailures {
|
||||||
verifyDisksHealth(localORLogDirs, expectedDirs, false);
|
verifyDisksHealth(localORLogDirs, expectedDirs, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Wait for the NodeManager to go for the disk-health-check at least once.
|
|
||||||
*/
|
|
||||||
private void waitForDiskHealthCheck() {
|
|
||||||
long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
|
|
||||||
long time = lastDisksCheckTime;
|
|
||||||
for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
|
|
||||||
try {
|
|
||||||
Thread.sleep(1000);
|
|
||||||
} catch(InterruptedException e) {
|
|
||||||
LOG.error(
|
|
||||||
"Interrupted while waiting for NodeManager's disk health check.");
|
|
||||||
}
|
|
||||||
time = dirsHandler.getLastDisksCheckTime();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verify if the NodeManager could identify disk failures.
|
* Verify if the NodeManager could identify disk failures.
|
||||||
|
@ -228,8 +217,8 @@ public class TestDiskFailures {
|
||||||
*/
|
*/
|
||||||
private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
|
private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
|
||||||
boolean isHealthy) {
|
boolean isHealthy) {
|
||||||
// Wait for the NodeManager to identify disk failures.
|
// identify disk failures
|
||||||
waitForDiskHealthCheck();
|
dirsHandler.checkDirs();
|
||||||
|
|
||||||
List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
|
List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
|
||||||
: dirsHandler.getLogDirs();
|
: dirsHandler.getLogDirs();
|
||||||
|
@ -272,7 +261,10 @@ public class TestDiskFailures {
|
||||||
*/
|
*/
|
||||||
private void prepareDirToFail(String dir) throws IOException {
|
private void prepareDirToFail(String dir) throws IOException {
|
||||||
File file = new File(dir);
|
File file = new File(dir);
|
||||||
FileUtil.fullyDelete(file);
|
if(!FileUtil.fullyDelete(file)) {
|
||||||
|
throw new IOException("Delete of file was unsuccessful! Path: " +
|
||||||
|
file.getAbsolutePath());
|
||||||
|
}
|
||||||
file.createNewFile();
|
file.createNewFile();
|
||||||
LOG.info("Prepared " + dir + " to fail.");
|
LOG.info("Prepared " + dir + " to fail.");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue