HDFS-2186. DN volume failures on startup are not counted. Contributed by Eli Collins

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1156974 13f79535-47bb-0310-9956-ffa450edef68
Eli Collins 2011-08-12 06:09:35 +00:00
parent 8061bb59c0
commit 1c2ab728f5
3 changed files with 44 additions and 9 deletions


@@ -959,6 +959,8 @@ Trunk (unreleased changes)
     HDFS-2235. Encode servlet paths. (eli)
 
+    HDFS-2186. DN volume failures on startup are not counted. (eli)
+
 BREAKDOWN OF HDFS-1073 SUBTASKS
 
   HDFS-1521. Persist transaction ID on disk between NN restarts.


@@ -775,12 +775,13 @@ public class FSDataset implements FSDatasetInterface {
      */
     private volatile List<FSVolume> volumes = null;
     BlockVolumeChoosingPolicy blockChooser;
-    int numFailedVolumes = 0;
+    int numFailedVolumes;
 
-    FSVolumeSet(FSVolume[] volumes, BlockVolumeChoosingPolicy blockChooser) {
+    FSVolumeSet(FSVolume[] volumes, int failedVols, BlockVolumeChoosingPolicy blockChooser) {
       List<FSVolume> list = Arrays.asList(volumes);
       this.volumes = Collections.unmodifiableList(list);
       this.blockChooser = blockChooser;
+      this.numFailedVolumes = failedVols;
     }
 
     private int numberOfVolumes() {
@@ -1144,15 +1145,19 @@ public class FSDataset implements FSDatasetInterface {
     String[] dataDirs = conf.getTrimmedStrings(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY);
     int volsConfigured = (dataDirs == null) ? 0 : dataDirs.length;
+    int volsFailed = volsConfigured - storage.getNumStorageDirs();
     this.validVolsRequired = volsConfigured - volFailuresTolerated;
-    if (validVolsRequired < 1
-        || validVolsRequired > storage.getNumStorageDirs()) {
+
+    if (volFailuresTolerated < 0 || volFailuresTolerated >= volsConfigured) {
+      throw new DiskErrorException("Invalid volume failure "
+          + " config value: " + volFailuresTolerated);
+    }
+
+    if (volsFailed > volFailuresTolerated) {
       throw new DiskErrorException("Too many failed volumes - "
           + "current valid volumes: " + storage.getNumStorageDirs()
           + ", volumes configured: " + volsConfigured
-          + ", volume failures tolerated: " + volFailuresTolerated );
+          + ", volumes failed: " + volsFailed
+          + ", volume failures tolerated: " + volFailuresTolerated);
     }
 
     FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()];
@@ -1170,7 +1175,7 @@ public class FSDataset implements FSDatasetInterface {
             RoundRobinVolumesPolicy.class,
             BlockVolumeChoosingPolicy.class),
         conf);
-    volumes = new FSVolumeSet(volArray, blockChooserImpl);
+    volumes = new FSVolumeSet(volArray, volsFailed, blockChooserImpl);
     volumes.getVolumeMap(volumeMap);
 
     File[] roots = new File[storage.getNumStorageDirs()];
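
Read together, the two FSDataset hunks change startup behavior from silently zeroing the failure count (numFailedVolumes = 0) to deriving it from the configured data dirs, validating it, and passing it into FSVolumeSet so it reaches the NameNode. A condensed, self-contained sketch of that accounting; the class and method names below are illustrative, not part of FSDataset:

    // Illustrative sketch of the startup volume accounting introduced
    // above; countStartupFailures() is a hypothetical helper.
    class VolumeFailureAccounting {
      static int countStartupFailures(int volsConfigured, int usableDirs,
          int volFailuresTolerated) {
        // Configured data dirs that DataStorage could not load are exactly
        // the volumes that failed before the DataNode came up.
        int volsFailed = volsConfigured - usableDirs;
        // Tolerating a negative number of failures, or every configured
        // volume, is a misconfiguration.
        if (volFailuresTolerated < 0 || volFailuresTolerated >= volsConfigured) {
          throw new IllegalArgumentException(
              "Invalid volume failure config value: " + volFailuresTolerated);
        }
        // Refuse to start when more volumes failed than the operator allows.
        if (volsFailed > volFailuresTolerated) {
          throw new IllegalStateException(
              "Too many failed volumes: " + volsFailed);
        }
        // Previously this count was dropped; now it is handed to FSVolumeSet
        // and surfaces in DataNode volume-failure metrics.
        return volsFailed;
      }
    }

For example, countStartupFailures(3, 2, 1) returns 1 (three dirs configured, two loaded, one failure tolerated), while countStartupFailures(3, 1, 1) throws, matching the "Too many failed volumes" path in the diff.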


@@ -24,6 +24,7 @@ import static org.junit.Assume.assumeTrue;
 import java.io.File;
 import java.io.IOException;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -37,6 +38,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.log4j.Level;
 import org.junit.After;
 import org.junit.Before;
@@ -189,7 +191,7 @@ public class TestDataNodeVolumeFailureToleration {
    */
   private void restartDatanodes(int volTolerated, boolean manageDfsDirs)
       throws IOException {
-    //Make sure no datanode is running
+    // Make sure no datanode is running
     cluster.shutdownDataNodes();
     conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, volTolerated);
     cluster.startDataNodes(conf, 1, manageDfsDirs, null, null);
@@ -226,7 +228,7 @@ public class TestDataNodeVolumeFailureToleration {
    */
   private void testVolumeConfig(int volumesTolerated, int volumesFailed,
       boolean expectedBPServiceState, boolean manageDfsDirs)
-      throws IOException, InterruptedException {
+      throws IOException, InterruptedException, TimeoutException {
     assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
     final int dnIndex = 0;
     // Fail the current directory since invalid storage directory perms
@@ -261,4 +263,30 @@ public class TestDataNodeVolumeFailureToleration {
     assertEquals("Couldn't chmod local vol", 0,
         FileUtil.chmod(dir.toString(), "000"));
   }
+
+  /**
+   * Test that a volume that is considered failed on startup is seen as
+   * a failed volume by the NN.
+   */
+  @Test
+  public void testFailedVolumeOnStartupIsCounted() throws Exception {
+    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
+    final DatanodeManager dm = cluster.getNamesystem().getBlockManager(
+        ).getDatanodeManager();
+    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
+    File dir = new File(MiniDFSCluster.getStorageDir(0, 0), "current");
+    try {
+      prepareDirToFail(dir);
+      restartDatanodes(1, false);
+      // The cluster is up..
+      assertEquals(true, cluster.getDataNodes().get(0)
+          .isBPServiceAlive(cluster.getNamesystem().getBlockPoolId()));
+      // but there has been a single volume failure
+      DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1,
+          origCapacity / 2, WAIT_FOR_HEARTBEATS);
+    } finally {
+      FileUtil.chmod(dir.toString(), "755");
+    }
+  }
 }
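
The tolerance the new test exercises is the standard dfs.datanode.failed.volumes.tolerated setting, which the existing test helper sets through DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY. A minimal sketch of configuring it programmatically; the wrapper class is hypothetical, while the key and the Configuration API are the real ones used in the diff above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    // Hypothetical helper, not part of this commit: builds a DataNode
    // configuration that tolerates one failed volume at startup.
    public class ToleratedVolumeConf {
      public static Configuration oneFailedVolumeTolerated() {
        Configuration conf = new Configuration();
        // With HDFS-2186, a DataNode started with one bad data dir under
        // this setting comes up and reports one failed volume to the NN,
        // instead of reporting zero failures as before.
        conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
        return conf;
      }
    }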