HDFS-11986. Dfsadmin should report erasure coding related information separately.
This commit is contained in:
parent
36bada3032
commit
b89ffcff36
|
@ -66,11 +66,13 @@ import org.apache.hadoop.hdfs.HAUtil;
|
|||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.NameNodeProxies;
|
||||
import org.apache.hadoop.hdfs.NameNodeProxiesClient.ProxyAndInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.BlocksStats;
|
||||
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
|
||||
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
|
||||
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.DatanodeLocalInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.DatanodeVolumeInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.ECBlockGroupsStats;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.RollingUpgradeAction;
|
||||
|
@ -532,16 +534,30 @@ public class DFSAdmin extends FsShell {
|
|||
* minutes. Use "-metaSave" to list of all such blocks and accurate
|
||||
* counts.
|
||||
*/
|
||||
System.out.println("Under replicated blocks: " +
|
||||
dfs.getLowRedundancyBlocksCount());
|
||||
System.out.println("Blocks with corrupt replicas: " +
|
||||
dfs.getCorruptBlocksCount());
|
||||
System.out.println("Missing blocks: " +
|
||||
dfs.getMissingBlocksCount());
|
||||
System.out.println("Missing blocks (with replication factor 1): " +
|
||||
dfs.getMissingReplOneBlocksCount());
|
||||
System.out.println("Pending deletion blocks: " +
|
||||
dfs.getPendingDeletionBlocksCount());
|
||||
BlocksStats blocksStats = dfs.getClient().getNamenode().getBlocksStats();
|
||||
System.out.println("Replicated Blocks:");
|
||||
System.out.println("\tUnder replicated blocks: " +
|
||||
blocksStats.getLowRedundancyBlocksStat());
|
||||
System.out.println("\tBlocks with corrupt replicas: " +
|
||||
blocksStats.getCorruptBlocksStat());
|
||||
System.out.println("\tMissing blocks: " +
|
||||
blocksStats.getMissingReplicaBlocksStat());
|
||||
System.out.println("\tMissing blocks (with replication factor 1): " +
|
||||
blocksStats.getMissingReplicationOneBlocksStat());
|
||||
System.out.println("\tPending deletion blocks: " +
|
||||
blocksStats.getPendingDeletionBlocksStat());
|
||||
|
||||
ECBlockGroupsStats ecBlockGroupsStats =
|
||||
dfs.getClient().getNamenode().getECBlockGroupsStats();
|
||||
System.out.println("Erasure Coded Block Groups: ");
|
||||
System.out.println("\tLow redundancy block groups: " +
|
||||
ecBlockGroupsStats.getLowRedundancyBlockGroupsStat());
|
||||
System.out.println("\tBlock groups with corrupt internal blocks: " +
|
||||
ecBlockGroupsStats.getCorruptBlockGroupsStat());
|
||||
System.out.println("\tMissing block groups: " +
|
||||
ecBlockGroupsStats.getMissingBlockGroupsStat());
|
||||
System.out.println("\tPending deletion block groups: " +
|
||||
ecBlockGroupsStats.getPendingDeletionBlockGroupsStat());
|
||||
|
||||
System.out.println();
|
||||
|
||||
|
|
|
@ -38,11 +38,18 @@ import org.apache.hadoop.fs.Path;
|
|||
import org.apache.hadoop.hdfs.DFSClient;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||
import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
|
||||
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
|
||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
||||
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
|
||||
import org.apache.hadoop.hdfs.protocol.SystemErasureCodingPolicies;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||
import org.apache.hadoop.hdfs.server.common.Storage;
|
||||
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||
|
@ -498,79 +505,10 @@ public class TestDFSAdmin {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
@Test(timeout = 120000)
|
||||
public void testReportCommand() throws Exception {
|
||||
redirectStream();
|
||||
|
||||
/* init conf */
|
||||
final Configuration dfsConf = new HdfsConfiguration();
|
||||
dfsConf.setInt(
|
||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
|
||||
500); // 0.5s
|
||||
dfsConf.setLong(DFS_HEARTBEAT_INTERVAL_KEY, 1);
|
||||
final Path baseDir = new Path(
|
||||
PathUtils.getTestDir(getClass()).getAbsolutePath(),
|
||||
GenericTestUtils.getMethodName());
|
||||
dfsConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.toString());
|
||||
|
||||
final int numDn = 3;
|
||||
|
||||
/* init cluster */
|
||||
try(MiniDFSCluster miniCluster = new MiniDFSCluster
|
||||
.Builder(dfsConf)
|
||||
.numDataNodes(numDn).build()) {
|
||||
|
||||
miniCluster.waitActive();
|
||||
assertEquals(numDn, miniCluster.getDataNodes().size());
|
||||
|
||||
/* local vars */
|
||||
final DFSAdmin dfsAdmin = new DFSAdmin(dfsConf);
|
||||
final DFSClient client = miniCluster.getFileSystem().getClient();
|
||||
|
||||
/* run and verify report command */
|
||||
resetStream();
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn, 0, client);
|
||||
|
||||
/* shut down one DN */
|
||||
final List<DataNode> datanodes = miniCluster.getDataNodes();
|
||||
final DataNode last = datanodes.get(datanodes.size() - 1);
|
||||
last.shutdown();
|
||||
miniCluster.setDataNodeDead(last.getDatanodeId());
|
||||
|
||||
/* run and verify report command */
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn - 1, 0, client);
|
||||
|
||||
/* corrupt one block */
|
||||
final short replFactor = 1;
|
||||
final long fileLength = 512L;
|
||||
final FileSystem fs = miniCluster.getFileSystem();
|
||||
final Path file = new Path(baseDir, "/corrupted");
|
||||
DFSTestUtil.createFile(fs, file, fileLength, replFactor, 12345L);
|
||||
DFSTestUtil.waitReplication(fs, file, replFactor);
|
||||
|
||||
final ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, file);
|
||||
final int blockFilesCorrupted = miniCluster
|
||||
.corruptBlockOnDataNodes(block);
|
||||
assertEquals("Fail to corrupt all replicas for block " + block,
|
||||
replFactor, blockFilesCorrupted);
|
||||
|
||||
try {
|
||||
IOUtils.copyBytes(fs.open(file), new IOUtils.NullOutputStream(),
|
||||
conf, true);
|
||||
fail("Should have failed to read the file with corrupted blocks.");
|
||||
} catch (ChecksumException ignored) {
|
||||
// expected exception reading corrupt blocks
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase replication factor, this should invoke transfer request.
|
||||
* Receiving datanode fails on checksum and reports it to namenode
|
||||
*/
|
||||
fs.setReplication(file, (short) (replFactor + 1));
|
||||
|
||||
/* get block details and check if the block is corrupt */
|
||||
// get block details and check if the block is corrupt
|
||||
private void waitForCorruptBlock(MiniDFSCluster miniCluster,
|
||||
DFSClient client, Path file)
|
||||
throws TimeoutException, InterruptedException {
|
||||
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||
@Override
|
||||
public Boolean get() {
|
||||
|
@ -585,14 +523,152 @@ public class TestDFSAdmin {
|
|||
return blocks != null && blocks.get(0).isCorrupt();
|
||||
}
|
||||
}, 1000, 60000);
|
||||
}
|
||||
|
||||
BlockManagerTestUtil.updateState(
|
||||
miniCluster.getNameNode().getNamesystem().getBlockManager());
|
||||
@Test(timeout = 180000)
|
||||
public void testReportCommand() throws Exception {
|
||||
tearDown();
|
||||
redirectStream();
|
||||
|
||||
/* run and verify report command */
|
||||
// init conf
|
||||
final Configuration dfsConf = new HdfsConfiguration();
|
||||
ErasureCodingPolicy ecPolicy = SystemErasureCodingPolicies.getByID(
|
||||
SystemErasureCodingPolicies.XOR_2_1_POLICY_ID);
|
||||
dfsConf.set(DFSConfigKeys.DFS_NAMENODE_EC_POLICIES_ENABLED_KEY,
|
||||
ecPolicy.getName());
|
||||
dfsConf.setInt(
|
||||
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 500);
|
||||
dfsConf.setLong(DFS_HEARTBEAT_INTERVAL_KEY, 1);
|
||||
final Path baseDir = new Path(
|
||||
PathUtils.getTestDir(getClass()).getAbsolutePath(),
|
||||
GenericTestUtils.getMethodName());
|
||||
dfsConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.toString());
|
||||
final int numDn =
|
||||
ecPolicy.getNumDataUnits() + ecPolicy.getNumParityUnits();
|
||||
|
||||
try(MiniDFSCluster miniCluster = new MiniDFSCluster
|
||||
.Builder(dfsConf)
|
||||
.numDataNodes(numDn).build()) {
|
||||
|
||||
miniCluster.waitActive();
|
||||
assertEquals(numDn, miniCluster.getDataNodes().size());
|
||||
|
||||
final DFSAdmin dfsAdmin = new DFSAdmin(dfsConf);
|
||||
final DFSClient client = miniCluster.getFileSystem().getClient();
|
||||
|
||||
// Verify report command for all counts to be zero
|
||||
resetStream();
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn - 1, 1, client);
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn, 0, 0, client);
|
||||
|
||||
final short replFactor = 1;
|
||||
final long fileLength = 512L;
|
||||
final DistributedFileSystem fs = miniCluster.getFileSystem();
|
||||
final Path file = new Path(baseDir, "/corrupted");
|
||||
DFSTestUtil.createFile(fs, file, fileLength, replFactor, 12345L);
|
||||
DFSTestUtil.waitReplication(fs, file, replFactor);
|
||||
final ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, file);
|
||||
LocatedBlocks lbs = miniCluster.getFileSystem().getClient().
|
||||
getNamenode().getBlockLocations(
|
||||
file.toString(), 0, fileLength);
|
||||
assertTrue("Unexpected block type: " + lbs.get(0),
|
||||
lbs.get(0) instanceof LocatedBlock);
|
||||
LocatedBlock locatedBlock = lbs.get(0);
|
||||
DatanodeInfo locatedDataNode = locatedBlock.getLocations()[0];
|
||||
LOG.info("Replica block located on: " + locatedDataNode);
|
||||
|
||||
Path ecDir = new Path(baseDir, "ec");
|
||||
fs.mkdirs(ecDir);
|
||||
fs.getClient().setErasureCodingPolicy(ecDir.toString(),
|
||||
ecPolicy.getName());
|
||||
Path ecFile = new Path(ecDir, "ec-file");
|
||||
int stripesPerBlock = 2;
|
||||
int cellSize = ecPolicy.getCellSize();
|
||||
int blockSize = stripesPerBlock * cellSize;
|
||||
int blockGroupSize = ecPolicy.getNumDataUnits() * blockSize;
|
||||
int totalBlockGroups = 1;
|
||||
DFSTestUtil.createStripedFile(miniCluster, ecFile, ecDir,
|
||||
totalBlockGroups, stripesPerBlock, false, ecPolicy);
|
||||
|
||||
// Verify report command for all counts to be zero
|
||||
resetStream();
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn, 0, 0, client);
|
||||
|
||||
// Choose a DataNode to shutdown
|
||||
final List<DataNode> datanodes = miniCluster.getDataNodes();
|
||||
DataNode dataNodeToShutdown = null;
|
||||
for (DataNode dn : datanodes) {
|
||||
if (!dn.getDatanodeId().getDatanodeUuid().equals(
|
||||
locatedDataNode.getDatanodeUuid())) {
|
||||
dataNodeToShutdown = dn;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue("Unable to choose a DataNode to shutdown!",
|
||||
dataNodeToShutdown != null);
|
||||
|
||||
// Shut down the DataNode not hosting the replicated block
|
||||
LOG.info("Shutting down: " + dataNodeToShutdown);
|
||||
dataNodeToShutdown.shutdown();
|
||||
miniCluster.setDataNodeDead(dataNodeToShutdown.getDatanodeId());
|
||||
|
||||
// Verify report command to show dead DataNode
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn - 1, 0, 0, client);
|
||||
|
||||
// Corrupt the replicated block
|
||||
final int blockFilesCorrupted = miniCluster
|
||||
.corruptBlockOnDataNodes(block);
|
||||
assertEquals("Fail to corrupt all replicas for block " + block,
|
||||
replFactor, blockFilesCorrupted);
|
||||
|
||||
try {
|
||||
IOUtils.copyBytes(fs.open(file), new IOUtils.NullOutputStream(),
|
||||
conf, true);
|
||||
fail("Should have failed to read the file with corrupted blocks.");
|
||||
} catch (ChecksumException ignored) {
|
||||
// expected exception reading corrupt blocks
|
||||
}
|
||||
|
||||
// Increase replication factor, this should invoke transfer request.
|
||||
// Receiving datanode fails on checksum and reports it to namenode
|
||||
fs.setReplication(file, (short) (replFactor + 1));
|
||||
|
||||
// get block details and check if the block is corrupt
|
||||
BlockManagerTestUtil.updateState(
|
||||
miniCluster.getNameNode().getNamesystem().getBlockManager());
|
||||
waitForCorruptBlock(miniCluster, client, file);
|
||||
|
||||
// verify report command for corrupt replicated block
|
||||
resetStream();
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn - 1, 1, 0, client);
|
||||
|
||||
lbs = miniCluster.getFileSystem().getClient().
|
||||
getNamenode().getBlockLocations(
|
||||
ecFile.toString(), 0, blockGroupSize);
|
||||
assertTrue("Unexpected block type: " + lbs.get(0),
|
||||
lbs.get(0) instanceof LocatedStripedBlock);
|
||||
LocatedStripedBlock bg =
|
||||
(LocatedStripedBlock)(lbs.get(0));
|
||||
|
||||
miniCluster.getNamesystem().writeLock();
|
||||
try {
|
||||
BlockManager bm = miniCluster.getNamesystem().getBlockManager();
|
||||
bm.findAndMarkBlockAsCorrupt(bg.getBlock(), bg.getLocations()[0],
|
||||
"STORAGE_ID", "TEST");
|
||||
BlockManagerTestUtil.updateState(bm);
|
||||
} finally {
|
||||
miniCluster.getNamesystem().writeUnlock();
|
||||
}
|
||||
waitForCorruptBlock(miniCluster, client, file);
|
||||
|
||||
// verify report command for corrupt replicated block
|
||||
// and EC block group
|
||||
resetStream();
|
||||
assertEquals(0, ToolRunner.run(dfsAdmin, new String[] {"-report"}));
|
||||
verifyNodesAndCorruptBlocks(numDn, numDn - 1, 1, 1, client);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -669,6 +745,7 @@ public class TestDFSAdmin {
|
|||
final int numDn,
|
||||
final int numLiveDn,
|
||||
final int numCorruptBlocks,
|
||||
final int numCorruptECBlockGroups,
|
||||
final DFSClient client) throws IOException {
|
||||
|
||||
/* init vars */
|
||||
|
@ -679,11 +756,15 @@ public class TestDFSAdmin {
|
|||
final String expectedCorruptedBlocksStr = String.format(
|
||||
"Blocks with corrupt replicas: %d",
|
||||
numCorruptBlocks);
|
||||
final String expectedCorruptedECBlockGroupsStr = String.format(
|
||||
"Block groups with corrupt internal blocks: %d",
|
||||
numCorruptECBlockGroups);
|
||||
|
||||
/* verify nodes and corrupt blocks */
|
||||
// verify nodes and corrupt blocks
|
||||
assertThat(outStr, is(allOf(
|
||||
containsString(expectedLiveNodesStr),
|
||||
containsString(expectedCorruptedBlocksStr))));
|
||||
containsString(expectedCorruptedBlocksStr),
|
||||
containsString(expectedCorruptedECBlockGroupsStr))));
|
||||
|
||||
assertEquals(
|
||||
numDn,
|
||||
|
@ -694,7 +775,12 @@ public class TestDFSAdmin {
|
|||
assertEquals(
|
||||
numDn - numLiveDn,
|
||||
client.getDatanodeStorageReport(DatanodeReportType.DEAD).length);
|
||||
assertEquals(numCorruptBlocks, client.getCorruptBlocksCount());
|
||||
assertEquals(numCorruptBlocks + numCorruptECBlockGroups,
|
||||
client.getCorruptBlocksCount());
|
||||
assertEquals(numCorruptBlocks, client.getNamenode()
|
||||
.getBlocksStats().getCorruptBlocksStat());
|
||||
assertEquals(numCorruptECBlockGroups, client.getNamenode()
|
||||
.getECBlockGroupsStats().getCorruptBlockGroupsStat());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue