HDFS-10330. Add Corrupt Blocks Information in Metasave output. Contributed by Kuhu Shukla.

Kihwal Lee 2016-04-27 08:19:48 -05:00
parent 66b07d8374
commit 919a1d824a
4 changed files with 116 additions and 1 deletion

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

@@ -109,6 +109,7 @@ import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.LightWeightGSet;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo;
@@ -612,6 +613,48 @@ public class BlockManager implements BlockStatsMXBean {
    // Dump blocks that are waiting to be deleted
    invalidateBlocks.dump(out);

    // Dump corrupt blocks and their storageIDs
    Set<Block> corruptBlocks = corruptReplicas.getCorruptBlocks();
    out.println("Corrupt Blocks:");
    for (Block block : corruptBlocks) {
      Collection<DatanodeDescriptor> corruptNodes =
          corruptReplicas.getNodes(block);
      if (corruptNodes == null) {
        LOG.warn(block.getBlockId() +
            " is corrupt but has no associated node.");
        continue;
      }
      int numNodesToFind = corruptNodes.size();
      for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
        DatanodeDescriptor node = storage.getDatanodeDescriptor();
        if (corruptNodes.contains(node)) {
          String storageId = storage.getStorageID();
          DatanodeStorageInfo storageInfo = node.getStorageInfo(storageId);
          State state = (storageInfo == null) ? null : storageInfo.getState();
          out.println("Block=" + block.getBlockId() + "\tNode=" + node.getName()
              + "\tStorageID=" + storageId + "\tStorageState=" + state
              + "\tTotalReplicas=" + blocksMap.numNodes(block)
              + "\tReason=" + corruptReplicas.getCorruptReason(block, node));
          numNodesToFind--;
          if (numNodesToFind == 0) {
            break;
          }
        }
      }
      if (numNodesToFind > 0) {
        String[] corruptNodesList = new String[corruptNodes.size()];
        int i = 0;
        for (DatanodeDescriptor d : corruptNodes) {
          corruptNodesList[i] = d.getHostName();
          i++;
        }
        out.println(block.getBlockId() + " corrupt on " +
            StringUtils.join(",", corruptNodesList) +
            " but not all nodes are found in its block locations");
      }
    }

    // Dump all datanodes
    getDatanodeManager().datanodeDump(out);
  }
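
For illustration, the new section prints one tab-separated line per corrupt replica that appears in the block's locations, plus a fallback line whenever some corrupt nodes are missing from those locations. With hypothetical block IDs, hostnames, and storage IDs, the output looks like:

    Corrupt Blocks:
    Block=1073741825  Node=host1.example.com:50010  StorageID=DS-1  StorageState=NORMAL  TotalReplicas=3  Reason=GENSTAMP_MISMATCH
    1073741826 corrupt on host2.example.com,host3.example.com but not all nodes are found in its block locations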

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java

@@ -20,8 +20,10 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -231,6 +233,16 @@ public class CorruptReplicasMap{
    return ret;
  }

  /**
   * Method to get the set of corrupt blocks in corruptReplicasMap.
   * @return Set of Block objects
   */
  Set<Block> getCorruptBlocks() {
    Set<Block> corruptBlocks = new HashSet<Block>();
    corruptBlocks.addAll(corruptReplicasMap.keySet());
    return corruptBlocks;
  }

  /**
   * return the reason about corrupted replica for a given block
   * on a given dn
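
A note on the accessor above: it returns a copy of the map's key set rather than a live view, so a caller such as BlockManager#metaSave can iterate the result without being affected by concurrent insertions or removals. A minimal usage sketch, assuming an in-scope CorruptReplicasMap instance named corruptReplicas as in BlockManager:

    // Point-in-time snapshot; later changes to corruptReplicasMap
    // do not show up in this set.
    Set<Block> snapshot = corruptReplicas.getCorruptBlocks();
    for (Block b : snapshot) {
      // getNodes may return null if the block was removed after the snapshot.
      Collection<DatanodeDescriptor> nodes = corruptReplicas.getNodes(b);
      int count = (nodes == null) ? 0 : nodes.size();
      System.out.println(b.getBlockId() + " corrupt on " + count + " node(s)");
    }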

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java

@@ -29,7 +29,13 @@ import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -527,6 +533,22 @@ public class TestBlockManager {
    return blockInfo;
  }

  private BlockInfo addCorruptBlockOnNodes(long blockId,
      List<DatanodeDescriptor> nodes) throws IOException {
    long inodeId = ++mockINodeId;
    final INodeFile bc = TestINodeFile.createINodeFile(inodeId);
    BlockInfo blockInfo = blockOnNodes(blockId, nodes);
    blockInfo.setReplication((short) 3);
    blockInfo.setBlockCollectionId(inodeId);
    Mockito.doReturn(bc).when(fsn).getBlockCollection(inodeId);
    bm.blocksMap.addBlockCollection(blockInfo, bc);
    bm.markBlockReplicasAsCorrupt(blockInfo, blockInfo,
        blockInfo.getGenerationStamp() + 1, blockInfo.getNumBytes(),
        new DatanodeStorageInfo[]{nodes.get(0).getStorageInfos()[0]});
    return blockInfo;
  }

  private DatanodeStorageInfo[] scheduleSingleReplication(BlockInfo block) {
    // list for priority 1
    List<BlockInfo> list_p1 = new ArrayList<>();

@@ -1129,4 +1151,40 @@ public class TestBlockManager {
      cluster.shutdown();
    }
  }

  @Test
  public void testMetaSaveCorruptBlocks() throws Exception {
    List<DatanodeStorageInfo> origStorages = getStorages(0, 1);
    List<DatanodeDescriptor> origNodes = getNodes(origStorages);
    addCorruptBlockOnNodes(0, origNodes);
    File file = new File("test.log");
    PrintWriter out = new PrintWriter(file);
    bm.metaSave(out);
    out.flush();
    FileInputStream fstream = new FileInputStream(file);
    DataInputStream in = new DataInputStream(fstream);
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    try {
      for (int i = 0; i < 6; i++) {
        reader.readLine();
      }
      String corruptBlocksLine = reader.readLine();
      assertEquals("Unexpected text in metasave, " +
          "was expecting corrupt blocks section!", 0,
          corruptBlocksLine.compareTo("Corrupt Blocks:"));
      corruptBlocksLine = reader.readLine();
      String regex = "Block=[0-9]+\\tNode=.*\\tStorageID=.*StorageState.*" +
          "TotalReplicas=.*Reason=GENSTAMP_MISMATCH";
      assertTrue("Unexpected corrupt block section in metasave!",
          corruptBlocksLine.matches(regex));
      corruptBlocksLine = reader.readLine();
      regex = "Metasave: Number of datanodes.*";
      assertTrue("Unexpected corrupt block section in metasave!",
          corruptBlocksLine.matches(regex));
    } finally {
      if (reader != null) {
        reader.close();
      }
      file.delete();
    }
  }
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestMetaSave.java

@@ -159,9 +159,11 @@ public class TestMetaSave {
    assertTrue(line.equals("Metasave: Blocks being reconstructed: 0"));
    line = reader.readLine();
    assertTrue(line.equals("Metasave: Blocks 2 waiting deletion from 1 datanodes."));
    // skip 2 lines to reach HDFS-9033 scenario.
    line = reader.readLine();
    line = reader.readLine();
    // skip 1 line for Corrupt Blocks section.
    line = reader.readLine();
    line = reader.readLine();
    assertTrue(line.equals("Metasave: Number of datanodes: 2"));
    line = reader.readLine();
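
For context, the metasave report these tests parse is the same one an operator generates with the dfsadmin CLI; a typical invocation (the filename is illustrative; the file is written to the directory given by hadoop.log.dir on the NameNode):

    hdfs dfsadmin -metasave metasave-report.txt

With this change, that report gains a "Corrupt Blocks:" section between the block-deletion dump and the datanode dump.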