HDFS-6231. DFSClient hangs infinitely if using hedged reads and all eligible datanodes die. Contributed by Chris Nauroth.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1586551 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris Nauroth 2014-04-11 03:48:29 +00:00
parent dc84800184
commit f8904ad299
3 changed files with 19 additions and 10 deletions

View File

@@ -372,6 +372,9 @@ Release 2.4.1 - UNRELEASED
     HDFS-6208. DataNode caching can leak file descriptors. (cnauroth)

+    HDFS-6231. DFSClient hangs infinitely if using hedged reads and all eligible
+    datanodes die. (cnauroth)
+
 Release 2.4.0 - 2014-04-07

   INCOMPATIBLE CHANGES

View File

@@ -983,12 +983,15 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
     return new Callable<ByteBuffer>() {
       @Override
       public ByteBuffer call() throws Exception {
-        byte[] buf = bb.array();
-        int offset = bb.position();
-        actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
-            corruptedBlockMap);
-        latch.countDown();
-        return bb;
+        try {
+          byte[] buf = bb.array();
+          int offset = bb.position();
+          actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
+              corruptedBlockMap);
+          return bb;
+        } finally {
+          latch.countDown();
+        }
       }
     };
   }
@@ -1101,7 +1104,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       long end, byte[] buf, int offset,
       Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
       throws IOException {
-    ArrayList<Future<ByteBuffer>> futures = null;
+    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
     ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
     ByteBuffer bb = null;
     int len = (int) (end - start + 1);
@@ -1112,7 +1115,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
       DNAddrPair chosenNode = null;
       Future<ByteBuffer> future = null;
       // futures is null if there is no request already executing.
-      if (futures == null) {
+      if (futures.isEmpty()) {
         // chooseDataNode is a commitment. If no node, we go to
         // the NN to reget block locations. Only go here on first read.
         chosenNode = chooseDataNode(block, ignored);
@@ -1130,7 +1133,6 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
           // Ignore this node on next go around.
           ignored.add(chosenNode.info);
           dfsClient.getHedgedReadMetrics().incHedgedReadOps();
-          futures = new ArrayList<Future<ByteBuffer>>();
           futures.add(future);
           continue; // no need to refresh block locations
         } catch (InterruptedException e) {

View File

@@ -237,7 +237,7 @@ public class TestPread {
   public void testHedgedPreadDFSBasic() throws IOException {
     Configuration conf = new Configuration();
     conf.setInt(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THREADPOOL_SIZE, 5);
-    conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 100);
+    conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 1);
     dfsPreadTest(conf, false, true); // normal pread
     dfsPreadTest(conf, true, true); // trigger read code path without
                                     // transferTo.
@@ -273,6 +273,10 @@ public class TestPread {
     DistributedFileSystem fileSys = cluster.getFileSystem();
     DFSClient dfsClient = fileSys.getClient();
     DFSHedgedReadMetrics metrics = dfsClient.getHedgedReadMetrics();
+    // Metrics instance is static, so we need to reset counts from prior tests.
+    metrics.hedgedReadOps.set(0);
+    metrics.hedgedReadOpsWin.set(0);
+    metrics.hedgedReadOpsInCurThread.set(0);
     try {
       Path file1 = new Path("hedgedReadMaxOut.dat");