diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 5be2515f3c6..68ed00517e2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -122,6 +122,9 @@ Release 2.4.1 - UNRELEASED HDFS-6208. DataNode caching can leak file descriptors. (cnauroth) + HDFS-6231. DFSClient hangs infinitely if using hedged reads and all eligible + datanodes die. (cnauroth) + Release 2.4.0 - 2014-04-07 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 756ef4b60d3..301bd4c15ab 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -983,12 +983,15 @@ private Callable getFromOneDataNode(final DNAddrPair datanode, return new Callable() { @Override public ByteBuffer call() throws Exception { - byte[] buf = bb.array(); - int offset = bb.position(); - actualGetFromOneDataNode(datanode, block, start, end, buf, offset, - corruptedBlockMap); - latch.countDown(); - return bb; + try { + byte[] buf = bb.array(); + int offset = bb.position(); + actualGetFromOneDataNode(datanode, block, start, end, buf, offset, + corruptedBlockMap); + return bb; + } finally { + latch.countDown(); + } } }; } @@ -1101,7 +1104,7 @@ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, long end, byte[] buf, int offset, Map> corruptedBlockMap) throws IOException { - ArrayList> futures = null; + ArrayList> futures = new ArrayList>(); ArrayList ignored = new ArrayList(); ByteBuffer bb = null; int len = (int) (end - start + 1); @@ -1112,7 +1115,7 @@ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, DNAddrPair chosenNode = null; Future future = null; // futures is null if there is no request already executing. - if (futures == null) { + if (futures.isEmpty()) { // chooseDataNode is a commitment. If no node, we go to // the NN to reget block locations. Only go here on first read. chosenNode = chooseDataNode(block, ignored); @@ -1130,7 +1133,6 @@ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, // Ignore this node on next go around. ignored.add(chosenNode.info); dfsClient.getHedgedReadMetrics().incHedgedReadOps(); - futures = new ArrayList>(); futures.add(future); continue; // no need to refresh block locations } catch (InterruptedException e) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java index 8bd911cbb76..5168eb5ca26 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPread.java @@ -243,7 +243,7 @@ public void testPreadDFSNoChecksum() throws IOException { public void testHedgedPreadDFSBasic() throws IOException { Configuration conf = new Configuration(); conf.setInt(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THREADPOOL_SIZE, 5); - conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 100); + conf.setLong(DFSConfigKeys.DFS_DFSCLIENT_HEDGED_READ_THRESHOLD_MILLIS, 1); dfsPreadTest(conf, false, true); // normal pread dfsPreadTest(conf, true, true); // trigger read code path without // transferTo. @@ -279,6 +279,10 @@ public Void answer(InvocationOnMock invocation) throws Throwable { DistributedFileSystem fileSys = cluster.getFileSystem(); DFSClient dfsClient = fileSys.getClient(); DFSHedgedReadMetrics metrics = dfsClient.getHedgedReadMetrics(); + // Metrics instance is static, so we need to reset counts from prior tests. + metrics.hedgedReadOps.set(0); + metrics.hedgedReadOpsWin.set(0); + metrics.hedgedReadOpsInCurThread.set(0); try { Path file1 = new Path("hedgedReadMaxOut.dat");