diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 78c2309e934..acec25195ce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -2560,6 +2560,9 @@ Release 0.23.8 - UNRELEASED HDFS-4477. Secondary namenode may retain old tokens (daryn via kihwal) + HDFS-4699. TestPipelinesFailover#testPipelineRecoveryStress fails + sporadically (Chris Nauroth via kihwal) + Release 0.23.7 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 7427e834dba..d41433b80c4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -1286,7 +1286,10 @@ public class DataNode extends Configured LOG.warn("checkDiskError: exception: ", e); if (e instanceof SocketException || e instanceof SocketTimeoutException || e instanceof ClosedByInterruptException - || e.getMessage().startsWith("Broken pipe")) { + || e.getMessage().startsWith("An established connection was aborted") + || e.getMessage().startsWith("Broken pipe") + || e.getMessage().startsWith("Connection reset") + || e.getMessage().contains("java.nio.channels.SocketChannel")) { LOG.info("Not checking disk as checkDiskError was called on a network" + " related exception"); return; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java index f7af9b03aed..bfac1afcaaa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java @@ -422,6 +422,11 @@ public class TestPipelinesFailover { // Disable permissions so that another user can recover the lease. harness.conf.setBoolean( DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false); + // This test triggers rapid NN failovers. The client retry policy uses an + // exponential backoff. This can quickly lead to long sleep times and even + // timeout the whole test. Cap the sleep time at 1s to prevent this. + harness.conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY, + 1000); final MiniDFSCluster cluster = harness.startCluster(); try { @@ -537,11 +542,10 @@ public class TestPipelinesFailover { } /** - * Try to cover the lease on the given file for up to 30 - * seconds. + * Try to recover the lease on the given file for up to 60 seconds. * @param fsOtherUser the filesystem to use for the recoverLease call * @param testPath the path on which to run lease recovery - * @throws TimeoutException if lease recover does not succeed within 30 + * @throws TimeoutException if lease recover does not succeed within 60 * seconds * @throws InterruptedException if the thread is interrupted */ @@ -564,7 +568,7 @@ public class TestPipelinesFailover { } return success; } - }, 1000, 30000); + }, 1000, 60000); } catch (TimeoutException e) { throw new TimeoutException("Timed out recovering lease for " + testPath);