From 865650052b07c8a20d51306202354ac770ed36d5 Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Thu, 16 Aug 2018 16:29:38 -0700 Subject: [PATCH] HDFS-10240. Race between close/recoverLease leads to missing block. Contributed by Jinglun, zhouyingchao and Wei-Chiu Chuang. --- .../server/blockmanagement/BlockInfo.java | 4 ++ .../server/blockmanagement/BlockManager.java | 4 ++ .../hdfs/server/datanode/BPServiceActor.java | 3 +- .../hadoop/hdfs/server/datanode/DataNode.java | 10 +++ .../hadoop/hdfs/TestLeaseRecovery2.java | 65 +++++++++++++++++++ .../server/datanode/DataNodeTestUtils.java | 3 + 6 files changed, 88 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java index 111ade10bc3..43f4f476bbc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java @@ -262,6 +262,10 @@ public abstract class BlockInfo extends Block return getBlockUCState().equals(BlockUCState.COMPLETE); } + public boolean isUnderRecovery() { + return getBlockUCState().equals(BlockUCState.UNDER_RECOVERY); + } + public final boolean isCompleteOrCommitted() { final BlockUCState state = getBlockUCState(); return state.equals(BlockUCState.COMPLETE) || diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index bac89bfd64c..6ab237f9b5a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -971,6 +971,10 @@ public class BlockManager implements BlockStatsMXBean { return false; // no blocks in file yet if(lastBlock.isComplete()) return false; // already completed (e.g. by syncBlock) + if(lastBlock.isUnderRecovery()) { + throw new IOException("Commit or complete block " + commitBlock + + ", whereas it is under recovery."); + } final boolean committed = commitBlock(lastBlock, commitBlock); if (committed && lastBlock.isStriped()) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java index a94d2df4315..6c167f4f757 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java @@ -682,7 +682,8 @@ class BPServiceActor implements Runnable { } } } - if (ibrManager.sendImmediately() || sendHeartbeat) { + if (!dn.areIBRDisabledForTests() && + (ibrManager.sendImmediately()|| sendHeartbeat)) { ibrManager.sendIBRs(bpNamenode, bpRegistration, bpos.getBlockPoolId()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 482335895d1..ade2b115971 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -331,6 +331,7 @@ public class DataNode extends ReconfigurableBase ThreadGroup threadGroup = null; private DNConf dnConf; private volatile boolean heartbeatsDisabledForTests = false; + private volatile 
boolean ibrDisabledForTests = false; private volatile boolean cacheReportsDisabledForTests = false; private DataStorage storage = null; @@ -1334,6 +1335,15 @@ public class DataNode extends ReconfigurableBase } @VisibleForTesting + void setIBRDisabledForTest(boolean disabled) { + this.ibrDisabledForTests = disabled; + } + + @VisibleForTesting + boolean areIBRDisabledForTests() { + return this.ibrDisabledForTests; + } + void setCacheReportsDisabledForTest(boolean disabled) { this.cacheReportsDisabledForTests = disabled; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery2.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery2.java index a96d8b3b431..940e13e6512 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery2.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery2.java @@ -25,6 +25,7 @@ import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.spy; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -163,6 +164,70 @@ public class TestLeaseRecovery2 { verifyFile(dfs, filepath1, actual, size); } + @Test + public void testCloseWhileRecoverLease() throws Exception { + // test recoverLease + // set the soft limit to be 1 hour but recoverLease should + // close the file immediately + cluster.setLeasePeriod(LONG_LEASE_PERIOD, LONG_LEASE_PERIOD); + int size = AppendTestUtil.nextInt(FILE_SIZE); + String filestr = "/testCloseWhileRecoverLease"; + + AppendTestUtil.LOG.info("filestr=" + filestr); + Path filepath = new Path(filestr); + FSDataOutputStream stm = dfs.create(filepath, true, BUF_SIZE, + REPLICATION_NUM, BLOCK_SIZE); + assertTrue(dfs.dfs.exists(filestr)); + + // hflush file + AppendTestUtil.LOG.info("hflush"); + stm.hflush(); + + // Pause DN block report. 
+ // Let client recover lease, and then close the file, and then let DN + report blocks. + ArrayList<DataNode> dataNodes = cluster.getDataNodes(); + for (DataNode dn: dataNodes) { + DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true); + } + + LOG.info("pause IBR"); + for (DataNode dn: dataNodes) { + DataNodeTestUtils.pauseIBR(dn); + } + + AppendTestUtil.LOG.info("size=" + size); + stm.write(buffer, 0, size); + + // hflush file + AppendTestUtil.LOG.info("hflush"); + stm.hflush(); + + LOG.info("recover lease"); + dfs.recoverLease(filepath); + try { + stm.close(); + fail("close() should fail because the file is under recovery."); + } catch (IOException ioe) { + GenericTestUtils.assertExceptionContains( + "whereas it is under recovery", ioe); + } + + for (DataNode dn: dataNodes) { + DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false); + } + + LOG.info("trigger heartbeats"); + // resume DN block report + for (DataNode dn: dataNodes) { + DataNodeTestUtils.triggerHeartbeat(dn); + } + + stm.close(); + assertEquals(cluster.getNamesystem().getBlockManager(). 
+ getMissingBlocksCount(), 0); + } + @Test public void testLeaseRecoverByAnotherUser() throws Exception { byte [] actual = new byte[FILE_SIZE]; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java index 19d9dfcd47d..25eca88ae45 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java @@ -98,6 +98,9 @@ public class DataNodeTestUtils { } } + public static void pauseIBR(DataNode dn) { + dn.setIBRDisabledForTest(true); + } public static InterDatanodeProtocol createInterDatanodeProtocolProxy( DataNode dn, DatanodeID datanodeid, final Configuration conf, boolean connectToDnViaHostname) throws IOException {