From 3d58c7a7006991b46efe8a8f60b244f4f85b481a Mon Sep 17 00:00:00 2001
From: Ravi Prakash
Date: Mon, 20 Jul 2015 14:03:34 -0700
Subject: [PATCH] HDFS-8344. NameNode doesn't recover lease for files with
 missing blocks (raviprak)

(cherry picked from commit e4f756260f16156179ba4adad974ec92279c2fac)
---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |  3 +
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java |  3 +
 .../BlockInfoUnderConstruction.java           | 19 ++++-
 .../server/blockmanagement/BlockManager.java  | 14 +++-
 .../hdfs/server/namenode/FSNamesystem.java    | 10 +++
 .../src/main/resources/hdfs-default.xml       |  9 +++
 .../apache/hadoop/hdfs/TestLeaseRecovery.java | 78 +++++++++++++++++++
 7 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index bc01dde076f..c4ce0095747 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -714,6 +714,9 @@ Release 2.8.0 - UNRELEASED
     HDFS-8778. TestBlockReportRateLimiting#testLeaseExpiration can deadlock.
     (Arpit Agarwal)
 
+    HDFS-8344. NameNode doesn't recover lease for files with missing blocks
+    (raviprak)
+
 Release 2.7.2 - UNRELEASED
 
   INCOMPATIBLE CHANGES
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index 0fafadedc07..37e2d3dec40 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -440,6 +440,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long    DFS_CACHEREPORT_INTERVAL_MSEC_DEFAULT = 10 * 1000;
   public static final String  DFS_BLOCK_INVALIDATE_LIMIT_KEY = "dfs.block.invalidate.limit";
   public static final int     DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT = 1000;
+  public static final String  DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS = "dfs.block.uc.max.recovery.attempts";
+  public static final int     DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS_DEFAULT = 5;
+
   public static final String  DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED_KEY = "dfs.corruptfilesreturned.max";
   public static final int     DFS_DEFAULT_MAX_CORRUPT_FILES_RETURNED = 500;
   /* Maximum number of blocks to process for initializing replication queues */
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoUnderConstruction.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoUnderConstruction.java
index 9cd3987a5cb..28f1633e92d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoUnderConstruction.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoUnderConstruction.java
@@ -18,7 +18,6 @@
 package org.apache.hadoop.hdfs.server.blockmanagement;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
@@ -61,6 +60,11 @@ public abstract class BlockInfoUnderConstruction extends BlockInfo {
    */
   protected Block truncateBlock;
 
+  /** The number of times all replicas will be used to attempt recovery before
+   * giving up and marking the block under construction missing.
+   */
+  private int recoveryAttemptsBeforeMarkingBlockMissing;
+
   /**
    * ReplicaUnderConstruction contains information about replicas while
    * they are under construction.
@@ -174,6 +178,8 @@ public abstract class BlockInfoUnderConstruction extends BlockInfo {
         "BlockInfoUnderConstruction cannot be in COMPLETE state");
     this.blockUCState = state;
     setExpectedLocations(targets);
+    this.recoveryAttemptsBeforeMarkingBlockMissing =
+      BlockManager.getMaxBlockUCRecoveries();
   }
 
   /** Set expected locations. */
@@ -271,7 +277,7 @@ public abstract class BlockInfoUnderConstruction extends BlockInfo {
     if (replicas.size() == 0) {
       NameNode.blockStateChangeLog.warn("BLOCK* " +
         "BlockInfoUnderConstruction.initLeaseRecovery: " +
-        "No blocks found, lease removed.");
+        "No replicas found.");
     }
     boolean allLiveReplicasTriedAsPrimary = true;
     for (int i = 0; i < replicas.size(); i++) {
@@ -283,6 +289,11 @@ public abstract class BlockInfoUnderConstruction extends BlockInfo {
       }
     }
     if (allLiveReplicasTriedAsPrimary) {
+      recoveryAttemptsBeforeMarkingBlockMissing--;
+      NameNode.blockStateChangeLog.info("Tried to recover " + this +" using all"
+          + " replicas. Will try " + recoveryAttemptsBeforeMarkingBlockMissing
+          + " more times");
+
       // Just set all the replicas to be chosen whether they are alive or not.
       for (int i = 0; i < replicas.size(); i++) {
         replicas.get(i).setChosenAsPrimary(false);
@@ -341,6 +352,10 @@ public abstract class BlockInfoUnderConstruction extends BlockInfo {
     replicas.add(new ReplicaUnderConstruction(block, storage, rState));
   }
 
+  public int getNumRecoveryAttemptsLeft() {
+    return recoveryAttemptsBeforeMarkingBlockMissing;
+  }
+
   /**
    * Convert an under construction block to a complete block.
    *
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 3f461e0e5bd..68a3fd4145d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -17,7 +17,6 @@
  */
 package org.apache.hadoop.hdfs.server.blockmanagement;
 
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
 import static org.apache.hadoop.util.ExitUtil.terminate;
 
 import java.io.IOException;
@@ -273,6 +272,13 @@ public class BlockManager implements BlockStatsMXBean {
   private BlockPlacementPolicy blockplacement;
   private final BlockStoragePolicySuite storagePolicySuite;
 
+  /** The number of times a block under construction's recovery will be
+   * attempted using all known replicas. e.g. if there are 3 replicas, each
+   * node will be tried 5 times (for a total of 15 retries across all nodes)*/
+  private static int maxBlockUCRecoveries =
+    DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS_DEFAULT;
+  public static int getMaxBlockUCRecoveries() { return maxBlockUCRecoveries; }
+
   /** Check whether name system is running before terminating */
   private boolean checkNSRunning = true;
 
@@ -281,6 +287,9 @@ public class BlockManager implements BlockStatsMXBean {
     this.namesystem = namesystem;
     datanodeManager = new DatanodeManager(this, namesystem, conf);
     heartbeatManager = datanodeManager.getHeartbeatManager();
+    maxBlockUCRecoveries = conf.getInt(
+      DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS,
+      DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS_DEFAULT);
 
     startupDelayBlockDeletionInMs = conf.getLong(
       DFSConfigKeys.DFS_NAMENODE_STARTUP_DELAY_BLOCK_DELETION_SEC_KEY,
@@ -723,7 +732,8 @@ public class BlockManager implements BlockStatsMXBean {
   /**
    * Force the given block in the given file to be marked as complete,
    * regardless of whether enough replicas are present. This is necessary
-   * when tailing edit logs as a Standby.
+   * when tailing edit logs as a Standby or when recovering a lease on a file
+   * with missing blocks.
    */
   public BlockInfo forceCompleteBlock(final BlockCollection bc,
       final BlockInfoUnderConstruction block) throws IOException {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index a1bbedd8375..d2eca44363e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -3284,6 +3284,16 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
           + "Removed empty last block and closed file.");
       return true;
     }
+
+    //If the block's recovery has been attempted enough times, mark the block
+    //complete anyway and recover the lease
+    if(uc.getNumRecoveryAttemptsLeft() == 0) {
+      blockManager.forceCompleteBlock(pendingFile, uc);
+      finalizeINodeFileUnderConstruction(src, pendingFile,
+          iip.getLatestSnapshotId());
+      return true;
+    }
+
     // start recovery of the last block for this file
     long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
     lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index 0dff1b3eea8..7b8b5a07aa0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -424,6 +424,15 @@
   <description>The lifetime of access tokens in minutes.</description>
 </property>
 
+<property>
+  <name>dfs.block.uc.max.recovery.attempts</name>
+  <value>5</value>
+  <description>The number of times a block under construction's recovery will be
+  attempted using all known replicas. e.g. if there are 3 replicas, each node
+  will be tried 5 times (for a total of 15 retries across all nodes).
+  </description>
+</property>
+
 <property>
   <name>dfs.datanode.data.dir</name>
   <value>file://${hadoop.tmp.dir}/dfs/data</value>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index c9f3842d8b6..c9448acb115 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -27,6 +27,7 @@ import java.util.EnumSet;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CreateFlag;
+import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -258,4 +259,81 @@ public class TestLeaseRecovery {
       }
     }
   }
+
+  /**
+   * Test that when a client was writing to a file and died, and before the
+   * lease can be recovered, all the datanodes to which the file was written
+   * also die, after some time (5 * lease recovery times) the file is indeed
+   * closed and lease recovered.
+   * We also check that if the datanode came back after some time, the data
+   * originally written is not truncated
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Test
+  public void testLeaseRecoveryWithMissingBlocks()
+      throws IOException, InterruptedException {
+    Configuration conf = new HdfsConfiguration();
+
+    //Start a cluster with 3 datanodes
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    cluster.setLeasePeriod(LEASE_PERIOD, LEASE_PERIOD);
+    cluster.waitActive();
+
+    //create a file (with replication 1)
+    Path file = new Path("/testRecoveryFile");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+    FSDataOutputStream out = dfs.create(file, (short) 1);
+
+    //This keeps count of the number of bytes written (AND is also the data we
+    //are writing)
+    long writtenBytes = 0;
+    while (writtenBytes < 2 * 1024 * 1024) {
+      out.writeLong(writtenBytes);
+      writtenBytes += 8;
+    }
+    System.out.println("Written " + writtenBytes + " bytes");
+    out.hsync();
+    System.out.println("hsynced the data");
+
+    //Kill the datanode to which the file was written.
+    DatanodeInfo dn =
+        ((DFSOutputStream) out.getWrappedStream()).getPipeline()[0];
+    DataNodeProperties dnStopped = cluster.stopDataNode(dn.getName());
+
+    //Wait at most 20 seconds for the lease to be recovered
+    LeaseManager lm = NameNodeAdapter.getLeaseManager(cluster.getNamesystem());
+    int i = 40;
+    while(i-- > 0 && lm.countLease() != 0) {
+      System.out.println("Still got " + lm.countLease() + " lease(s)");
+      Thread.sleep(500);
+    }
+    assertTrue("The lease was not recovered", lm.countLease() == 0);
+    System.out.println("Got " + lm.countLease() + " leases");
+
+    //Make sure we can't read any data because the datanode is dead
+    FSDataInputStream in = dfs.open(file);
+    try {
+      in.readLong();
+      assertTrue("Shouldn't have reached here", false);
+    } catch(BlockMissingException bme) {
+      System.out.println("Correctly got BlockMissingException because datanode"
+          + " is still dead");
+    }
+
+    //Bring the dead datanode back.
+    cluster.restartDataNode(dnStopped);
+    System.out.println("Restart datanode");
+
+    //Make sure we can read all the data back (since we hsync'ed).
+    in = dfs.open(file);
+    int readBytes = 0;
+    while(in.available() != 0) {
+      assertEquals("Didn't read the data we wrote", in.readLong(), readBytes);
+      readBytes += 8;
+    }
+    assertEquals("Didn't get all the data", readBytes, writtenBytes);
+    System.out.println("Read back all the " + readBytes + " bytes");
+  }
 }
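Usage note (illustrative sketch, not part of the patch): BlockManager reads dfs.block.uc.max.recovery.attempts from its Configuration at construction time, so the new limit can be tuned either in hdfs-site.xml or programmatically on the Configuration used to start the NameNode (for example, the MiniDFSCluster in the test above). The class name below and the value 3 are hypothetical; setInt/getInt are the standard Hadoop Configuration accessors.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class BlockUCRecoveryAttemptsConfigExample {
      public static void main(String[] args) {
        // Configuration that would be handed to the NameNode (or a MiniDFSCluster).
        Configuration conf = new HdfsConfiguration();

        // Example only: allow 3 full passes over the known replicas before the last
        // block of an abandoned file is force-completed during lease recovery
        // (the patch's default is 5).
        conf.setInt(DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS, 3);

        // BlockManager's constructor performs the equivalent of this lookup.
        int attempts = conf.getInt(
            DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS,
            DFSConfigKeys.DFS_BLOCK_UC_MAX_RECOVERY_ATTEMPTS_DEFAULT);
        System.out.println("dfs.block.uc.max.recovery.attempts = " + attempts);
      }
    }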