HDFS-10220. A large number of expired leases can make namenode unresponsive and cause failover (Nicolas Fraison via raviprak)

(cherry picked from commit ae047655f4)
(cherry picked from commit 9c5f7f290e)
This commit is contained in:
Ravi Prakash 2016-06-08 13:44:22 -07:00
parent c5dafb4ad7
commit 588ea6d431
6 changed files with 94 additions and 22 deletions

View File

@ -365,6 +365,16 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_NAMENODE_MAX_XATTR_SIZE_DEFAULT = 16384; public static final int DFS_NAMENODE_MAX_XATTR_SIZE_DEFAULT = 16384;
public static final int DFS_NAMENODE_MAX_XATTR_SIZE_HARD_LIMIT = 32768; public static final int DFS_NAMENODE_MAX_XATTR_SIZE_HARD_LIMIT = 32768;
/** Configuration key: interval, in milliseconds, between lease checks. */
public static final String DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY =
"dfs.namenode.lease-recheck-interval-ms";
/** Default lease recheck interval, in milliseconds. */
public static final long DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_DEFAULT =
2000;
/**
 * Configuration key: maximum time, in milliseconds, the lock may be held
 * while releasing leases.
 */
public static final String
DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_KEY =
"dfs.namenode.max-lock-hold-to-release-lease-ms";
/** Default maximum lock hold time for releasing leases, in milliseconds. */
public static final long
DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT = 25;
public static final String DFS_UPGRADE_DOMAIN_FACTOR = "dfs.namenode.upgrade.domain.factor"; public static final String DFS_UPGRADE_DOMAIN_FACTOR = "dfs.namenode.upgrade.domain.factor";
public static final int DFS_UPGRADE_DOMAIN_FACTOR_DEFAULT = DFS_REPLICATION_DEFAULT; public static final int DFS_UPGRADE_DOMAIN_FACTOR_DEFAULT = DFS_REPLICATION_DEFAULT;

View File

@ -355,7 +355,6 @@ public interface HdfsServerConstants {
} }
String NAMENODE_LEASE_HOLDER = "HDFS_NameNode"; String NAMENODE_LEASE_HOLDER = "HDFS_NameNode";
long NAMENODE_LEASE_RECHECK_INTERVAL = 2000;
String CRYPTO_XATTR_ENCRYPTION_ZONE = String CRYPTO_XATTR_ENCRYPTION_ZONE =
"raw.hdfs.crypto.encryption.zone"; "raw.hdfs.crypto.encryption.zone";

View File

@ -82,6 +82,10 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DAT
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
@ -383,7 +387,12 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
private final UserGroupInformation fsOwner; private final UserGroupInformation fsOwner;
private final String supergroup; private final String supergroup;
private final boolean standbyShouldCheckpoint; private final boolean standbyShouldCheckpoint;
/**
 * Interval, in milliseconds, between each check of leases to release.
 * Read from {@code dfs.namenode.lease-recheck-interval-ms}.
 */
private final long leaseRecheckIntervalMs;
/**
 * Maximum time, in milliseconds, the lock is held while releasing leases.
 * Read from {@code dfs.namenode.max-lock-hold-to-release-lease-ms}.
 */
private final long maxLockHoldToReleaseLeaseMs;
// Scan interval is not configurable. // Scan interval is not configurable.
private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL = private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS); TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
@ -819,6 +828,13 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY, DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_KEY,
DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT); DFSConfigKeys.DFS_NAMENODE_EDEKCACHELOADER_INTERVAL_MS_DEFAULT);
this.leaseRecheckIntervalMs = conf.getLong(
DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY,
DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_DEFAULT);
this.maxLockHoldToReleaseLeaseMs = conf.getLong(
DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_KEY,
DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT);
// For testing purposes, allow the DT secret manager to be started regardless // For testing purposes, allow the DT secret manager to be started regardless
// of whether security is enabled. // of whether security is enabled.
alwaysUseDelegationTokensForTests = conf.getBoolean( alwaysUseDelegationTokensForTests = conf.getBoolean(
@ -863,6 +879,16 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
return retryCache; return retryCache;
} }
/**
 * Exposes the configured pause between two lease checks.
 *
 * @return the lease recheck interval, in milliseconds
 */
@VisibleForTesting
public long getLeaseRecheckIntervalMs() {
  return this.leaseRecheckIntervalMs;
}
/**
 * Exposes the configured limit on how long the lock is held to release
 * leases.
 *
 * @return the maximum lock hold time, in milliseconds
 */
@VisibleForTesting
public long getMaxLockHoldToReleaseLeaseMs() {
  return this.maxLockHoldToReleaseLeaseMs;
}
void lockRetryCache() { void lockRetryCache() {
if (retryCache != null) { if (retryCache != null) {
retryCache.lock(); retryCache.lock();
@ -3120,9 +3146,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
if(nrCompleteBlocks == nrBlocks) { if(nrCompleteBlocks == nrBlocks) {
finalizeINodeFileUnderConstruction(src, pendingFile, finalizeINodeFileUnderConstruction(src, pendingFile,
iip.getLatestSnapshotId(), false); iip.getLatestSnapshotId(), false);
NameNode.stateChangeLog.warn("BLOCK*" NameNode.stateChangeLog.warn("BLOCK*" +
+ " internalReleaseLease: All existing blocks are COMPLETE," " internalReleaseLease: All existing blocks are COMPLETE," +
+ " lease removed, file closed."); " lease removed, file " + src + " closed.");
return true; // closed! return true; // closed!
} }
@ -3159,9 +3185,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
blockManager.checkMinReplication(lastBlock)) { blockManager.checkMinReplication(lastBlock)) {
finalizeINodeFileUnderConstruction(src, pendingFile, finalizeINodeFileUnderConstruction(src, pendingFile,
iip.getLatestSnapshotId(), false); iip.getLatestSnapshotId(), false);
NameNode.stateChangeLog.warn("BLOCK*" NameNode.stateChangeLog.warn("BLOCK*" +
+ " internalReleaseLease: Committed blocks are minimally replicated," " internalReleaseLease: Committed blocks are minimally" +
+ " lease removed, file closed."); " replicated, lease removed, file" + src + " closed.");
return true; // closed! return true; // closed!
} }
// Cannot close file right now, since some blocks // Cannot close file right now, since some blocks
@ -3204,7 +3230,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
finalizeINodeFileUnderConstruction(src, pendingFile, finalizeINodeFileUnderConstruction(src, pendingFile,
iip.getLatestSnapshotId(), false); iip.getLatestSnapshotId(), false);
NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: " NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
+ "Removed empty last block and closed file."); + "Removed empty last block and closed file " + src);
return true; return true;
} }
// start recovery of the last block for this file // start recovery of the last block for this file

View File

@ -336,7 +336,7 @@ public class LeaseManager {
} }
} }
Thread.sleep(HdfsServerConstants.NAMENODE_LEASE_RECHECK_INTERVAL); Thread.sleep(fsnamesystem.getLeaseRecheckIntervalMs());
} catch(InterruptedException ie) { } catch(InterruptedException ie) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(name + " is interrupted", ie); LOG.debug(name + " is interrupted", ie);
@ -356,8 +356,11 @@ public class LeaseManager {
boolean needSync = false; boolean needSync = false;
assert fsnamesystem.hasWriteLock(); assert fsnamesystem.hasWriteLock();
while(!sortedLeases.isEmpty() && sortedLeases.peek().expiredHardLimit()) { long start = monotonicNow();
Lease leaseToCheck = sortedLeases.poll();
while(!sortedLeases.isEmpty() && sortedLeases.peek().expiredHardLimit()
&& !isMaxLockHoldToReleaseLease(start)) {
Lease leaseToCheck = sortedLeases.peek();
LOG.info(leaseToCheck + " has expired hard limit"); LOG.info(leaseToCheck + " has expired hard limit");
final List<Long> removing = new ArrayList<>(); final List<Long> removing = new ArrayList<>();
@ -397,6 +400,11 @@ public class LeaseManager {
+ leaseToCheck, e); + leaseToCheck, e);
removing.add(id); removing.add(id);
} }
if (isMaxLockHoldToReleaseLease(start)) {
LOG.debug("Breaking out of checkLeases after " +
fsnamesystem.getMaxLockHoldToReleaseLeaseMs() + "ms.");
break;
}
} }
for(Long id : removing) { for(Long id : removing) {
@ -407,6 +415,13 @@ public class LeaseManager {
return needSync; return needSync;
} }
/**
 * Tells whether the time elapsed since {@code start} exceeds the configured
 * maximum lock hold time for releasing leases.
 *
 * @param start a {@code monotonicNow()} reading taken when the work began
 * @return true if max lock hold is reached
 */
private boolean isMaxLockHoldToReleaseLease(long start) {
  final long elapsedMs = monotonicNow() - start;
  final long limitMs = fsnamesystem.getMaxLockHoldToReleaseLeaseMs();
  return elapsedMs > limitMs;
}
@Override @Override
public synchronized String toString() { public synchronized String toString() {
return getClass().getSimpleName() + "= {" return getClass().getSimpleName() + "= {"

View File

@ -2594,6 +2594,24 @@
</description> </description>
</property> </property>
<property>
<name>dfs.namenode.lease-recheck-interval-ms</name>
<value>2000</value>
<description>The interval in milliseconds between successive checks by the
namenode for expired leases that need to be released.
</description>
</property>
<property>
<name>dfs.namenode.max-lock-hold-to-release-lease-ms</name>
<value>25</value>
<description>During the release of leases a lock is held that blocks all
other operations on the namenode. To avoid blocking them for too long, the
namenode stops releasing leases once this maximum lock hold time, in
milliseconds, has been exceeded.
</description>
</property>
<property> <property>
<name>dfs.namenode.startup.delay.block.deletion.sec</name> <name>dfs.namenode.startup.delay.block.deletion.sec</name>
<value>0</value> <value>0</value>

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.namenode;
import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull; import static org.junit.Assert.assertNull;
@ -39,6 +40,8 @@ public class TestLeaseManager {
@Rule @Rule
public Timeout timeout = new Timeout(300000); public Timeout timeout = new Timeout(300000);
public static long maxLockHoldToReleaseLeaseMs = 100;
@Test @Test
public void testRemoveLeases() throws Exception { public void testRemoveLeases() throws Exception {
FSNamesystem fsn = mock(FSNamesystem.class); FSNamesystem fsn = mock(FSNamesystem.class);
@ -57,28 +60,28 @@ public class TestLeaseManager {
assertEquals(0, lm.getINodeIdWithLeases().size()); assertEquals(0, lm.getINodeIdWithLeases().size());
} }
/** Check that even if LeaseManager.checkLease is not able to relinquish /** Check that LeaseManager.checkLease release some leases
* leases, the Namenode does't enter an infinite loop while holding the FSN
* write lock and thus become unresponsive
*/ */
@Test @Test
public void testCheckLeaseNotInfiniteLoop() { public void testCheckLease() {
LeaseManager lm = new LeaseManager(makeMockFsNameSystem()); LeaseManager lm = new LeaseManager(makeMockFsNameSystem());
long numLease = 100;
//Make sure the leases we are going to add exceed the hard limit //Make sure the leases we are going to add exceed the hard limit
lm.setLeasePeriod(0, 0); lm.setLeasePeriod(0, 0);
//Add some leases to the LeaseManager for (long i = 0; i <= numLease - 1; i++) {
lm.addLease("holder1", INodeId.ROOT_INODE_ID + 1); //Add some leases to the LeaseManager
lm.addLease("holder2", INodeId.ROOT_INODE_ID + 2); lm.addLease("holder"+i, INodeId.ROOT_INODE_ID + i);
lm.addLease("holder3", INodeId.ROOT_INODE_ID + 3); }
assertEquals(lm.countLease(), 3); assertEquals(numLease, lm.countLease());
//Initiate a call to checkLease. This should exit within the test timeout //Initiate a call to checkLease. This should exit within the test timeout
lm.checkLeases(); lm.checkLeases();
assertTrue(lm.countLease() < numLease);
} }
@Test @Test
public void testCountPath() { public void testCountPath() {
LeaseManager lm = new LeaseManager(makeMockFsNameSystem()); LeaseManager lm = new LeaseManager(makeMockFsNameSystem());
@ -112,6 +115,7 @@ public class TestLeaseManager {
when(fsn.isRunning()).thenReturn(true); when(fsn.isRunning()).thenReturn(true);
when(fsn.hasWriteLock()).thenReturn(true); when(fsn.hasWriteLock()).thenReturn(true);
when(fsn.getFSDirectory()).thenReturn(dir); when(fsn.getFSDirectory()).thenReturn(dir);
when(fsn.getMaxLockHoldToReleaseLeaseMs()).thenReturn(maxLockHoldToReleaseLeaseMs);
return fsn; return fsn;
} }