HDFS-6229. Race condition in failover can cause RetryCache fail to work. Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1586714 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jing Zhao 2014-04-11 16:42:46 +00:00
parent 6d45bb54d2
commit 01ea648056
4 changed files with 41 additions and 5 deletions

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.ipc;
import java.util.Arrays;
import java.util.UUID;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -183,6 +184,8 @@ public class RetryCache {
private final long expirationTime;
private String cacheName;
private final ReentrantLock lock = new ReentrantLock();
/**
* Constructor
* @param cacheName name to identify the cache by
@ -206,6 +209,13 @@ public class RetryCache {
|| Arrays.equals(Server.getClientId(), RpcConstants.DUMMY_CLIENT_ID);
}
public void lock() {
this.lock.lock();
}
public void unlock() {
this.lock.unlock();
}
private void incrCacheClearedCounter() {
retryCacheMetrics.incrCacheCleared();
@ -247,7 +257,8 @@ public class RetryCache {
*/
private CacheEntry waitForCompletion(CacheEntry newEntry) {
CacheEntry mapEntry = null;
synchronized (this) {
lock.lock();
try {
mapEntry = set.get(newEntry);
// If an entry in the cache does not exist, add a new one
if (mapEntry == null) {
@ -262,6 +273,8 @@ public class RetryCache {
} else {
retryCacheMetrics.incrCacheHit();
}
} finally {
lock.unlock();
}
// Entry already exists in cache. Wait for completion and return its state
Preconditions.checkNotNull(mapEntry,
@ -292,8 +305,11 @@ public class RetryCache {
public void addCacheEntry(byte[] clientId, int callId) {
CacheEntry newEntry = new CacheEntry(clientId, callId, System.nanoTime()
+ expirationTime, true);
synchronized(this) {
lock.lock();
try {
set.put(newEntry);
} finally {
lock.unlock();
}
retryCacheMetrics.incrCacheUpdated();
}
@ -303,8 +319,11 @@ public class RetryCache {
// since the entry is loaded from editlog, we can assume it succeeded.
CacheEntry newEntry = new CacheEntryWithPayload(clientId, callId, payload,
System.nanoTime() + expirationTime, true);
synchronized(this) {
lock.lock();
try {
set.put(newEntry);
} finally {
lock.unlock();
}
retryCacheMetrics.incrCacheUpdated();
}

View File

@ -381,6 +381,9 @@ Release 2.4.1 - UNRELEASED
HDFS-6235. TestFileJournalManager can fail on Windows due to file locking if
tests run out of order. (cnauroth)
HDFS-6229. Race condition in failover can cause RetryCache fail to work.
(jing9)
Release 2.4.0 - 2014-04-07
INCOMPATIBLE CHANGES

View File

@ -805,7 +805,19 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
public RetryCache getRetryCache() {
return retryCache;
}
void lockRetryCache() {
if (retryCache != null) {
retryCache.lock();
}
}
void unlockRetryCache() {
if (retryCache != null) {
retryCache.unlock();
}
}
/** Whether or not retry cache is enabled */
boolean hasRetryCache() {
return retryCache != null;
@ -6934,8 +6946,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
if (cacheEntry != null && cacheEntry.isSuccess()) {
return (String) cacheEntry.getPayload();
}
writeLock();
String snapshotPath = null;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);

View File

@ -1574,10 +1574,12 @@ public class NameNode implements NameNodeStatusMXBean {
@Override
public void writeLock() {
namesystem.writeLock();
namesystem.lockRetryCache();
}
@Override
public void writeUnlock() {
namesystem.unlockRetryCache();
namesystem.writeUnlock();
}