diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RetryCache.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RetryCache.java index 2b8ad123cae..d35ed950cf4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RetryCache.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RetryCache.java @@ -20,6 +20,7 @@ package org.apache.hadoop.ipc; import java.util.Arrays; import java.util.UUID; +import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -183,6 +184,8 @@ public class RetryCache { private final long expirationTime; private String cacheName; + private final ReentrantLock lock = new ReentrantLock(); + /** * Constructor * @param cacheName name to identify the cache by @@ -206,6 +209,13 @@ public class RetryCache { || Arrays.equals(Server.getClientId(), RpcConstants.DUMMY_CLIENT_ID); } + public void lock() { + this.lock.lock(); + } + + public void unlock() { + this.lock.unlock(); + } private void incrCacheClearedCounter() { retryCacheMetrics.incrCacheCleared(); @@ -247,7 +257,8 @@ public class RetryCache { */ private CacheEntry waitForCompletion(CacheEntry newEntry) { CacheEntry mapEntry = null; - synchronized (this) { + lock.lock(); + try { mapEntry = set.get(newEntry); // If an entry in the cache does not exist, add a new one if (mapEntry == null) { @@ -262,6 +273,8 @@ public class RetryCache { } else { retryCacheMetrics.incrCacheHit(); } + } finally { + lock.unlock(); } // Entry already exists in cache. 
Wait for completion and return its state Preconditions.checkNotNull(mapEntry, @@ -292,8 +305,11 @@ public void addCacheEntry(byte[] clientId, int callId) { CacheEntry newEntry = new CacheEntry(clientId, callId, System.nanoTime() + expirationTime, true); - synchronized(this) { + lock.lock(); + try { set.put(newEntry); + } finally { + lock.unlock(); } retryCacheMetrics.incrCacheUpdated(); } @@ -303,8 +319,11 @@ // since the entry is loaded from editlog, we can assume it succeeded. CacheEntry newEntry = new CacheEntryWithPayload(clientId, callId, payload, System.nanoTime() + expirationTime, true); - synchronized(this) { + lock.lock(); + try { set.put(newEntry); + } finally { + lock.unlock(); } retryCacheMetrics.incrCacheUpdated(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 69b1a8be63a..0f04c6aaef3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -381,6 +381,9 @@ Release 2.4.1 - UNRELEASED HDFS-6235. TestFileJournalManager can fail on Windows due to file locking if tests run out of order. (cnauroth) + HDFS-6229. Race condition in failover can cause RetryCache to fail to work. 
+ (jing9) + Release 2.4.0 - 2014-04-07 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 16c2df9ce4f..2141f49c176 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -805,7 +805,19 @@ public class FSNamesystem implements Namesystem, FSClusterStats, public RetryCache getRetryCache() { return retryCache; } - + + void lockRetryCache() { + if (retryCache != null) { + retryCache.lock(); + } + } + + void unlockRetryCache() { + if (retryCache != null) { + retryCache.unlock(); + } + } + /** Whether or not retry cache is enabled */ boolean hasRetryCache() { return retryCache != null; @@ -6934,8 +6946,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats, if (cacheEntry != null && cacheEntry.isSuccess()) { return (String) cacheEntry.getPayload(); } - writeLock(); String snapshotPath = null; + writeLock(); try { checkOperation(OperationCategory.WRITE); checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index 673453886b5..441eedef5ae 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -1574,10 +1574,12 @@ public class NameNode implements NameNodeStatusMXBean { @Override public void writeLock() { namesystem.writeLock(); + namesystem.lockRetryCache(); } @Override public void writeUnlock() { + 
namesystem.unlockRetryCache(); namesystem.writeUnlock(); }