HDFS-5064. Standby checkpoints should not block concurrent readers. Contributed by Aaron Twining Myers.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1575449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
78f1a475c8
commit
fd3e554ff0
|
@ -329,6 +329,9 @@ Release 2.4.0 - UNRELEASED
|
|||
HDFS-6065. HDFS zero-copy reads should return null on EOF when doing ZCR
|
||||
(cmccabe)
|
||||
|
||||
HDFS-5064. Standby checkpoints should not block concurrent readers.
|
||||
(atm via wang)
|
||||
|
||||
BREAKDOWN OF HDFS-5698 SUBTASKS AND RELATED JIRAS
|
||||
|
||||
HDFS-5717. Save FSImage header in protobuf. (Haohui Mai via jing9)
|
||||
|
|
|
@ -1262,7 +1262,11 @@ public class FSImage implements Closeable {
|
|||
this.lastAppliedTxId = editLog.getLastWrittenTxId();
|
||||
}
|
||||
|
||||
public synchronized long getMostRecentCheckpointTxId() {
|
||||
// Should be OK for this to not be synchronized since all of the places which
|
||||
// mutate this value are themselves synchronized so it shouldn't be possible
|
||||
// to see this flop back and forth. In the worst case this will just return an
|
||||
// old value.
|
||||
public long getMostRecentCheckpointTxId() {
|
||||
return storage.getMostRecentCheckpointTxId();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -116,6 +116,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
import javax.management.NotCompliantMBeanException;
|
||||
|
@ -1294,20 +1295,47 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
this.fsLock.readLock().lock();
|
||||
}
|
||||
@Override
|
||||
public void longReadLockInterruptibly() throws InterruptedException {
|
||||
this.fsLock.longReadLock().lockInterruptibly();
|
||||
try {
|
||||
this.fsLock.readLock().lockInterruptibly();
|
||||
} catch (InterruptedException ie) {
|
||||
// In the event we're interrupted while getting the normal FSNS read lock,
|
||||
// release the long read lock.
|
||||
this.fsLock.longReadLock().unlock();
|
||||
throw ie;
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public void longReadUnlock() {
|
||||
this.fsLock.readLock().unlock();
|
||||
this.fsLock.longReadLock().unlock();
|
||||
}
|
||||
@Override
|
||||
public void readUnlock() {
|
||||
this.fsLock.readLock().unlock();
|
||||
}
|
||||
@Override
|
||||
public void writeLock() {
|
||||
this.fsLock.longReadLock().lock();
|
||||
this.fsLock.writeLock().lock();
|
||||
}
|
||||
@Override
|
||||
public void writeLockInterruptibly() throws InterruptedException {
|
||||
this.fsLock.writeLock().lockInterruptibly();
|
||||
this.fsLock.longReadLock().lockInterruptibly();
|
||||
try {
|
||||
this.fsLock.writeLock().lockInterruptibly();
|
||||
} catch (InterruptedException ie) {
|
||||
// In the event we're interrupted while getting the normal FSNS write
|
||||
// lock, release the long read lock.
|
||||
this.fsLock.longReadLock().unlock();
|
||||
throw ie;
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public void writeUnlock() {
|
||||
this.fsLock.writeLock().unlock();
|
||||
this.fsLock.longReadLock().unlock();
|
||||
}
|
||||
@Override
|
||||
public boolean hasWriteLock() {
|
||||
|
@ -6806,10 +6834,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
}
|
||||
|
||||
@VisibleForTesting
|
||||
ReentrantReadWriteLock getFsLockForTests() {
|
||||
public ReentrantReadWriteLock getFsLockForTests() {
|
||||
return fsLock.coarseLock;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public ReentrantLock getLongReadLockForTests() {
|
||||
return fsLock.longReadLock;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public SafeModeInfo getSafeModeInfoForTests() {
|
||||
return safeMode;
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.hadoop.hdfs.server.namenode;
|
|||
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
@ -33,6 +34,24 @@ class FSNamesystemLock implements ReadWriteLock {
|
|||
@VisibleForTesting
|
||||
protected ReentrantReadWriteLock coarseLock;
|
||||
|
||||
/**
|
||||
* When locking the FSNS for a read that may take a long time, we take this
|
||||
* lock before taking the regular FSNS read lock. All writers also take this
|
||||
* lock before taking the FSNS write lock. Regular (short) readers do not
|
||||
* take this lock at all, instead relying solely on the synchronization of the
|
||||
* regular FSNS lock.
|
||||
*
|
||||
* This scheme ensures that:
|
||||
* 1) In the case of normal (fast) ops, readers proceed concurrently and
|
||||
* writers are not starved.
|
||||
* 2) In the case of long read ops, short reads are allowed to proceed
|
||||
* concurrently during the duration of the long read.
|
||||
*
|
||||
* See HDFS-5064 for more context.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
protected ReentrantLock longReadLock = new ReentrantLock(true);
|
||||
|
||||
FSNamesystemLock(boolean fair) {
|
||||
this.coarseLock = new ReentrantReadWriteLock(fair);
|
||||
}
|
||||
|
@ -47,6 +66,10 @@ class FSNamesystemLock implements ReadWriteLock {
|
|||
return coarseLock.writeLock();
|
||||
}
|
||||
|
||||
public Lock longReadLock() {
|
||||
return longReadLock;
|
||||
}
|
||||
|
||||
public int getReadHoldCount() {
|
||||
return coarseLock.getReadHoldCount();
|
||||
}
|
||||
|
|
|
@ -124,7 +124,7 @@ public class NNStorage extends Storage implements Closeable,
|
|||
* recent fsimage file. This does not include any transactions
|
||||
* that have since been written to the edit log.
|
||||
*/
|
||||
protected long mostRecentCheckpointTxId = HdfsConstants.INVALID_TXID;
|
||||
protected volatile long mostRecentCheckpointTxId = HdfsConstants.INVALID_TXID;
|
||||
|
||||
/**
|
||||
* Time of the last checkpoint, in milliseconds since the epoch.
|
||||
|
|
|
@ -145,7 +145,7 @@ public class StandbyCheckpointer {
|
|||
assert canceler != null;
|
||||
final long txid;
|
||||
|
||||
namesystem.writeLockInterruptibly();
|
||||
namesystem.longReadLockInterruptibly();
|
||||
try {
|
||||
assert namesystem.getEditLog().isOpenForRead() :
|
||||
"Standby Checkpointer should only attempt a checkpoint when " +
|
||||
|
@ -168,7 +168,7 @@ public class StandbyCheckpointer {
|
|||
assert txid == thisCheckpointTxId : "expected to save checkpoint at txid=" +
|
||||
thisCheckpointTxId + " but instead saved at txid=" + txid;
|
||||
} finally {
|
||||
namesystem.writeUnlock();
|
||||
namesystem.longReadUnlock();
|
||||
}
|
||||
|
||||
// Upload the saved checkpoint back to the active
|
||||
|
|
|
@ -22,6 +22,15 @@ public interface RwLock {
|
|||
/** Acquire read lock. */
|
||||
public void readLock();
|
||||
|
||||
/**
|
||||
* Acquire the long read lock, unless interrupted while waiting. The long
|
||||
* read lock should also serve to block all concurrent writers.
|
||||
**/
|
||||
void longReadLockInterruptibly() throws InterruptedException;
|
||||
|
||||
/** Release the long read lock. */
|
||||
public void longReadUnlock();
|
||||
|
||||
/** Release read lock. */
|
||||
public void readUnlock();
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
|
@ -25,6 +26,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
|
@ -33,6 +35,7 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSImage;
|
||||
|
@ -311,6 +314,66 @@ public class TestStandbyCheckpoints {
|
|||
answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
|
||||
}
|
||||
|
||||
@Test(timeout=300000)
|
||||
public void testReadsAllowedDuringCheckpoint() throws Exception {
|
||||
|
||||
// Set it up so that we know when the SBN checkpoint starts and ends.
|
||||
FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
|
||||
DelayAnswer answerer = new DelayAnswer(LOG);
|
||||
Mockito.doAnswer(answerer).when(spyImage1)
|
||||
.saveNamespace(Mockito.any(FSNamesystem.class),
|
||||
Mockito.any(Canceler.class));
|
||||
|
||||
// Perform some edits and wait for a checkpoint to start on the SBN.
|
||||
doEdits(0, 1000);
|
||||
nn0.getRpcServer().rollEditLog();
|
||||
answerer.waitForCall();
|
||||
assertTrue("SBN is not performing checkpoint but it should be.",
|
||||
answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
|
||||
|
||||
// Make sure that the lock has actually been taken by the checkpointing
|
||||
// thread.
|
||||
ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
|
||||
|
||||
// Perform an RPC that needs to take the write lock.
|
||||
Thread t = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
nn1.getRpcServer().restoreFailedStorage("false");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
|
||||
// Make sure that our thread is waiting for the lock.
|
||||
ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
|
||||
|
||||
assertFalse(nn1.getNamesystem().getFsLockForTests().hasQueuedThreads());
|
||||
assertFalse(nn1.getNamesystem().getFsLockForTests().isWriteLocked());
|
||||
assertTrue(nn1.getNamesystem().getLongReadLockForTests().hasQueuedThreads());
|
||||
|
||||
// Get /jmx of the standby NN web UI, which will cause the FSNS read lock to
|
||||
// be taken.
|
||||
String pageContents = DFSTestUtil.urlGet(new URL("http://" +
|
||||
nn1.getHttpAddress().getHostName() + ":" +
|
||||
nn1.getHttpAddress().getPort() + "/jmx"));
|
||||
assertTrue(pageContents.contains("NumLiveDataNodes"));
|
||||
|
||||
// Make sure that the checkpoint is still going on, implying that the client
|
||||
// RPC to the SBN happened during the checkpoint.
|
||||
assertTrue("SBN should have still been checkpointing.",
|
||||
answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
|
||||
answerer.proceed();
|
||||
answerer.waitForResult();
|
||||
assertTrue("SBN should have finished checkpointing.",
|
||||
answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
|
||||
|
||||
t.join();
|
||||
}
|
||||
|
||||
private void doEdits(int start, int stop) throws IOException {
|
||||
for (int i = start; i < stop; i++) {
|
||||
Path p = new Path("/test" + i);
|
||||
|
|
Loading…
Reference in New Issue