diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index 09bec147aea..6b77e809426 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -679,7 +679,8 @@ public class FSHLog extends AbstractFSWAL { return logRollNeeded; } - private long getSequenceOnRingBuffer() { + @VisibleForTesting + protected long getSequenceOnRingBuffer() { return this.disruptor.getRingBuffer().next(); } @@ -688,7 +689,8 @@ public class FSHLog extends AbstractFSWAL { return publishSyncOnRingBuffer(sequence); } - private SyncFuture publishSyncOnRingBuffer(long sequence) { + @VisibleForTesting + protected SyncFuture publishSyncOnRingBuffer(long sequence) { // here we use ring buffer sequence as transaction id SyncFuture syncFuture = getSyncFuture(sequence); try { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java index 5eb8fa8a51a..9e051aeed1e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java @@ -56,6 +56,7 @@ import org.apache.hadoop.hbase.wal.WALProvider.Writer; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; import org.junit.Rule; @@ -308,6 +309,140 @@ public class TestWALLockup { } } + /** + * Checks that a failed WAL sync does not leave RingBufferEventHandler stuck. + * If this is broken, the test times out because RingBufferEventHandler is stuck in + * attainSafePoint and everyone waits forever for sync to finish. See HBASE-14317. 
+ */ + @Test (timeout=30000) + public void testRingBufferEventHandlerStuckWhenSyncFailed() + throws IOException, InterruptedException { + + // A WAL whose writer always fails on sync and that slows FSHLog.replaceWriter down + class DodgyFSLog extends FSHLog { + + private volatile boolean zigZagCreated = false; + + public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) + throws IOException { + super(fs, root, logDir, conf); + } + + @Override + protected void afterCreatingZigZagLatch() { + zigZagCreated = true; + // Sleep a while to wait for RingBufferEventHandler to get stuck first. + try { + Thread.sleep(3000); + } catch (InterruptedException ignore) { // NOTE(review): interrupt is swallowed here + } + } + + @Override + protected long getSequenceOnRingBuffer() { + return super.getSequenceOnRingBuffer(); + } + + protected void publishSyncOnRingBufferAndBlock(long sequence) { + try { + super.blockOnSync(super.publishSyncOnRingBuffer(sequence)); + Assert.fail("Expect an IOException here."); + } catch (IOException ignore) { + // Expected. NOTE(review): Assert.fail above runs on syncThread; JUnit may not see it. + } + } + + @Override + protected Writer createWriterInstance(Path path) throws IOException { + final Writer w = super.createWriterInstance(path); + return new Writer() { + @Override + public void close() throws IOException { + w.close(); + } + + @Override + public void sync() throws IOException { + throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); + } + + @Override + public void append(Entry entry) throws IOException { + w.append(entry); + } + + @Override + public long getLength() { + return w.getLength(); + } + }; + } + } + + // Mocked up server and regionserver services. Needed below. + final Server server = Mockito.mock(Server.class); + Mockito.when(server.getConfiguration()).thenReturn(CONF); + Mockito.when(server.isStopped()).thenReturn(false); + Mockito.when(server.isAborted()).thenReturn(false); + RegionServerServices services = Mockito.mock(RegionServerServices.class); + + // OK. 
Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. + FileSystem fs = FileSystem.get(CONF); + Path rootDir = new Path(dir + getName()); + final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); + // I need a log roller running. + LogRoller logRoller = new LogRoller(server, services); + logRoller.addWAL(dodgyWAL); + // There is no 'stop' once a logRoller is running.. it just dies. + logRoller.start(); + + try { + final long seqForSync = dodgyWAL.getSequenceOnRingBuffer(); + + // This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch + // in LogRoller. + // After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event, + // as long as HBASE-14317 hasn't been fixed. + LOG.info("Trigger log roll for creating a ZigZagLatch."); + logRoller.requestRollAll(); + + while (!dodgyWAL.zigZagCreated) { // poll until afterCreatingZigZagLatch() has run + Thread.sleep(10); + } + + // Send a sync event for RingBufferEventHandler, + // and it gets blocked in RingBufferEventHandler.attainSafePoint + LOG.info("Send sync for RingBufferEventHandler"); + Thread syncThread = new Thread() { + @Override + public void run() { + dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync); + } + }; + // Sync in another thread to avoid resetting the SyncFuture again. + syncThread.start(); + syncThread.join(); + + try { + LOG.info("Call sync for testing whether RingBufferEventHandler is hanging."); + dodgyWAL.sync(); // Should not hang here; otherwise this test times out. + Assert.fail("Expect an IOException here."); + } catch (IOException ignore) { // Expected: the fake Writer's sync() always throws + } + + } finally { + // To stop logRoller, its server has to say it is stopped. + Mockito.when(server.isStopped()).thenReturn(true); + if (logRoller != null) { + logRoller.interrupt(); + } + if (dodgyWAL != null) { + dodgyWAL.close(); + } + } + } + + + static class DummyServer implements Server { private Configuration conf; private String serverName;