From 0f514ab75ae3d903bb2229c97ef824bbd53d3ba8 Mon Sep 17 00:00:00 2001 From: Mingliang Liu Date: Thu, 6 Sep 2018 23:01:52 -0700 Subject: [PATCH] HBASE-21164 reportForDuty should do backoff rather than retry Remove unused methods from Sleeper (its ok, its @Private). Remove notion of startTime from Sleeper handling (it is is unused). Allow passing in how long to sleep so can maintain externally. In HRS, use a RetryCounter to calculate backoff sleep time for when reportForDuty is failing against a struggling Master. --- .../org/apache/hadoop/hbase/util/Sleeper.java | 31 +++---- .../apache/hadoop/hbase/master/HMaster.java | 3 +- .../hbase/regionserver/HRegionServer.java | 16 ++-- .../TestRegionServerReportForDuty.java | 88 +++++++++++++++++++ 4 files changed, 111 insertions(+), 27 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index 7d4d692e1a1..93ef08cc605 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -49,13 +49,6 @@ public class Sleeper { this.stopper = stopper; } - /** - * Sleep for period. - */ - public void sleep() { - sleep(System.currentTimeMillis()); - } - /** * If currently asleep, stops sleeping; if not asleep, will skip the next * sleep cycle. @@ -68,28 +61,24 @@ public class Sleeper { } /** - * Sleep for period adjusted by passed startTime - * @param startTime Time some task started previous to now. Time to sleep - * will be docked current time minus passed startTime. + * Sleep for period. */ - public void sleep(final long startTime) { + public void sleep() { + sleep(this.period); + } + + public void sleep(long sleepTime) { if (this.stopper.isStopped()) { return; } long now = System.currentTimeMillis(); - long waitTime = this.period - (now - startTime); - if (waitTime > this.period) { - LOG.warn("Calculated wait time > " + this.period + - "; setting to this.period: " + System.currentTimeMillis() + ", " + - startTime); - waitTime = this.period; - } - while (waitTime > 0) { + long currentSleepTime = sleepTime; + while (currentSleepTime > 0) { long woke = -1; try { synchronized (sleepLock) { if (triggerWake) break; - sleepLock.wait(waitTime); + sleepLock.wait(currentSleepTime); } woke = System.currentTimeMillis(); long slept = woke - now; @@ -108,7 +97,7 @@ public class Sleeper { } // Recalculate waitTime. woke = (woke == -1)? System.currentTimeMillis(): woke; - waitTime = this.period - (woke - startTime); + currentSleepTime = this.period - (woke - now); } synchronized(sleepLock) { triggerWake = false; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 50c7b80bbd1..0678bfe9477 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2998,7 +2998,8 @@ public class HMaster extends HRegionServer implements MasterServices { } } - void checkServiceStarted() throws ServerNotRunningYetException { + @VisibleForTesting + protected void checkServiceStarted() throws ServerNotRunningYetException { if (!serviceStarted) { throw new ServerNotRunningYetException("Server is not running yet"); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 9c817745639..38897fb830b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -147,6 +147,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.NettyEventLoopGroupConfig; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; import org.apache.hadoop.hbase.util.Sleeper; import org.apache.hadoop.hbase.util.Threads; @@ -937,14 +939,18 @@ public class HRegionServer extends HasThread implements this.rsHost = new RegionServerCoprocessorHost(this, this.conf); } - // Try and register with the Master; tell it we are here. Break if - // server is stopped or the clusterup flag is down or hdfs went wacky. - // Once registered successfully, go ahead and start up all Services. + // Try and register with the Master; tell it we are here. Break if server is stopped or the + // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start + // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. + RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, + this.sleeper.getPeriod(), 1000 * 60 * 5); + RetryCounter rc = rcf.create(); while (keepLooping()) { RegionServerStartupResponse w = reportForDuty(); if (w == null) { - LOG.warn("reportForDuty failed; sleeping and then retrying."); - this.sleeper.sleep(); + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); + this.sleeper.sleep(sleepTime); } else { handleReportForDutyResponse(w); break; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index 83632be6789..aaf2d2eb9a0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -21,6 +21,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.io.StringWriter; + +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; @@ -28,11 +31,17 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.LocalHBaseCluster; import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.LoadBalancer; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; +import org.apache.log4j.Appender; +import org.apache.log4j.Layout; +import org.apache.log4j.PatternLayout; +import org.apache.log4j.WriterAppender; import org.apache.zookeeper.KeeperException; import org.junit.After; import org.junit.Before; @@ -77,6 +86,85 @@ public class TestRegionServerReportForDuty { testUtil.shutdownMiniDFSCluster(); } + /** + * LogCapturer is similar to {@link org.apache.hadoop.test.GenericTestUtils.LogCapturer} + * except that this implementation has a default appender to the root logger. + * Hadoop 2.8+ supports the default appender in the LogCapture it ships and this can be replaced. + * TODO: This class can be removed after we upgrade Hadoop dependency. + */ + static class LogCapturer { + private StringWriter sw = new StringWriter(); + private WriterAppender appender; + private org.apache.log4j.Logger logger; + + LogCapturer(org.apache.log4j.Logger logger) { + this.logger = logger; + Appender defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("stdout"); + if (defaultAppender == null) { + defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("console"); + } + final Layout layout = (defaultAppender == null) ? new PatternLayout() : + defaultAppender.getLayout(); + this.appender = new WriterAppender(layout, sw); + this.logger.addAppender(this.appender); + } + + String getOutput() { + return sw.toString(); + } + + public void stopCapturing() { + this.logger.removeAppender(this.appender); + } + } + + /** + * This test HMaster class will always throw ServerNotRunningYetException if checked. + */ + public static class NeverInitializedMaster extends HMaster { + public NeverInitializedMaster(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + protected void checkServiceStarted() throws ServerNotRunningYetException { + throw new ServerNotRunningYetException("Server is not running yet"); + } + } + + /** + * Tests region server should backoff to report for duty if master is not ready. + */ + @Test + public void testReportForDutyBackoff() throws IOException, InterruptedException { + cluster.getConfiguration().set(HConstants.MASTER_IMPL, NeverInitializedMaster.class.getName()); + master = cluster.addMaster(); + master.start(); + + LogCapturer capturer = new LogCapturer(org.apache.log4j.Logger.getLogger(HRegionServer.class)); + // Set sleep interval relatively low so that exponential backoff is more demanding. + int msginterval = 100; + cluster.getConfiguration().setInt("hbase.regionserver.msginterval", msginterval); + rs = cluster.addRegionServer(); + rs.start(); + + int interval = 10_000; + Thread.sleep(interval); + capturer.stopCapturing(); + String output = capturer.getOutput(); + LOG.info("{}", output); + String failMsg = "reportForDuty failed;"; + int count = StringUtils.countMatches(output, failMsg); + + // Following asserts the actual retry number is in range (expectedRetry/2, expectedRetry*2). + // Ideally we can assert the exact retry count. We relax here to tolerate contention error. + int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval)); + assertTrue(String.format("reportForDuty retries %d times, less than expected min %d", + count, expectedRetry / 2), count > expectedRetry / 2); + assertTrue(String.format("reportForDuty retries %d times, more than expected max %d", + count, expectedRetry * 2), count < expectedRetry * 2); + } + /** * Tests region sever reportForDuty with backup master becomes primary master after * the first master goes away.