Revert "HBASE-21164 reportForDuty should do backoff rather than retry"

This reverts commit f3f3798575.

This change causes TestMasterShutdown to reproducibly fail in some environments.
Andrew Purtell 2019-02-20 09:25:46 -08:00
parent fd74857d24
commit 47e42dab75
5 changed files with 26 additions and 132 deletions
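For context, the sketch below is a standalone, plain-Java illustration (not HBase code) of the two registration-retry strategies this revert switches between: HBASE-21164 made reportForDuty back off exponentially between failed attempts, and reverting it restores a fixed-period retry at the regionserver message interval. The class, the method names, and the register() stub are hypothetical placeholders.

// Standalone sketch, no HBase dependencies. register(...) is a hypothetical stand-in
// for HRegionServer#reportForDuty(); it succeeds on the fifth attempt here.
public final class RetryStrategiesSketch {

  // Behavior restored by this revert: retry at a fixed period between attempts.
  static int fixedInterval(long periodMs) throws InterruptedException {
    int attempts = 0;
    while (!register(++attempts)) {
      Thread.sleep(periodMs);
    }
    return attempts;
  }

  // Behavior removed by this revert: exponential backoff between attempts, capped at maxMs.
  static int exponentialBackoff(long initialMs, long maxMs) throws InterruptedException {
    int attempts = 0;
    long backoff = initialMs;
    while (!register(++attempts)) {
      Thread.sleep(backoff);
      backoff = Math.min(maxMs, backoff * 2);
    }
    return attempts;
  }

  private static boolean register(int attempt) {
    return attempt >= 5;
  }

  public static void main(String[] args) throws InterruptedException {
    System.out.println("fixed-interval attempts: " + fixedInterval(100));
    System.out.println("exponential-backoff attempts: " + exponentialBackoff(100, 5 * 60 * 1000));
  }
}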


@@ -174,14 +174,4 @@ public class RetryCounter {
public int getAttemptTimes() {
return attempts;
}
public long getBackoffTime() {
return this.retryConfig.backoffPolicy.getBackoffTime(this.retryConfig, getAttemptTimes());
}
public long getBackoffTimeAndIncrementAttempts() {
long backoffTime = getBackoffTime();
useRetry();
return backoffTime;
}
}
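For reference, the two methods removed above were the hooks HBASE-21164 used to obtain exponentially growing delays. A minimal usage sketch follows; it assumes the three-argument RetryCounterFactory(maxAttempts, initial sleep, max sleep) constructor that appears in the HRegionServer hunk below, and the printed delays depend on the factory's default backoff policy.

import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;

public final class BackoffSketch {
  public static void main(String[] args) {
    // Same shape as the removed HRegionServer code: effectively unlimited attempts,
    // 100 ms initial sleep, each backoff capped at 5 minutes.
    RetryCounterFactory factory = new RetryCounterFactory(Integer.MAX_VALUE, 100, 1000 * 60 * 5);
    RetryCounter counter = factory.create();
    for (int i = 0; i < 5; i++) {
      System.out.println("attempt " + counter.getAttemptTimes()
          + " -> next backoff " + counter.getBackoffTimeAndIncrementAttempts() + " ms");
    }
  }
}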


@@ -49,6 +49,13 @@ public class Sleeper {
this.stopper = stopper;
}
/**
* Sleep for period.
*/
public void sleep() {
sleep(System.currentTimeMillis());
}
/**
* If currently asleep, stops sleeping; if not asleep, will skip the next
* sleep cycle.
@@ -61,24 +68,28 @@ public class Sleeper {
}
/**
* Sleep for period.
* Sleep for period adjusted by passed <code>startTime</code>
* @param startTime Time some task started previous to now. Time to sleep
* will be docked current time minus passed <code>startTime</code>.
*/
public void sleep() {
sleep(this.period);
}
public void sleep(long sleepTime) {
public void sleep(final long startTime) {
if (this.stopper.isStopped()) {
return;
}
long now = System.currentTimeMillis();
long currentSleepTime = sleepTime;
while (currentSleepTime > 0) {
long waitTime = this.period - (now - startTime);
if (waitTime > this.period) {
LOG.warn("Calculated wait time > " + this.period +
"; setting to this.period: " + System.currentTimeMillis() + ", " +
startTime);
waitTime = this.period;
}
while (waitTime > 0) {
long woke = -1;
try {
synchronized (sleepLock) {
if (triggerWake) break;
sleepLock.wait(currentSleepTime);
sleepLock.wait(waitTime);
}
woke = System.currentTimeMillis();
long slept = woke - now;
@@ -97,7 +108,7 @@ public class Sleeper {
}
// Recalculate waitTime.
woke = (woke == -1)? System.currentTimeMillis(): woke;
currentSleepTime = this.period - (woke - now);
waitTime = this.period - (woke - startTime);
}
synchronized(sleepLock) {
triggerWake = false;
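A minimal sketch of the restored Sleeper contract, assuming the usual two-argument Sleeper(period, Stoppable) constructor: sleep(startTime) docks whatever time has already elapsed since startTime from the period, while the restored no-arg sleep() waits roughly one full period. The Stoppable stub and doSomeWork() are illustrative stand-ins, not HBase code.

import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.util.Sleeper;

public final class SleeperSketch {
  public static void main(String[] args) {
    // Trivial Stoppable so the example is self-contained; a real caller passes the server.
    Stoppable neverStopped = new Stoppable() {
      private volatile boolean stopped = false;
      @Override public void stop(String why) { stopped = true; }
      @Override public boolean isStopped() { return stopped; }
    };
    Sleeper sleeper = new Sleeper(1000, neverStopped); // one-second period

    long start = System.currentTimeMillis();
    doSomeWork();         // pretend this consumed part of the period
    sleeper.sleep(start); // sleeps only the remainder of the period since 'start'
    sleeper.sleep();      // sleeps roughly a full period from now
  }

  private static void doSomeWork() {
    // placeholder chore iteration
  }
}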


@@ -2649,18 +2649,7 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
stop("Stopped by " + Thread.currentThread().getName());
}
@Override
public void stop(String msg) {
if (!isStopped()) {
super.stop(msg);
if (this.activeMasterManager != null) {
this.activeMasterManager.stop();
}
}
}
@VisibleForTesting
protected void checkServiceStarted() throws ServerNotRunningYetException {
void checkServiceStarted() throws ServerNotRunningYetException {
if (!serviceStarted) {
throw new ServerNotRunningYetException("Server is not running yet");
}


@@ -182,8 +182,6 @@ import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.JSONBean;
import org.apache.hadoop.hbase.util.JvmPauseMonitor;
import org.apache.hadoop.hbase.util.MBeanUtil;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.Sleeper;
import org.apache.hadoop.hbase.util.Threads;
@@ -974,18 +972,13 @@ public class HRegionServer extends HasThread implements
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
}
// Try and register with the Master; tell it we are here. Break if server is stopped or the
// clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start
// up all Services. Use RetryCounter to get backoff in case Master is struggling to come up.
RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE,
this.sleeper.getPeriod(), 1000 * 60 * 5);
RetryCounter rc = rcf.create();
// Try and register with the Master; tell it we are here. Break if
// server is stopped or the clusterup flag is down or hdfs went wacky.
while (keepLooping()) {
RegionServerStartupResponse w = reportForDuty();
if (w == null) {
long sleepTime = rc.getBackoffTimeAndIncrementAttempts();
LOG.warn("reportForDuty failed; sleeping " + sleepTime + " ms and then retrying");
this.sleeper.sleep(sleepTime);
LOG.warn("reportForDuty failed; sleeping and then retrying.");
this.sleeper.sleep();
} else {
handleReportForDutyResponse(w);
break;


@@ -21,9 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.StringWriter;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -32,19 +30,12 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.log4j.Appender;
import org.apache.log4j.Layout;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.WriterAppender;
import org.apache.zookeeper.KeeperException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -81,86 +72,6 @@ public class TestRegionServerReportForDuty {
testUtil.shutdownMiniDFSCluster();
}
/**
* LogCapturer is similar to {@link org.apache.hadoop.test.GenericTestUtils.LogCapturer}
* except that this implementation has a default appender to the root logger.
* Hadoop 2.8+ supports the default appender in the LogCapture it ships and this can be replaced.
* TODO: This class can be removed after we upgrade Hadoop dependency.
*/
static class LogCapturer {
private StringWriter sw = new StringWriter();
private WriterAppender appender;
private org.apache.log4j.Logger logger;
LogCapturer(org.apache.log4j.Logger logger) {
this.logger = logger;
Appender defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("stdout");
if (defaultAppender == null) {
defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("console");
}
final Layout layout = (defaultAppender == null) ? new PatternLayout() :
defaultAppender.getLayout();
this.appender = new WriterAppender(layout, sw);
this.logger.addAppender(this.appender);
}
String getOutput() {
return sw.toString();
}
public void stopCapturing() {
this.logger.removeAppender(this.appender);
}
}
/**
* This test HMaster class will always throw ServerNotRunningYetException if checked.
*/
public static class NeverInitializedMaster extends HMaster {
public NeverInitializedMaster(Configuration conf, CoordinatedStateManager csm)
throws IOException, KeeperException, InterruptedException {
super(conf, csm);
}
@Override
protected void checkServiceStarted() throws ServerNotRunningYetException {
throw new ServerNotRunningYetException("Server is not running yet");
}
}
/**
* Tests region server should backoff to report for duty if master is not ready.
*/
@Test
public void testReportForDutyBackoff() throws IOException, InterruptedException {
cluster.getConfiguration().set(HConstants.MASTER_IMPL, NeverInitializedMaster.class.getName());
master = cluster.addMaster();
master.start();
LogCapturer capturer = new LogCapturer(org.apache.log4j.Logger.getLogger(HRegionServer.class));
// Set sleep interval relatively low so that exponential backoff is more demanding.
int msginterval = 100;
cluster.getConfiguration().setInt("hbase.regionserver.msginterval", msginterval);
rs = cluster.addRegionServer();
rs.start();
int interval = 10_000;
Thread.sleep(interval);
capturer.stopCapturing();
String output = capturer.getOutput();
LOG.info(output);
String failMsg = "reportForDuty failed;";
int count = StringUtils.countMatches(output, failMsg);
// Following asserts the actual retry number is in range (expectedRetry/2, expectedRetry*2).
// Ideally we can assert the exact retry count. We relax here to tolerate contention error.
int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval));
assertTrue(String.format("reportForDuty retries %d times, less than expected min %d",
count, expectedRetry / 2), count > expectedRetry / 2);
assertTrue(String.format("reportForDuty retries %d times, more than expected max %d",
count, expectedRetry * 2), count < expectedRetry * 2);
}
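For reference, the assertion bounds in the test removed above work out as follows (a hedged reading of the arithmetic, not part of the commit): Math.log is the natural logarithm, so with interval = 10_000 and msginterval = 100 the expected retry count is ceil(ln(9900)) = 10, and the test accepts any observed count strictly between 5 and 20.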
/**
* Tests region server reportForDuty with backup master becomes primary master after
* the first master goes away.