HBASE-24511 Ability to configure timeout between RPC retry to RS from master (#1861)
Signed-off-by: Viraj Jasani <vjasani@apache.org>
This commit is contained in:
parent
b67f896954
commit
c02554024f
@ -235,12 +235,19 @@ public class RSProcedureDispatcher
|
||||
private int numberOfAttemptsSoFar = 0;
|
||||
private long maxWaitTime = -1;
|
||||
|
||||
private final long rsRpcRetryInterval;
|
||||
private static final String RS_RPC_RETRY_INTERVAL_CONF_KEY =
|
||||
"hbase.regionserver.rpc.retry.interval";
|
||||
private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
|
||||
|
||||
private ExecuteProceduresRequest.Builder request = null;
|
||||
|
||||
public ExecuteProceduresRemoteCall(final ServerName serverName,
|
||||
final Set<RemoteProcedure> remoteProcedures) {
|
||||
this.serverName = serverName;
|
||||
this.remoteProcedures = remoteProcedures;
|
||||
this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
|
||||
DEFAULT_RS_RPC_RETRY_INTERVAL);
|
||||
}
|
||||
|
||||
private AdminService.BlockingInterface getRsAdmin() throws IOException {
|
||||
@ -265,8 +272,8 @@ public class RSProcedureDispatcher
|
||||
LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
|
||||
serverName, numberOfAttemptsSoFar, remainingTime);
|
||||
numberOfAttemptsSoFar++;
|
||||
// Retry every 100ms up to maximum wait time.
|
||||
submitTask(this, 100, TimeUnit.MILLISECONDS);
|
||||
// Retry every rsRpcRetryInterval millis up to maximum wait time.
|
||||
submitTask(this, rsRpcRetryInterval, TimeUnit.MILLISECONDS);
|
||||
return true;
|
||||
}
|
||||
LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying another server",
|
||||
@ -311,10 +318,12 @@ public class RSProcedureDispatcher
|
||||
numberOfAttemptsSoFar++;
|
||||
// Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
|
||||
// with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
|
||||
// -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
|
||||
// -- fit here nicely so just do something simple; increment by rsRpcRetryInterval millis *
|
||||
// retry^2 on each try
|
||||
// up to max of 10 seconds (don't want to back off too much in case of situation change).
|
||||
submitTask(this,
|
||||
Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
|
||||
Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
|
||||
10 * 1000),
|
||||
TimeUnit.MILLISECONDS);
|
||||
return true;
|
||||
}
|
||||
|
@ -22,7 +22,8 @@ import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.log4j.Appender;
|
||||
import org.apache.log4j.Layout;
|
||||
import org.apache.log4j.PatternLayout;
|
||||
@ -222,6 +224,40 @@ public class TestRegionServerReportForDuty {
|
||||
tablesOnMaster? 3: 2);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests region sever reportForDuty with RS RPC retry
|
||||
*/
|
||||
@Test
|
||||
public void testReportForDutyWithRSRpcRetry() throws Exception {
|
||||
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor =
|
||||
new ScheduledThreadPoolExecutor(1, Threads.newDaemonThreadFactory("RSDelayedStart"));
|
||||
|
||||
// Start a master and wait for it to become the active/primary master.
|
||||
// Use a random unique port
|
||||
cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
|
||||
// Override the default RS RPC retry interval of 100ms to 300ms
|
||||
cluster.getConfiguration().setLong("hbase.regionserver.rpc.retry.interval", 300);
|
||||
// master has a rs. defaultMinToStart = 2
|
||||
boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(testUtil.getConfiguration());
|
||||
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART,
|
||||
tablesOnMaster ? 2 : 1);
|
||||
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART,
|
||||
tablesOnMaster ? 2 : 1);
|
||||
master = cluster.addMaster();
|
||||
rs = cluster.addRegionServer();
|
||||
LOG.debug("Starting master: " + master.getMaster().getServerName());
|
||||
master.start();
|
||||
// Delay the RS start so that the meta assignment fails in first attempt and goes to retry block
|
||||
scheduledThreadPoolExecutor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
rs.start();
|
||||
}
|
||||
}, 1000, TimeUnit.MILLISECONDS);
|
||||
|
||||
waitForClusterOnline(master);
|
||||
}
|
||||
|
||||
private void waitForClusterOnline(MasterThread master) throws InterruptedException {
|
||||
while (true) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user