From 1eb159b455075f92e4e8ed3de96a599d035ccb1a Mon Sep 17 00:00:00 2001
From: sguggilam
Date: Wed, 10 Jun 2020 00:23:08 -0700
Subject: [PATCH] HBASE-24511 Ability to configure timeout between RPC retry to
 RS from master (#1861)

Signed-off-by: Viraj Jasani
---
 .../procedure/RSProcedureDispatcher.java      | 17 +++++++--
 .../TestRegionServerReportForDuty.java        | 38 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index 72d504eeced..1942ed6e8ab 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -234,12 +234,19 @@ public class RSProcedureDispatcher
     private int numberOfAttemptsSoFar = 0;
     private long maxWaitTime = -1;
 
+    private final long rsRpcRetryInterval;
+    private static final String RS_RPC_RETRY_INTERVAL_CONF_KEY =
+        "hbase.regionserver.rpc.retry.interval";
+    private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
+
     private ExecuteProceduresRequest.Builder request = null;
 
     public ExecuteProceduresRemoteCall(final ServerName serverName,
         final Set<RemoteProcedure> remoteProcedures) {
       this.serverName = serverName;
       this.remoteProcedures = remoteProcedures;
+      this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
+        DEFAULT_RS_RPC_RETRY_INTERVAL);
     }
 
     private AsyncRegionServerAdmin getRsAdmin() throws IOException {
@@ -259,8 +266,8 @@ public class RSProcedureDispatcher
         LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
           serverName, numberOfAttemptsSoFar, remainingTime);
         numberOfAttemptsSoFar++;
-        // Retry every 100ms up to maximum wait time.
-        submitTask(this, 100, TimeUnit.MILLISECONDS);
+        // Retry every rsRpcRetryInterval millis up to maximum wait time.
+        submitTask(this, rsRpcRetryInterval, TimeUnit.MILLISECONDS);
         return true;
       }
       LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying another server",
@@ -305,10 +312,12 @@ public class RSProcedureDispatcher
       numberOfAttemptsSoFar++;
       // Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
      // with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
-      // -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
+      // -- fit here nicely so just do something simple; increment by rsRpcRetryInterval millis *
+      // retry^2 on each try
       // up to max of 10 seconds (don't want to back off too much in case of situation change).
       submitTask(this,
-        Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
+        Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
+          10 * 1000),
         TimeUnit.MILLISECONDS);
       return true;
     }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
index dd1c085718b..0e1c2801120 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
@@ -22,7 +22,8 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.io.StringWriter;
-
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
@@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
+import org.apache.hadoop.hbase.util.Threads;
 import org.apache.log4j.Appender;
 import org.apache.log4j.Layout;
 import org.apache.log4j.PatternLayout;
@@ -222,6 +224,40 @@ public class TestRegionServerReportForDuty {
       tablesOnMaster? 3: 2);
   }
 
+
+  /**
+   * Tests region server reportForDuty with RS RPC retry
+   */
+  @Test
+  public void testReportForDutyWithRSRpcRetry() throws Exception {
+    ScheduledThreadPoolExecutor scheduledThreadPoolExecutor =
+      new ScheduledThreadPoolExecutor(1, Threads.newDaemonThreadFactory("RSDelayedStart"));
+
+    // Start a master and wait for it to become the active/primary master.
+    // Use a random unique port
+    cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
+    // Override the default RS RPC retry interval of 100ms to 300ms
+    cluster.getConfiguration().setLong("hbase.regionserver.rpc.retry.interval", 300);
+    // master has a rs. defaultMinToStart = 2
+    boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(testUtil.getConfiguration());
+    cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART,
+      tablesOnMaster ? 2 : 1);
+    cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART,
+      tablesOnMaster ? 2 : 1);
+    master = cluster.addMaster();
+    rs = cluster.addRegionServer();
+    LOG.debug("Starting master: " + master.getMaster().getServerName());
+    master.start();
+    // Delay the RS start so that the meta assignment fails in first attempt and goes to retry block
+    scheduledThreadPoolExecutor.schedule(new Runnable() {
+      @Override
+      public void run() {
+        rs.start();
+      }
+    }, 1000, TimeUnit.MILLISECONDS);
+
+    waitForClusterOnline(master);
+  }
 
   private void waitForClusterOnline(MasterThread master) throws InterruptedException {
     while (true) {
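Reviewer note (not part of the patch): the behavioral change above is that the dispatcher's retry pacing is now driven by the new "hbase.regionserver.rpc.retry.interval" setting instead of a hard-coded 100 ms. A ServerNotRunningYetException is retried at a flat rsRpcRetryInterval, while other failures back off by rsRpcRetryInterval * attempt^2, capped at 10 seconds. The standalone Java sketch below only recomputes that schedule so the effect of tuning the interval is easy to see; the class and method names here are invented for illustration, and only the config key, the 100 ms default, and the 10-second cap come from the diff.

// RetryBackoffSketch.java -- illustration only, not HBase code.
public class RetryBackoffSketch {
  // Default interval and cap taken from the patch above.
  static final long DEFAULT_RS_RPC_RETRY_INTERVAL_MS = 100L;
  static final long MAX_BACKOFF_MS = 10L * 1000L;

  // Delay before retry number `attempt`: interval * attempt^2, capped at 10 seconds.
  static long backoffMillis(long retryIntervalMs, int attempt) {
    return Math.min(retryIntervalMs * attempt * attempt, MAX_BACKOFF_MS);
  }

  public static void main(String[] args) {
    // Compare the default 100 ms interval with the 300 ms value used in the new test.
    for (long interval : new long[] { DEFAULT_RS_RPC_RETRY_INTERVAL_MS, 300L }) {
      for (int attempt = 1; attempt <= 12; attempt++) {
        System.out.printf("interval=%dms attempt=%d delay=%dms%n",
          interval, attempt, backoffMillis(interval, attempt));
      }
    }
  }
}

With the default 100 ms interval the delay grows 100, 400, 900, ... and reaches the 10-second cap at attempt 10; with the 300 ms interval configured by the new test, the cap is reached by attempt 6. That is the knob this change exposes for clusters where the master should poll region servers more or less aggressively while they come up.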