From 382ebf3f4eeb3e8b5055dc957a16825b34c0ed7d Mon Sep 17 00:00:00 2001 From: Pankaj Date: Tue, 29 Dec 2020 22:25:36 +0530 Subject: [PATCH] HBASE-25379 Make retry pause time configurable for regionserver short operation RPC (reportRegionStateTransition/reportProcedureDone) (#2757) * HBASE-25379 Make retry pause time configurable for regionserver short operation RPC (reportRegionStateTransition/reportProcedureDone) * HBASE-25379 RemoteProcedureResultReporter also should retry after the configured pause time * Addressed the review comments Signed-off-by: Yulin Niu (cherry picked from commit c96fbf04077fa37555033f88939fdd69ac810b35) --- .../org/apache/hadoop/hbase/HConstants.java | 11 +++++++++ .../hbase/regionserver/HRegionServer.java | 23 +++++++++++++++---- .../RemoteProcedureResultReporter.java | 7 ++---- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index c7b13af674b..dc43e8a23e5 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -987,6 +987,17 @@ public final class HConstants { */ public static final int DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT = 10000; + /** + * Retry pause time for short operation RPC + */ + public static final String HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME = + "hbase.rpc.shortoperation.retry.pause.time"; + + /** + * Default value of {@link #HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME} + */ + public static final long DEFAULT_HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME = 1000; + /** * Value indicating the server name was saved with no sequence number. 
*/ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 67a93a61954..702d147ac11 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -437,6 +437,9 @@ public class HRegionServer extends Thread implements private final int operationTimeout; private final int shortOperationTimeout; + // Time to pause if master says 'please hold' + private final long retryPauseTime; + private final RegionServerAccounting regionServerAccounting; private SlowLogTableOpsChore slowLogTableOpsChore = null; @@ -617,6 +620,9 @@ public class HRegionServer extends Thread implements this.shortOperationTimeout = conf.getInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT); + this.retryPauseTime = conf.getLong(HConstants.HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME, + HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME); + this.abortRequested = new AtomicBoolean(false); this.stopped = false; @@ -2425,10 +2431,8 @@ public class HRegionServer extends Thread implements final ReportRegionStateTransitionRequest request = createReportRegionStateTransitionRequest(context); - // Time to pause if master says 'please hold'. Make configurable if needed. - final long initPauseTime = 1000; int tries = 0; - long pauseTime; + long pauseTime = this.retryPauseTime; // Keep looping till we get an error. We want to send reports even though server is going down. // Only go down if clusterConnection is null. It is set to null almost as last thing as the // HRegionServer does down. @@ -2459,9 +2463,9 @@ public class HRegionServer extends Thread implements || ioe instanceof CallQueueTooBigException; if (pause) { // Do backoff else we flood the Master with requests. 
- pauseTime = ConnectionUtils.getPauseTime(initPauseTime, tries); + pauseTime = ConnectionUtils.getPauseTime(this.retryPauseTime, tries); } else { - pauseTime = initPauseTime; // Reset. + pauseTime = this.retryPauseTime; // Reset. } LOG.info("Failed report transition " + TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + @@ -3916,4 +3920,13 @@ public class HRegionServer extends Thread implements public CompactedHFilesDischarger getCompactedHFilesDischarger() { return compactedFileDischarger; } + + /** + * Return pause time configured in {@link HConstants#HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME} + * @return pause time + */ + @InterfaceAudience.Private + public long getRetryPauseTime() { + return this.retryPauseTime; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RemoteProcedureResultReporter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RemoteProcedureResultReporter.java index 981f090534a..63e050a710a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RemoteProcedureResultReporter.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RemoteProcedureResultReporter.java @@ -41,9 +41,6 @@ class RemoteProcedureResultReporter extends Thread { private static final Logger LOG = LoggerFactory.getLogger(RemoteProcedureResultReporter.class); - // Time to pause if master says 'please hold'. Make configurable if needed. - private static final int INIT_PAUSE_TIME_MS = 1000; - private static final int MAX_BATCH = 100; private final HRegionServer server; @@ -98,9 +95,9 @@ class RemoteProcedureResultReporter extends Thread { long pauseTime; if (pause) { // Do backoff else we flood the Master with requests. - pauseTime = ConnectionUtils.getPauseTime(INIT_PAUSE_TIME_MS, tries); + pauseTime = ConnectionUtils.getPauseTime(server.getRetryPauseTime(), tries); } else { - pauseTime = INIT_PAUSE_TIME_MS; // Reset. + pauseTime = server.getRetryPauseTime(); // Reset. 
} LOG.info("Failed procedure report " + TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + (pause ? " after " + pauseTime + "ms delay (Master is coming online...)."