HBASE-25379 Make retry pause time configurable for regionserver short operation RPC (reportRegionStateTransition/reportProcedureDone) (#2757)

* HBASE-25379 Make retry pause time configurable for regionserver short operation RPC (reportRegionStateTransition/reportProcedureDone)
* HBASE-25379 RemoteProcedureResultReporter should also retry after the configured pause time
* Addressed the review comments

Signed-off-by: Yulin Niu <niuyulin@apache.org>
This commit is contained in:
Pankaj 2020-12-29 22:25:36 +05:30 committed by GitHub
parent d963342f8a
commit c96fbf0407
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 10 deletions

View File

@ -965,6 +965,17 @@ public final class HConstants {
*/ */
public static final int DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT = 10000; public static final int DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT = 10000;
/**
* Configuration key for the retry pause time used by short operation RPCs.
* NOTE(review): the value is presumably in milliseconds (the retry log message prints the
* derived pause with an "ms" suffix) - confirm.
*/
public static final String HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME =
"hbase.rpc.shortoperation.retry.pause.time";
/**
* Default value of {@link #HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME}.
*/
public static final long DEFAULT_HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME = 1000;
/** /**
* Value indicating the server name was saved with no sequence number. * Value indicating the server name was saved with no sequence number.
*/ */

View File

@ -435,6 +435,9 @@ public class HRegionServer extends Thread implements
private final int shortOperationTimeout; private final int shortOperationTimeout;
// Time to pause if master says 'please hold'
private final long retryPauseTime;
private final RegionServerAccounting regionServerAccounting; private final RegionServerAccounting regionServerAccounting;
private SlowLogTableOpsChore slowLogTableOpsChore = null; private SlowLogTableOpsChore slowLogTableOpsChore = null;
@ -615,6 +618,9 @@ public class HRegionServer extends Thread implements
this.shortOperationTimeout = conf.getInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, this.shortOperationTimeout = conf.getInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY,
HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT); HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT);
this.retryPauseTime = conf.getLong(HConstants.HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME,
HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME);
this.abortRequested = new AtomicBoolean(false); this.abortRequested = new AtomicBoolean(false);
this.stopped = false; this.stopped = false;
@ -2436,10 +2442,8 @@ public class HRegionServer extends Thread implements
final ReportRegionStateTransitionRequest request = final ReportRegionStateTransitionRequest request =
createReportRegionStateTransitionRequest(context); createReportRegionStateTransitionRequest(context);
// Time to pause if master says 'please hold'. Make configurable if needed.
final long initPauseTime = 1000;
int tries = 0; int tries = 0;
long pauseTime; long pauseTime = this.retryPauseTime;
// Keep looping till we get an error. We want to send reports even though server is going down. // Keep looping till we get an error. We want to send reports even though server is going down.
// Only go down if clusterConnection is null. It is set to null almost as last thing as the // Only go down if clusterConnection is null. It is set to null almost as last thing as the
// HRegionServer does down. // HRegionServer does down.
@ -2470,9 +2474,9 @@ public class HRegionServer extends Thread implements
|| ioe instanceof CallQueueTooBigException; || ioe instanceof CallQueueTooBigException;
if (pause) { if (pause) {
// Do backoff else we flood the Master with requests. // Do backoff else we flood the Master with requests.
pauseTime = ConnectionUtils.getPauseTime(initPauseTime, tries); pauseTime = ConnectionUtils.getPauseTime(this.retryPauseTime, tries);
} else { } else {
pauseTime = initPauseTime; // Reset. pauseTime = this.retryPauseTime; // Reset.
} }
LOG.info("Failed report transition " + LOG.info("Failed report transition " +
TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" +
@ -3938,4 +3942,13 @@ public class HRegionServer extends Thread implements
public CompactedHFilesDischarger getCompactedHFilesDischarger() { public CompactedHFilesDischarger getCompactedHFilesDischarger() {
return compactedFileDischarger; return compactedFileDischarger;
} }
/**
* Returns the retry pause time held in this server's final {@code retryPauseTime} field, which
* is read from {@link HConstants#HBASE_RPC_SHORTOPERATION_RETRY_PAUSE_TIME}.
* @return the configured retry pause time
*/
@InterfaceAudience.Private
public long getRetryPauseTime() {
return this.retryPauseTime;
}
} }

View File

@ -41,9 +41,6 @@ class RemoteProcedureResultReporter extends Thread {
private static final Logger LOG = LoggerFactory.getLogger(RemoteProcedureResultReporter.class); private static final Logger LOG = LoggerFactory.getLogger(RemoteProcedureResultReporter.class);
// Time to pause if master says 'please hold'. Make configurable if needed.
private static final int INIT_PAUSE_TIME_MS = 1000;
private static final int MAX_BATCH = 100; private static final int MAX_BATCH = 100;
private final HRegionServer server; private final HRegionServer server;
@ -98,9 +95,9 @@ class RemoteProcedureResultReporter extends Thread {
long pauseTime; long pauseTime;
if (pause) { if (pause) {
// Do backoff else we flood the Master with requests. // Do backoff else we flood the Master with requests.
pauseTime = ConnectionUtils.getPauseTime(INIT_PAUSE_TIME_MS, tries); pauseTime = ConnectionUtils.getPauseTime(server.getRetryPauseTime(), tries);
} else { } else {
pauseTime = INIT_PAUSE_TIME_MS; // Reset. pauseTime = server.getRetryPauseTime(); // Reset.
} }
LOG.info("Failed procedure report " + TextFormat.shortDebugString(request) + "; retry (#" + LOG.info("Failed procedure report " + TextFormat.shortDebugString(request) + "; retry (#" +
tries + ")" + (pause ? " after " + pauseTime + "ms delay (Master is coming online...)." tries + ")" + (pause ? " after " + pauseTime + "ms delay (Master is coming online...)."