HBASE-26807 Unify CallQueueTooBigException special pause with CallDroppedException (#4273)

Signed-off-by: Nick Dimiduk <ndimiduk@apache.org>
This commit is contained in:
Bryan Beaudreault 2022-04-22 08:56:52 -04:00 committed by GitHub
parent edbe44b0d1
commit 2240025349
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
41 changed files with 931 additions and 368 deletions

View File

@ -18,8 +18,6 @@
package org.apache.hadoop.hbase;
import java.io.IOException;
import org.apache.yetus.audience.InterfaceAudience;
/**
@ -28,14 +26,16 @@ import org.apache.yetus.audience.InterfaceAudience;
*/
@SuppressWarnings("serial")
@InterfaceAudience.Public
public class CallDroppedException extends IOException {
public class CallDroppedException extends HBaseServerException {
public CallDroppedException() {
super();
// For now all call drops are due to server being overloaded.
// We could decouple this if desired.
super(true);
}
// Absence of this constructor prevents proper unwrapping of
// remote exception on the client side
public CallDroppedException(String message) {
super(message);
super(true, message);
}
}

View File

@ -18,13 +18,15 @@
package org.apache.hadoop.hbase;
import java.io.IOException;
import org.apache.yetus.audience.InterfaceAudience;
/**
* Returned to clients when their request was dropped because the call queue was too big to
* accept a new call. Clients should retry upon receiving it.
*/
@SuppressWarnings("serial")
@InterfaceAudience.Public
public class CallQueueTooBigException extends IOException {
public class CallQueueTooBigException extends CallDroppedException {
public CallQueueTooBigException() {
super();
}

View File

@ -0,0 +1,72 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import org.apache.yetus.audience.InterfaceAudience;
/**
* Base class for exceptions thrown by an HBase server. May contain extra info about
* the state of the server when the exception was thrown.
*/
@InterfaceAudience.Public
public class HBaseServerException extends HBaseIOException {
private boolean serverOverloaded;
public HBaseServerException() {
this(false);
}
public HBaseServerException(String message) {
this(false, message);
}
public HBaseServerException(boolean serverOverloaded) {
this.serverOverloaded = serverOverloaded;
}
public HBaseServerException(boolean serverOverloaded, String message) {
super(message);
this.serverOverloaded = serverOverloaded;
}
/**
* @param t throwable to check for server overloaded state
* @return True if the server was considered overloaded when the exception was thrown
*/
public static boolean isServerOverloaded(Throwable t) {
if (t instanceof HBaseServerException) {
return ((HBaseServerException) t).isServerOverloaded();
}
return false;
}
/**
* Necessary for parsing RemoteException on client side
* @param serverOverloaded True if server was overloaded when exception was thrown
*/
public void setServerOverloaded(boolean serverOverloaded) {
this.serverOverloaded = serverOverloaded;
}
/**
* @return True if server was considered overloaded when exception was thrown
*/
public boolean isServerOverloaded() {
return serverOverloaded;
}
}

View File

@ -21,6 +21,7 @@ import static org.apache.hadoop.hbase.client.ConnectionUtils.retries2Attempts;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.yetus.audience.InterfaceAudience;
/**
@ -50,21 +51,42 @@ public interface AsyncAdminBuilder {
* Set the base pause time for retrying. We use an exponential policy to generate sleep time when
* retrying.
* @return this for invocation chaining
* @see #setRetryPauseForCQTBE(long, TimeUnit)
* @see #setRetryPauseForServerOverloaded(long, TimeUnit)
*/
AsyncAdminBuilder setRetryPause(long timeout, TimeUnit unit);
/**
* Set the base pause time for retrying when we hit {@code CallQueueTooBigException}. We use an
* exponential policy to generate sleep time when retrying.
* Set the base pause time for retrying when {@link HBaseServerException#isServerOverloaded()}.
* We use an exponential policy to generate sleep time from this base when retrying.
* <p/>
* This value should be greater than the normal pause value which could be set with the above
* {@link #setRetryPause(long, TimeUnit)} method, as usually {@code CallQueueTooBigException}
* means the server is overloaded. We just use the normal pause value for
* {@code CallQueueTooBigException} if here you specify a smaller value.
* {@link #setRetryPause(long, TimeUnit)} method, as usually
* {@link HBaseServerException#isServerOverloaded()} means the server is overloaded. We just use
* the normal pause value for {@link HBaseServerException#isServerOverloaded()} if here you
* specify a smaller value.
*
* @see #setRetryPause(long, TimeUnit)
* @deprecated Since 2.5.0, will be removed in 4.0.0. Please use
* {@link #setRetryPauseForServerOverloaded(long, TimeUnit)} instead.
*/
@Deprecated
default AsyncAdminBuilder setRetryPauseForCQTBE(long pause, TimeUnit unit) {
return setRetryPauseForServerOverloaded(pause, unit);
}
/**
* Set the base pause time for retrying when {@link HBaseServerException#isServerOverloaded()}.
* We use an exponential policy to generate sleep time when retrying.
* <p/>
* This value should be greater than the normal pause value which could be set with the above
* {@link #setRetryPause(long, TimeUnit)} method, as usually
* {@link HBaseServerException#isServerOverloaded()} means the server is overloaded. We just use
* the normal pause value for {@link HBaseServerException#isServerOverloaded()} if here you
* specify a smaller value.
*
* @see #setRetryPause(long, TimeUnit)
*/
AsyncAdminBuilder setRetryPauseForCQTBE(long pause, TimeUnit unit);
AsyncAdminBuilder setRetryPauseForServerOverloaded(long pause, TimeUnit unit);
/**
* Set the max retry times for an admin operation. Usually it is the max attempt times minus 1.

View File

@ -33,7 +33,7 @@ abstract class AsyncAdminBuilderBase implements AsyncAdminBuilder {
protected long pauseNs;
protected long pauseForCQTBENs;
protected long pauseNsForServerOverloaded;
protected int maxAttempts;
@ -43,7 +43,7 @@ abstract class AsyncAdminBuilderBase implements AsyncAdminBuilder {
this.rpcTimeoutNs = connConf.getRpcTimeoutNs();
this.operationTimeoutNs = connConf.getOperationTimeoutNs();
this.pauseNs = connConf.getPauseNs();
this.pauseForCQTBENs = connConf.getPauseForCQTBENs();
this.pauseNsForServerOverloaded = connConf.getPauseNsForServerOverloaded();
this.maxAttempts = connConf.getMaxRetries();
this.startLogErrorsCnt = connConf.getStartLogErrorsCnt();
}
@ -67,8 +67,8 @@ abstract class AsyncAdminBuilderBase implements AsyncAdminBuilder {
}
@Override
public AsyncAdminBuilder setRetryPauseForCQTBE(long timeout, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(timeout);
public AsyncAdminBuilder setRetryPauseForServerOverloaded(long timeout, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(timeout);
return this;
}

View File

@ -44,10 +44,10 @@ public class AsyncAdminRequestRetryingCaller<T> extends AsyncRpcRetryingCaller<T
private ServerName serverName;
public AsyncAdminRequestRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn, int priority,
long pauseNs, long pauseForCQTBENs, int maxAttempts, long operationTimeoutNs,
long pauseNs, long pauseNsForServerOverloaded, int maxAttempts, long operationTimeoutNs,
long rpcTimeoutNs, int startLogErrorsCnt, ServerName serverName, Callable<T> callable) {
super(retryTimer, conn, priority, pauseNs, pauseForCQTBENs, maxAttempts, operationTimeoutNs,
rpcTimeoutNs, startLogErrorsCnt);
super(retryTimer, conn, priority, pauseNs, pauseNsForServerOverloaded, maxAttempts,
operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
this.serverName = serverName;
this.callable = callable;
}

View File

@ -45,9 +45,9 @@ import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.CellScannable;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.RetryImmediatelyException;
@ -63,9 +63,7 @@ import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.io.netty.util.Timer;
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
import org.apache.hadoop.hbase.shaded.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
@ -104,7 +102,7 @@ class AsyncBatchRpcRetryingCaller<T> {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxAttempts;
@ -150,13 +148,14 @@ class AsyncBatchRpcRetryingCaller<T> {
}
public AsyncBatchRpcRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn,
TableName tableName, List<? extends Row> actions, long pauseNs, long pauseForCQTBENs,
int maxAttempts, long operationTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
TableName tableName, List<? extends Row> actions, long pauseNs,
long pauseNsForServerOverloaded, int maxAttempts, long operationTimeoutNs,
long rpcTimeoutNs, int startLogErrorsCnt) {
this.retryTimer = retryTimer;
this.conn = conn;
this.tableName = tableName;
this.pauseNs = pauseNs;
this.pauseForCQTBENs = pauseForCQTBENs;
this.pauseNsForServerOverloaded = pauseNsForServerOverloaded;
this.maxAttempts = maxAttempts;
this.operationTimeoutNs = operationTimeoutNs;
this.rpcTimeoutNs = rpcTimeoutNs;
@ -466,17 +465,17 @@ class AsyncBatchRpcRetryingCaller<T> {
.collect(Collectors.toList());
addError(copiedActions, error, serverName);
tryResubmit(copiedActions.stream(), tries, error instanceof RetryImmediatelyException,
error instanceof CallQueueTooBigException);
HBaseServerException.isServerOverloaded(error));
}
private void tryResubmit(Stream<Action> actions, int tries, boolean immediately,
boolean isCallQueueTooBig) {
boolean isServerOverloaded) {
if (immediately) {
groupAndSend(actions, tries);
return;
}
long delayNs;
long pauseNsToUse = isCallQueueTooBig ? pauseForCQTBENs : pauseNs;
long pauseNsToUse = isServerOverloaded ? pauseNsForServerOverloaded : pauseNs;
if (operationTimeoutNs > 0) {
long maxDelayNs = remainingTimeNs() - SLEEP_DELTA_NS;
if (maxDelayNs <= 0) {

View File

@ -78,7 +78,7 @@ class AsyncClientScanner {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxAttempts;
@ -93,7 +93,7 @@ class AsyncClientScanner {
private final Span span;
public AsyncClientScanner(Scan scan, AdvancedScanResultConsumer consumer, TableName tableName,
AsyncConnectionImpl conn, Timer retryTimer, long pauseNs, long pauseForCQTBENs,
AsyncConnectionImpl conn, Timer retryTimer, long pauseNs, long pauseNsForServerOverloaded,
int maxAttempts, long scanTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
if (scan.getStartRow() == null) {
scan.withStartRow(EMPTY_START_ROW, scan.includeStartRow());
@ -107,7 +107,7 @@ class AsyncClientScanner {
this.conn = conn;
this.retryTimer = retryTimer;
this.pauseNs = pauseNs;
this.pauseForCQTBENs = pauseForCQTBENs;
this.pauseNsForServerOverloaded = pauseNsForServerOverloaded;
this.maxAttempts = maxAttempts;
this.scanTimeoutNs = scanTimeoutNs;
this.rpcTimeoutNs = rpcTimeoutNs;
@ -198,7 +198,8 @@ class AsyncClientScanner {
.setScan(scan).metrics(scanMetrics).consumer(consumer).resultCache(resultCache)
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.scanTimeout(scanTimeoutNs, TimeUnit.NANOSECONDS).pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS).maxAttempts(maxAttempts)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts)
.startLogErrorsCnt(startLogErrorsCnt).start(resp.controller, resp.resp),
(hasMore, error) -> {
try (Scope ignored = span.makeCurrent()) {
@ -231,7 +232,8 @@ class AsyncClientScanner {
.row(scan.getStartRow()).replicaId(replicaId).locateType(getLocateType(scan))
.priority(scan.getPriority()).rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.operationTimeout(scanTimeoutNs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS).pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts).startLogErrorsCnt(startLogErrorsCnt)
.action(this::callOpenScanner).call();
}

View File

@ -17,46 +17,15 @@
*/
package org.apache.hadoop.hbase.client;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_PAUSE;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_SCANNER_CACHING;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_SCANNER_MAX_RESULT_SIZE;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_META_SCANNER_CACHING;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_META_REPLICA_SCAN_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_META_REPLICA_SCAN_TIMEOUT_DEFAULT;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_OPERATION_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_PAUSE;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_RETRIES_NUMBER;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_SCANNER_CACHING;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_SCANNER_MAX_RESULT_SIZE_KEY;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD;
import static org.apache.hadoop.hbase.HConstants.HBASE_META_SCANNER_CACHING;
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_READ_TIMEOUT_KEY;
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_WRITE_TIMEOUT_KEY;
import static org.apache.hadoop.hbase.client.AsyncProcess.DEFAULT_START_LOG_ERRORS_AFTER_COUNT;
import static org.apache.hadoop.hbase.client.AsyncProcess.START_LOG_ERRORS_AFTER_COUNT_KEY;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.MAX_KEYVALUE_SIZE_DEFAULT;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.PRIMARY_CALL_TIMEOUT_MICROSECOND;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.PRIMARY_CALL_TIMEOUT_MICROSECOND_DEFAULT;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.PRIMARY_SCAN_TIMEOUT_MICROSECOND;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.PRIMARY_SCAN_TIMEOUT_MICROSECOND_DEFAULT;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.WRITE_BUFFER_PERIODIC_FLUSH_TIMEOUT_MS;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.WRITE_BUFFER_PERIODIC_FLUSH_TIMEOUT_MS_DEFAULT;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.WRITE_BUFFER_SIZE_DEFAULT;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.WRITE_BUFFER_SIZE_KEY;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Timeout configs.
@ -64,7 +33,15 @@ import org.slf4j.LoggerFactory;
@InterfaceAudience.Private
class AsyncConnectionConfiguration {
private static final Logger LOG = LoggerFactory.getLogger(AsyncConnectionConfiguration.class);
/**
* Configure the number of failures after which the client will start logging. A few failures
* is fine: region moved, then is not opened, then is overloaded. We try to have an acceptable
* heuristic for the number of errors we don't log. 5 was chosen because we wait for 1s at
* this stage.
*/
public static final String START_LOG_ERRORS_AFTER_COUNT_KEY =
"hbase.client.start.log.errors.counter";
public static final int DEFAULT_START_LOG_ERRORS_AFTER_COUNT = 5;
private final long metaOperationTimeoutNs;
@ -84,7 +61,7 @@ class AsyncConnectionConfiguration {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxRetries;
@ -118,50 +95,45 @@ class AsyncConnectionConfiguration {
private final int maxKeyValueSize;
AsyncConnectionConfiguration(Configuration conf) {
ConnectionConfiguration connectionConf = new ConnectionConfiguration(conf);
// fields we can pull directly from connection configuration
this.scannerCaching = connectionConf.getScannerCaching();
this.scannerMaxResultSize = connectionConf.getScannerMaxResultSize();
this.writeBufferSize = connectionConf.getWriteBufferSize();
this.writeBufferPeriodicFlushTimeoutNs = connectionConf.getWriteBufferPeriodicFlushTimeoutMs();
this.maxKeyValueSize = connectionConf.getMaxKeyValueSize();
this.maxRetries = connectionConf.getRetriesNumber();
// fields from connection configuration that need to be converted to nanos
this.metaOperationTimeoutNs = TimeUnit.MILLISECONDS.toNanos(
conf.getLong(HBASE_CLIENT_META_OPERATION_TIMEOUT, DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT));
this.operationTimeoutNs = TimeUnit.MILLISECONDS.toNanos(
conf.getLong(HBASE_CLIENT_OPERATION_TIMEOUT, DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT));
long rpcTimeoutMs = conf.getLong(HBASE_RPC_TIMEOUT_KEY, DEFAULT_HBASE_RPC_TIMEOUT);
this.rpcTimeoutNs = TimeUnit.MILLISECONDS.toNanos(rpcTimeoutMs);
connectionConf.getMetaOperationTimeout());
this.operationTimeoutNs = TimeUnit.MILLISECONDS.toNanos(connectionConf.getOperationTimeout());
this.rpcTimeoutNs = TimeUnit.MILLISECONDS.toNanos(connectionConf.getRpcTimeout());
this.readRpcTimeoutNs =
TimeUnit.MILLISECONDS.toNanos(conf.getLong(HBASE_RPC_READ_TIMEOUT_KEY, rpcTimeoutMs));
TimeUnit.MILLISECONDS.toNanos(conf.getLong(HBASE_RPC_READ_TIMEOUT_KEY,
connectionConf.getReadRpcTimeout()));
this.writeRpcTimeoutNs =
TimeUnit.MILLISECONDS.toNanos(conf.getLong(HBASE_RPC_WRITE_TIMEOUT_KEY, rpcTimeoutMs));
long pauseMs = conf.getLong(HBASE_CLIENT_PAUSE, DEFAULT_HBASE_CLIENT_PAUSE);
long pauseForCQTBEMs = conf.getLong(HBASE_CLIENT_PAUSE_FOR_CQTBE, pauseMs);
if (pauseForCQTBEMs < pauseMs) {
LOG.warn(
"The {} setting: {} ms is less than the {} setting: {} ms, use the greater one instead",
HBASE_CLIENT_PAUSE_FOR_CQTBE, pauseForCQTBEMs, HBASE_CLIENT_PAUSE, pauseMs);
pauseForCQTBEMs = pauseMs;
}
this.pauseNs = TimeUnit.MILLISECONDS.toNanos(pauseMs);
this.pauseForCQTBENs = TimeUnit.MILLISECONDS.toNanos(pauseForCQTBEMs);
this.maxRetries = conf.getInt(HBASE_CLIENT_RETRIES_NUMBER, DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
this.startLogErrorsCnt =
conf.getInt(START_LOG_ERRORS_AFTER_COUNT_KEY, DEFAULT_START_LOG_ERRORS_AFTER_COUNT);
TimeUnit.MILLISECONDS.toNanos(conf.getLong(HBASE_RPC_WRITE_TIMEOUT_KEY,
connectionConf.getWriteRpcTimeout()));
this.pauseNs = TimeUnit.MILLISECONDS.toNanos(connectionConf.getPauseMillis());
this.pauseNsForServerOverloaded = TimeUnit.MILLISECONDS.toNanos(
connectionConf.getPauseMillisForServerOverloaded());
this.primaryCallTimeoutNs = TimeUnit.MICROSECONDS.toNanos(
connectionConf.getPrimaryCallTimeoutMicroSecond());
this.primaryScanTimeoutNs = TimeUnit.MICROSECONDS.toNanos(
connectionConf.getReplicaCallTimeoutMicroSecondScan());
this.primaryMetaScanTimeoutNs =
TimeUnit.MICROSECONDS.toNanos(connectionConf.getMetaReplicaCallTimeoutMicroSecondScan());
this.scanTimeoutNs = TimeUnit.MILLISECONDS.toNanos(
conf.getInt(HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD,
DEFAULT_HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD));
this.scannerCaching =
conf.getInt(HBASE_CLIENT_SCANNER_CACHING, DEFAULT_HBASE_CLIENT_SCANNER_CACHING);
// fields not in connection configuration
this.startLogErrorsCnt =
conf.getInt(START_LOG_ERRORS_AFTER_COUNT_KEY, DEFAULT_START_LOG_ERRORS_AFTER_COUNT);
this.metaScannerCaching =
conf.getInt(HBASE_META_SCANNER_CACHING, DEFAULT_HBASE_META_SCANNER_CACHING);
this.scannerMaxResultSize = conf.getLong(HBASE_CLIENT_SCANNER_MAX_RESULT_SIZE_KEY,
DEFAULT_HBASE_CLIENT_SCANNER_MAX_RESULT_SIZE);
this.writeBufferSize = conf.getLong(WRITE_BUFFER_SIZE_KEY, WRITE_BUFFER_SIZE_DEFAULT);
this.writeBufferPeriodicFlushTimeoutNs =
TimeUnit.MILLISECONDS.toNanos(conf.getLong(WRITE_BUFFER_PERIODIC_FLUSH_TIMEOUT_MS,
WRITE_BUFFER_PERIODIC_FLUSH_TIMEOUT_MS_DEFAULT));
this.primaryCallTimeoutNs = TimeUnit.MICROSECONDS.toNanos(
conf.getLong(PRIMARY_CALL_TIMEOUT_MICROSECOND, PRIMARY_CALL_TIMEOUT_MICROSECOND_DEFAULT));
this.primaryScanTimeoutNs = TimeUnit.MICROSECONDS.toNanos(
conf.getLong(PRIMARY_SCAN_TIMEOUT_MICROSECOND, PRIMARY_SCAN_TIMEOUT_MICROSECOND_DEFAULT));
this.primaryMetaScanTimeoutNs =
TimeUnit.MICROSECONDS.toNanos(conf.getLong(HBASE_CLIENT_META_REPLICA_SCAN_TIMEOUT,
HBASE_CLIENT_META_REPLICA_SCAN_TIMEOUT_DEFAULT));
this.maxKeyValueSize = conf.getInt(MAX_KEYVALUE_SIZE_KEY, MAX_KEYVALUE_SIZE_DEFAULT);
}
long getMetaOperationTimeoutNs() {
@ -188,8 +160,8 @@ class AsyncConnectionConfiguration {
return pauseNs;
}
long getPauseForCQTBENs() {
return pauseForCQTBENs;
long getPauseNsForServerOverloaded() {
return pauseNsForServerOverloaded;
}
int getMaxRetries() {

View File

@ -44,10 +44,10 @@ public class AsyncMasterRequestRpcRetryingCaller<T> extends AsyncRpcRetryingCall
private final Callable<T> callable;
public AsyncMasterRequestRpcRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn,
Callable<T> callable, int priority, long pauseNs, long pauseForCQTBENs, int maxRetries,
long operationTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
super(retryTimer, conn, priority, pauseNs, pauseForCQTBENs, maxRetries, operationTimeoutNs,
rpcTimeoutNs, startLogErrorsCnt);
Callable<T> callable, int priority, long pauseNs, long pauseNsForServerOverloaded,
int maxRetries, long operationTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
super(retryTimer, conn, priority, pauseNs, pauseNsForServerOverloaded, maxRetries,
operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
this.callable = callable;
}

View File

@ -134,14 +134,13 @@ class AsyncProcess {
final long id;
final ClusterConnection connection;
final ConnectionConfiguration connectionConfiguration;
private final RpcRetryingCallerFactory rpcCallerFactory;
final RpcControllerFactory rpcFactory;
// Start configuration settings.
final int startLogErrorsCnt;
final long pause;
final long pauseForCQTBE;// pause for CallQueueTooBigException, if specified
final int numTries;
long serverTrackerTimeout;
final long primaryCallTimeoutMicroseconds;
@ -156,30 +155,25 @@ class AsyncProcess {
public static final String LOG_DETAILS_PERIOD = "hbase.client.log.detail.period.ms";
private static final int DEFAULT_LOG_DETAILS_PERIOD = 10000;
private final int periodToLog;
AsyncProcess(ClusterConnection hc, Configuration conf,
RpcRetryingCallerFactory rpcCaller, RpcControllerFactory rpcFactory) {
this(hc, conf, rpcCaller, rpcFactory, hc.getConnectionConfiguration().getRetriesNumber());
}
AsyncProcess(ClusterConnection hc, Configuration conf,
RpcRetryingCallerFactory rpcCaller, RpcControllerFactory rpcFactory, int retriesNumber) {
if (hc == null) {
throw new IllegalArgumentException("ClusterConnection cannot be null.");
}
this.connection = hc;
this.connectionConfiguration = connection.getConnectionConfiguration();
this.id = COUNTER.incrementAndGet();
this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
long configuredPauseForCQTBE = conf.getLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, pause);
if (configuredPauseForCQTBE < pause) {
LOG.warn("The " + HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE + " setting: "
+ configuredPauseForCQTBE + " is smaller than " + HConstants.HBASE_CLIENT_PAUSE
+ ", will use " + pause + " instead.");
this.pauseForCQTBE = pause;
} else {
this.pauseForCQTBE = configuredPauseForCQTBE;
}
// how many times we could try in total, one more than retry number
this.numTries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER) + 1;
this.numTries = retriesNumber + 1;
this.primaryCallTimeoutMicroseconds = conf.getInt(PRIMARY_CALL_TIMEOUT_KEY, 10000);
this.startLogErrorsCnt =
conf.getInt(START_LOG_ERRORS_AFTER_COUNT_KEY, DEFAULT_START_LOG_ERRORS_AFTER_COUNT);
@ -193,7 +187,8 @@ class AsyncProcess {
// we will do more retries in aggregate, but the user will be none the wiser.
this.serverTrackerTimeout = 0L;
for (int i = 0; i < this.numTries; ++i) {
serverTrackerTimeout = serverTrackerTimeout + ConnectionUtils.getPauseTime(this.pause, i);
serverTrackerTimeout = serverTrackerTimeout
+ ConnectionUtils.getPauseTime(connectionConfiguration.getPauseMillis(), i);
}
this.rpcCallerFactory = rpcCaller;

View File

@ -36,8 +36,8 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.RegionLocations;
@ -738,11 +738,14 @@ class AsyncRequestFutureImpl<CResult> implements AsyncRequestFuture {
long backOffTime;
if (retryImmediately) {
backOffTime = 0;
} else if (throwable instanceof CallQueueTooBigException) {
// Give a special check on CQTBE, see #HBASE-17114
backOffTime = errorsByServer.calculateBackoffTime(oldServer, asyncProcess.pauseForCQTBE);
} else if (HBaseServerException.isServerOverloaded(throwable)) {
// Give a special check when encountering an exception indicating the server is overloaded.
// see #HBASE-17114 and HBASE-26807
backOffTime = errorsByServer.calculateBackoffTime(oldServer,
asyncProcess.connectionConfiguration.getPauseMillisForServerOverloaded());
} else {
backOffTime = errorsByServer.calculateBackoffTime(oldServer, asyncProcess.pause);
backOffTime = errorsByServer.calculateBackoffTime(oldServer,
asyncProcess.connectionConfiguration.getPauseMillis());
}
if (numAttempt > asyncProcess.startLogErrorsCnt) {
// We use this value to have some logs when we have multiple failures, but not too many

View File

@ -31,8 +31,8 @@ import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Supplier;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotEnabledException;
@ -44,7 +44,6 @@ import org.apache.hadoop.hbase.util.FutureUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.io.netty.util.Timer;
@InterfaceAudience.Private
@ -60,7 +59,7 @@ public abstract class AsyncRpcRetryingCaller<T> {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private int tries = 1;
@ -81,13 +80,13 @@ public abstract class AsyncRpcRetryingCaller<T> {
protected final HBaseRpcController controller;
public AsyncRpcRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn, int priority,
long pauseNs, long pauseForCQTBENs, int maxAttempts, long operationTimeoutNs,
long pauseNs, long pauseNsForServerOverloaded, int maxAttempts, long operationTimeoutNs,
long rpcTimeoutNs, int startLogErrorsCnt) {
this.retryTimer = retryTimer;
this.conn = conn;
this.priority = priority;
this.pauseNs = pauseNs;
this.pauseForCQTBENs = pauseForCQTBENs;
this.pauseNsForServerOverloaded = pauseNsForServerOverloaded;
this.maxAttempts = maxAttempts;
this.operationTimeoutNs = operationTimeoutNs;
this.rpcTimeoutNs = rpcTimeoutNs;
@ -127,7 +126,8 @@ public abstract class AsyncRpcRetryingCaller<T> {
}
private void tryScheduleRetry(Throwable error) {
long pauseNsToUse = error instanceof CallQueueTooBigException ? pauseForCQTBENs : pauseNs;
long pauseNsToUse = HBaseServerException.isServerOverloaded(error) ?
pauseNsForServerOverloaded : pauseNs;
long delayNs;
if (operationTimeoutNs > 0) {
long maxDelayNs = remainingTimeNs() - SLEEP_DELTA_NS;

View File

@ -58,7 +58,7 @@ class AsyncRpcRetryingCallerFactory {
protected long pauseNs = conn.connConf.getPauseNs();
protected long pauseForCQTBENs = conn.connConf.getPauseForCQTBENs();
protected long pauseNsForServerOverloaded = conn.connConf.getPauseNsForServerOverloaded();
protected int maxAttempts = retries2Attempts(conn.connConf.getMaxRetries());
@ -119,8 +119,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public SingleRequestCallerBuilder<T> pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public SingleRequestCallerBuilder<T> pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -156,8 +156,8 @@ class AsyncRpcRetryingCallerFactory {
public AsyncSingleRequestRpcRetryingCaller<T> build() {
preCheck();
return new AsyncSingleRequestRpcRetryingCaller<>(retryTimer, conn, tableName, row, replicaId,
locateType, callable, priority, pauseNs, pauseForCQTBENs, maxAttempts, operationTimeoutNs,
rpcTimeoutNs, startLogErrorsCnt);
locateType, callable, priority, pauseNs, pauseNsForServerOverloaded, maxAttempts,
operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
}
/**
@ -263,8 +263,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public ScanSingleRegionCallerBuilder pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public ScanSingleRegionCallerBuilder pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -292,8 +292,8 @@ class AsyncRpcRetryingCallerFactory {
preCheck();
return new AsyncScanSingleRegionRpcRetryingCaller(retryTimer, conn, scan, scanMetrics,
scannerId, resultCache, consumer, stub, loc, isRegionServerRemote, priority,
scannerLeaseTimeoutPeriodNs, pauseNs, pauseForCQTBENs, maxAttempts, scanTimeoutNs,
rpcTimeoutNs, startLogErrorsCnt);
scannerLeaseTimeoutPeriodNs, pauseNs, pauseNsForServerOverloaded, maxAttempts,
scanTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
}
/**
@ -347,8 +347,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public BatchCallerBuilder pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public BatchCallerBuilder pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -364,7 +364,8 @@ class AsyncRpcRetryingCallerFactory {
public <T> AsyncBatchRpcRetryingCaller<T> build() {
return new AsyncBatchRpcRetryingCaller<>(retryTimer, conn, tableName, actions, pauseNs,
pauseForCQTBENs, maxAttempts, operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
pauseNsForServerOverloaded, maxAttempts, operationTimeoutNs, rpcTimeoutNs,
startLogErrorsCnt);
}
public <T> List<CompletableFuture<T>> call() {
@ -406,8 +407,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public MasterRequestCallerBuilder<T> pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public MasterRequestCallerBuilder<T> pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -438,7 +439,8 @@ class AsyncRpcRetryingCallerFactory {
public AsyncMasterRequestRpcRetryingCaller<T> build() {
preCheck();
return new AsyncMasterRequestRpcRetryingCaller<T>(retryTimer, conn, callable, priority,
pauseNs, pauseForCQTBENs, maxAttempts, operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
pauseNs, pauseNsForServerOverloaded, maxAttempts, operationTimeoutNs, rpcTimeoutNs,
startLogErrorsCnt);
}
/**
@ -487,8 +489,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public AdminRequestCallerBuilder<T> pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public AdminRequestCallerBuilder<T> pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -514,8 +516,9 @@ class AsyncRpcRetryingCallerFactory {
public AsyncAdminRequestRetryingCaller<T> build() {
return new AsyncAdminRequestRetryingCaller<T>(retryTimer, conn, priority, pauseNs,
pauseForCQTBENs, maxAttempts, operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt,
checkNotNull(serverName, "serverName is null"), checkNotNull(callable, "action is null"));
pauseNsForServerOverloaded, maxAttempts, operationTimeoutNs, rpcTimeoutNs,
startLogErrorsCnt, checkNotNull(serverName, "serverName is null"),
checkNotNull(callable, "action is null"));
}
public CompletableFuture<T> call() {
@ -558,8 +561,8 @@ class AsyncRpcRetryingCallerFactory {
return this;
}
public ServerRequestCallerBuilder<T> pauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public ServerRequestCallerBuilder<T> pauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}
@ -579,7 +582,8 @@ class AsyncRpcRetryingCallerFactory {
}
public AsyncServerRequestRpcRetryingCaller<T> build() {
return new AsyncServerRequestRpcRetryingCaller<T>(retryTimer, conn, pauseNs, pauseForCQTBENs,
return new AsyncServerRequestRpcRetryingCaller<T>(retryTimer, conn, pauseNs,
pauseNsForServerOverloaded,
maxAttempts, operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt,
checkNotNull(serverName, "serverName is null"), checkNotNull(callable, "action is null"));
}

View File

@ -36,8 +36,8 @@ import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.NotServingRegionException;
@ -101,7 +101,7 @@ class AsyncScanSingleRegionRpcRetryingCaller {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxAttempts;
@ -310,7 +310,7 @@ class AsyncScanSingleRegionRpcRetryingCaller {
Scan scan, ScanMetrics scanMetrics, long scannerId, ScanResultCache resultCache,
AdvancedScanResultConsumer consumer, Interface stub, HRegionLocation loc,
boolean isRegionServerRemote, int priority, long scannerLeaseTimeoutPeriodNs, long pauseNs,
long pauseForCQTBENs, int maxAttempts, long scanTimeoutNs, long rpcTimeoutNs,
long pauseNsForServerOverloaded, int maxAttempts, long scanTimeoutNs, long rpcTimeoutNs,
int startLogErrorsCnt) {
this.retryTimer = retryTimer;
this.scan = scan;
@ -323,7 +323,7 @@ class AsyncScanSingleRegionRpcRetryingCaller {
this.regionServerRemote = isRegionServerRemote;
this.scannerLeaseTimeoutPeriodNs = scannerLeaseTimeoutPeriodNs;
this.pauseNs = pauseNs;
this.pauseForCQTBENs = pauseForCQTBENs;
this.pauseNsForServerOverloaded = pauseNsForServerOverloaded;
this.maxAttempts = maxAttempts;
this.scanTimeoutNs = scanTimeoutNs;
this.rpcTimeoutNs = rpcTimeoutNs;
@ -413,7 +413,8 @@ class AsyncScanSingleRegionRpcRetryingCaller {
return;
}
long delayNs;
long pauseNsToUse = error instanceof CallQueueTooBigException ? pauseForCQTBENs : pauseNs;
long pauseNsToUse = HBaseServerException.isServerOverloaded(error) ?
pauseNsForServerOverloaded : pauseNs;
if (scanTimeoutNs > 0) {
long maxDelayNs = remainingTimeNs() - SLEEP_DELTA_NS;
if (maxDelayNs <= 0) {

View File

@ -46,10 +46,10 @@ public class AsyncServerRequestRpcRetryingCaller<T> extends AsyncRpcRetryingCall
private ServerName serverName;
public AsyncServerRequestRpcRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn,
long pauseNs, long pauseForCQTBENs, int maxAttempts, long operationTimeoutNs,
long pauseNs, long pauseNsForServerOverloaded, int maxAttempts, long operationTimeoutNs,
long rpcTimeoutNs, int startLogErrorsCnt, ServerName serverName, Callable<T> callable) {
super(retryTimer, conn, HConstants.NORMAL_QOS, pauseNs, pauseForCQTBENs, maxAttempts,
operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
super(retryTimer, conn, HConstants.NORMAL_QOS, pauseNs, pauseNsForServerOverloaded,
maxAttempts, operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
this.serverName = serverName;
this.callable = callable;
}

View File

@ -56,10 +56,10 @@ class AsyncSingleRequestRpcRetryingCaller<T> extends AsyncRpcRetryingCaller<T> {
public AsyncSingleRequestRpcRetryingCaller(Timer retryTimer, AsyncConnectionImpl conn,
TableName tableName, byte[] row, int replicaId, RegionLocateType locateType,
Callable<T> callable, int priority, long pauseNs, long pauseForCQTBENs, int maxAttempts,
long operationTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
super(retryTimer, conn, priority, pauseNs, pauseForCQTBENs, maxAttempts, operationTimeoutNs,
rpcTimeoutNs, startLogErrorsCnt);
Callable<T> callable, int priority, long pauseNs, long pauseNsForServerOverloaded,
int maxAttempts, long operationTimeoutNs, long rpcTimeoutNs, int startLogErrorsCnt) {
super(retryTimer, conn, priority, pauseNs, pauseNsForServerOverloaded, maxAttempts,
operationTimeoutNs, rpcTimeoutNs, startLogErrorsCnt);
this.tableName = tableName;
this.row = row;
this.replicaId = replicaId;

View File

@ -21,6 +21,7 @@ import static org.apache.hadoop.hbase.client.ConnectionUtils.retries2Attempts;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.yetus.audience.InterfaceAudience;
/**
@ -76,21 +77,42 @@ public interface AsyncTableBuilder<C extends ScanResultConsumerBase> {
/**
* Set the base pause time for retrying. We use an exponential policy to generate sleep time when
* retrying.
* @see #setRetryPauseForCQTBE(long, TimeUnit)
* @see #setRetryPauseForServerOverloaded(long, TimeUnit)
*/
AsyncTableBuilder<C> setRetryPause(long pause, TimeUnit unit);
/**
* Set the base pause time for retrying when we hit {@code CallQueueTooBigException}. We use an
* exponential policy to generate sleep time when retrying.
* Set the base pause time for retrying when {@link HBaseServerException#isServerOverloaded()}.
* We use an exponential policy to generate sleep time when retrying.
* <p/>
* This value should be greater than the normal pause value which could be set with the above
* {@link #setRetryPause(long, TimeUnit)} method, as usually {@code CallQueueTooBigException}
* means the server is overloaded. We just use the normal pause value for
* {@code CallQueueTooBigException} if here you specify a smaller value.
* {@link #setRetryPause(long, TimeUnit)} method, as usually
* {@link HBaseServerException#isServerOverloaded()} means the server is overloaded. We just use
* the normal pause value for {@link HBaseServerException#isServerOverloaded()} if here you
* specify a smaller value.
*
* @see #setRetryPause(long, TimeUnit)
* @deprecated Since 2.5.0, will be removed in 4.0.0. Please use
* {@link #setRetryPauseForServerOverloaded(long, TimeUnit)} instead.
*/
@Deprecated
default AsyncTableBuilder<C> setRetryPauseForCQTBE(long pause, TimeUnit unit) {
return setRetryPauseForServerOverloaded(pause, unit);
}
/**
* Set the base pause time for retrying when {@link HBaseServerException#isServerOverloaded()}.
* We use an exponential policy to generate sleep time when retrying.
* <p/>
* This value should be greater than the normal pause value which could be set with the above
* {@link #setRetryPause(long, TimeUnit)} method, as usually
* {@link HBaseServerException#isServerOverloaded()} means the server is overloaded. We just use
* the normal pause value for {@link HBaseServerException#isServerOverloaded()} if here you
* specify a smaller value.
*
* @see #setRetryPause(long, TimeUnit)
*/
AsyncTableBuilder<C> setRetryPauseForCQTBE(long pause, TimeUnit unit);
AsyncTableBuilder<C> setRetryPauseForServerOverloaded(long pause, TimeUnit unit);
/**
* Set the max retry times for an operation. Usually it is the max attempt times minus 1.

View File

@ -45,7 +45,7 @@ abstract class AsyncTableBuilderBase<C extends ScanResultConsumerBase>
protected long pauseNs;
protected long pauseForCQTBENs;
protected long pauseNsForServerOverloaded;
protected int maxAttempts;
@ -60,7 +60,7 @@ abstract class AsyncTableBuilderBase<C extends ScanResultConsumerBase>
this.readRpcTimeoutNs = connConf.getReadRpcTimeoutNs();
this.writeRpcTimeoutNs = connConf.getWriteRpcTimeoutNs();
this.pauseNs = connConf.getPauseNs();
this.pauseForCQTBENs = connConf.getPauseForCQTBENs();
this.pauseNsForServerOverloaded = connConf.getPauseNsForServerOverloaded();
this.maxAttempts = retries2Attempts(connConf.getMaxRetries());
this.startLogErrorsCnt = connConf.getStartLogErrorsCnt();
}
@ -102,8 +102,8 @@ abstract class AsyncTableBuilderBase<C extends ScanResultConsumerBase>
}
@Override
public AsyncTableBuilderBase<C> setRetryPauseForCQTBE(long pause, TimeUnit unit) {
this.pauseForCQTBENs = unit.toNanos(pause);
public AsyncTableBuilderBase<C> setRetryPauseForServerOverloaded(long pause, TimeUnit unit) {
this.pauseNsForServerOverloaded = unit.toNanos(pause);
return this;
}

View File

@ -11,9 +11,13 @@
package org.apache.hadoop.hbase.client;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_CLIENT_PAUSE;
import static org.apache.hadoop.hbase.HConstants.HBASE_CLIENT_PAUSE;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Configuration parameters for the connection.
@ -25,6 +29,7 @@ import org.apache.yetus.audience.InterfaceAudience;
*/
@InterfaceAudience.Private
public class ConnectionConfiguration {
private static final Logger LOG = LoggerFactory.getLogger(ConnectionConfiguration.class);
public static final String WRITE_BUFFER_SIZE_KEY = "hbase.client.write.buffer";
public static final long WRITE_BUFFER_SIZE_DEFAULT = 2097152;
@ -43,6 +48,24 @@ public class ConnectionConfiguration {
"hbase.client.replicaCallTimeout.scan";
public static final int PRIMARY_SCAN_TIMEOUT_MICROSECOND_DEFAULT = 1000000; // 1s
/**
* Parameter name for client pause when server is overloaded, denoted by an exception
* where {@link org.apache.hadoop.hbase.HBaseServerException#isServerOverloaded(Throwable)}
* is true.
*/
public static final String HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED =
"hbase.client.pause.server.overloaded";
static {
// This is added where the configs are referenced. It may be too late to happen before
// any user _sets_ the old cqtbe config onto a Configuration option. So we still need
// to handle checking both properties in parsing below. The benefit of calling this is
// that it should still cause Configuration to log a warning if we do end up falling
// through to the old deprecated config.
Configuration.addDeprecation(
HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED);
}
private final long writeBufferSize;
private final long writeBufferPeriodicFlushTimeoutMs;
private final long writeBufferPeriodicFlushTimerTickMs;
@ -60,6 +83,8 @@ public class ConnectionConfiguration {
private final int writeRpcTimeout;
// toggle for async/sync prefetch
private final boolean clientScannerAsyncPrefetch;
private final long pauseMs;
private final long pauseMsForServerOverloaded;
/**
* Constructor
@ -115,6 +140,21 @@ public class ConnectionConfiguration {
this.writeRpcTimeout = conf.getInt(HConstants.HBASE_RPC_WRITE_TIMEOUT_KEY,
conf.getInt(HConstants.HBASE_RPC_TIMEOUT_KEY, HConstants.DEFAULT_HBASE_RPC_TIMEOUT));
long pauseMs = conf.getLong(HBASE_CLIENT_PAUSE, DEFAULT_HBASE_CLIENT_PAUSE);
long pauseMsForServerOverloaded = conf.getLong(HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED,
conf.getLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, pauseMs));
if (pauseMsForServerOverloaded < pauseMs) {
LOG.warn(
"The {} setting: {} ms is less than the {} setting: {} ms, use the greater one instead",
HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED, pauseMsForServerOverloaded,
HBASE_CLIENT_PAUSE, pauseMs);
pauseMsForServerOverloaded = pauseMs;
}
this.pauseMs = pauseMs;
this.pauseMsForServerOverloaded = pauseMsForServerOverloaded;
}
/**
@ -140,6 +180,8 @@ public class ConnectionConfiguration {
this.readRpcTimeout = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
this.writeRpcTimeout = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
this.rpcTimeout = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
this.pauseMs = DEFAULT_HBASE_CLIENT_PAUSE;
this.pauseMsForServerOverloaded = DEFAULT_HBASE_CLIENT_PAUSE;
}
public int getReadRpcTimeout() {
@ -205,4 +247,12 @@ public class ConnectionConfiguration {
public int getRpcTimeout() {
return rpcTimeout;
}
public long getPauseMillis() {
return pauseMs;
}
public long getPauseMillisForServerOverloaded() {
return pauseMsForServerOverloaded;
}
}

View File

@ -54,9 +54,9 @@ import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.AuthUtil;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.ChoreService;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.MasterNotRunningException;
@ -176,8 +176,6 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
public static final String RETRIES_BY_SERVER_KEY = "hbase.client.retries.by.server";
private static final Logger LOG = LoggerFactory.getLogger(ConnectionImplementation.class);
private final long pause;
private final long pauseForCQTBE;// pause for CallQueueTooBigException, if specified
// The mode tells if HedgedRead, LoadBalance mode is supported.
// The default mode is CatalogReplicaMode.None.
private CatalogReplicaMode metaReplicaMode;
@ -275,16 +273,6 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
this.batchPool = (ThreadPoolExecutor) pool;
this.connectionConfig = new ConnectionConfiguration(conf);
this.closed = false;
this.pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
long configuredPauseForCQTBE = conf.getLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, pause);
if (configuredPauseForCQTBE < pause) {
LOG.warn("The " + HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE + " setting: "
+ configuredPauseForCQTBE + " is smaller than " + HConstants.HBASE_CLIENT_PAUSE
+ ", will use " + pause + " instead.");
this.pauseForCQTBE = pause;
} else {
this.pauseForCQTBE = configuredPauseForCQTBE;
}
this.metaReplicaCallTimeoutScanInMicroSecond =
connectionConfig.getMetaReplicaCallTimeoutMicroSecondScan();
@ -954,7 +942,7 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
metaCache.clearCache(tableName, row, replicaId);
}
// Query the meta region
long pauseBase = this.pause;
long pauseBase = connectionConfig.getPauseMillis();
takeUserRegionLock();
try {
// We don't need to check if useCache is enabled or not. Even if useCache is false
@ -1044,9 +1032,10 @@ public class ConnectionImplementation implements ClusterConnection, Closeable {
if (e instanceof RemoteException) {
e = ((RemoteException) e).unwrapRemoteException();
}
if (e instanceof CallQueueTooBigException) {
// Give a special check on CallQueueTooBigException, see #HBASE-17114
pauseBase = this.pauseForCQTBE;
if (HBaseServerException.isServerOverloaded(e)) {
// Give a special pause when encountering an exception indicating the server
// is overloaded. see #HBASE-17114 and HBASE-26807
pauseBase = connectionConfig.getPauseMillisForServerOverloaded();
}
if (tries < maxAttempts - 1) {
LOG.debug(

View File

@ -35,7 +35,6 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
@ -77,7 +76,7 @@ public class HTableMultiplexer {
private final Map<HRegionLocation, FlushWorker> serverToFlushWorkerMap =
new ConcurrentHashMap<>();
private final Configuration workerConf;
private final Configuration conf;
private final ClusterConnection conn;
private final ExecutorService pool;
private final int maxAttempts;
@ -116,11 +115,7 @@ public class HTableMultiplexer {
this.executor =
Executors.newScheduledThreadPool(initThreads,
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("HTableFlushWorker-%d").build());
this.workerConf = HBaseConfiguration.create(conf);
// We do not do the retry because we need to reassign puts to different queues if regions are
// moved.
this.workerConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
this.conf = conf;
}
/**
@ -245,7 +240,7 @@ public class HTableMultiplexer {
worker = serverToFlushWorkerMap.get(addr);
if (worker == null) {
// Create the flush worker
worker = new FlushWorker(workerConf, this.conn, addr, this,
worker = new FlushWorker(conf, this.conn, addr, this,
perRegionServerBufferQueueSize, pool, executor);
this.serverToFlushWorkerMap.put(addr, worker);
executor.scheduleAtFixedRate(worker, flushPeriod, flushPeriod, TimeUnit.MILLISECONDS);
@ -454,7 +449,9 @@ public class HTableMultiplexer {
HConstants.DEFAULT_HBASE_RPC_TIMEOUT));
this.operationTimeout = conf.getInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
HConstants.DEFAULT_HBASE_CLIENT_OPERATION_TIMEOUT);
this.ap = new AsyncProcess(conn, conf, rpcCallerFactory, rpcControllerFactory);
// Specify 0 retries in AsyncProcess because we need to reassign puts to different queues
// if regions are moved.
this.ap = new AsyncProcess(conn, conf, rpcCallerFactory, rpcControllerFactory, 0);
this.executor = executor;
this.maxRetryInQueue = conf.getInt(TABLE_MULTIPLEXER_MAX_RETRIES_IN_QUEUE, 10000);
this.pool = pool;

View File

@ -333,7 +333,7 @@ class RawAsyncHBaseAdmin implements AsyncAdmin {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxAttempts;
@ -349,15 +349,15 @@ class RawAsyncHBaseAdmin implements AsyncAdmin {
this.rpcTimeoutNs = builder.rpcTimeoutNs;
this.operationTimeoutNs = builder.operationTimeoutNs;
this.pauseNs = builder.pauseNs;
if (builder.pauseForCQTBENs < builder.pauseNs) {
if (builder.pauseNsForServerOverloaded < builder.pauseNs) {
LOG.warn(
"Configured value of pauseForCQTBENs is {} ms, which is less than" +
"Configured value of pauseNsForServerOverloaded is {} ms, which is less than" +
" the normal pause value {} ms, use the greater one instead",
TimeUnit.NANOSECONDS.toMillis(builder.pauseForCQTBENs),
TimeUnit.NANOSECONDS.toMillis(builder.pauseNsForServerOverloaded),
TimeUnit.NANOSECONDS.toMillis(builder.pauseNs));
this.pauseForCQTBENs = builder.pauseNs;
this.pauseNsForServerOverloaded = builder.pauseNs;
} else {
this.pauseForCQTBENs = builder.pauseForCQTBENs;
this.pauseNsForServerOverloaded = builder.pauseNsForServerOverloaded;
}
this.maxAttempts = builder.maxAttempts;
this.startLogErrorsCnt = builder.startLogErrorsCnt;
@ -368,7 +368,8 @@ class RawAsyncHBaseAdmin implements AsyncAdmin {
return this.connection.callerFactory.<T> masterRequest()
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.operationTimeout(operationTimeoutNs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS).pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts).startLogErrorsCnt(startLogErrorsCnt);
}
@ -376,7 +377,8 @@ class RawAsyncHBaseAdmin implements AsyncAdmin {
return this.connection.callerFactory.<T> adminRequest()
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.operationTimeout(operationTimeoutNs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS).pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts).startLogErrorsCnt(startLogErrorsCnt);
}
@ -3486,7 +3488,8 @@ class RawAsyncHBaseAdmin implements AsyncAdmin {
return this.connection.callerFactory.<T> serverRequest()
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.operationTimeout(operationTimeoutNs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS).pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts).startLogErrorsCnt(startLogErrorsCnt);
}

View File

@ -26,7 +26,6 @@ import static org.apache.hadoop.hbase.client.ConnectionUtils.validatePutsInRowMu
import static org.apache.hadoop.hbase.trace.TraceUtil.tracedFuture;
import static org.apache.hadoop.hbase.trace.TraceUtil.tracedFutures;
import static org.apache.hadoop.hbase.util.FutureUtils.addListener;
import com.google.protobuf.RpcChannel;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
@ -59,11 +58,9 @@ import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.protobuf.RpcCallback;
import org.apache.hbase.thirdparty.io.netty.util.Timer;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
import org.apache.hadoop.hbase.shaded.protobuf.ResponseConverter;
@ -112,7 +109,7 @@ class RawAsyncTableImpl implements AsyncTable<AdvancedScanResultConsumer> {
private final long pauseNs;
private final long pauseForCQTBENs;
private final long pauseNsForServerOverloaded;
private final int maxAttempts;
@ -128,15 +125,15 @@ class RawAsyncTableImpl implements AsyncTable<AdvancedScanResultConsumer> {
this.operationTimeoutNs = builder.operationTimeoutNs;
this.scanTimeoutNs = builder.scanTimeoutNs;
this.pauseNs = builder.pauseNs;
if (builder.pauseForCQTBENs < builder.pauseNs) {
if (builder.pauseNsForServerOverloaded < builder.pauseNs) {
LOG.warn(
"Configured value of pauseForCQTBENs is {} ms, which is less than"
+ " the normal pause value {} ms, use the greater one instead",
TimeUnit.NANOSECONDS.toMillis(builder.pauseForCQTBENs),
"Configured value of pauseNsForServerOverloaded is {} ms, which is less than" +
" the normal pause value {} ms, use the greater one instead",
TimeUnit.NANOSECONDS.toMillis(builder.pauseNsForServerOverloaded),
TimeUnit.NANOSECONDS.toMillis(builder.pauseNs));
this.pauseForCQTBENs = builder.pauseNs;
this.pauseNsForServerOverloaded = builder.pauseNs;
} else {
this.pauseForCQTBENs = builder.pauseForCQTBENs;
this.pauseNsForServerOverloaded = builder.pauseNsForServerOverloaded;
}
this.maxAttempts = builder.maxAttempts;
this.startLogErrorsCnt = builder.startLogErrorsCnt;
@ -242,10 +239,11 @@ class RawAsyncTableImpl implements AsyncTable<AdvancedScanResultConsumer> {
}
private <T> SingleRequestCallerBuilder<T> newCaller(byte[] row, int priority, long rpcTimeoutNs) {
return conn.callerFactory.<T> single().table(tableName).row(row).priority(priority)
return conn.callerFactory.<T>single().table(tableName).row(row).priority(priority)
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS)
.operationTimeout(operationTimeoutNs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS).pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS)
.pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.maxAttempts(maxAttempts).startLogErrorsCnt(startLogErrorsCnt);
}
@ -648,8 +646,8 @@ class RawAsyncTableImpl implements AsyncTable<AdvancedScanResultConsumer> {
@Override
public void scan(Scan scan, AdvancedScanResultConsumer consumer) {
new AsyncClientScanner(setDefaultScanConfig(scan), consumer, tableName, conn, retryTimer,
pauseNs, pauseForCQTBENs, maxAttempts, scanTimeoutNs, readRpcTimeoutNs, startLogErrorsCnt)
.start();
pauseNs, pauseNsForServerOverloaded, maxAttempts, scanTimeoutNs, readRpcTimeoutNs,
startLogErrorsCnt).start();
}
private long resultSize2CacheSize(long maxResultSize) {
@ -744,7 +742,7 @@ class RawAsyncTableImpl implements AsyncTable<AdvancedScanResultConsumer> {
return conn.callerFactory.batch().table(tableName).actions(actions)
.operationTimeout(operationTimeoutNs, TimeUnit.NANOSECONDS)
.rpcTimeout(rpcTimeoutNs, TimeUnit.NANOSECONDS).pause(pauseNs, TimeUnit.NANOSECONDS)
.pauseForCQTBE(pauseForCQTBENs, TimeUnit.NANOSECONDS).maxAttempts(maxAttempts)
.pauseForServerOverloaded(pauseNsForServerOverloaded, TimeUnit.NANOSECONDS)
.startLogErrorsCnt(startLogErrorsCnt).call();
}

View File

@ -21,8 +21,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Factory to create an {@link RpcRetryingCaller}
@ -32,12 +30,8 @@ public class RpcRetryingCallerFactory {
/** Configuration key for a custom {@link RpcRetryingCaller} */
public static final String CUSTOM_CALLER_CONF_KEY = "hbase.rpc.callerfactory.class";
private static final Logger LOG = LoggerFactory.getLogger(RpcRetryingCallerFactory.class);
protected final Configuration conf;
private final long pause;
private final long pauseForCQTBE;// pause for CallQueueTooBigException, if specified
private final int retries;
private final int rpcTimeout;
private final ConnectionConfiguration connectionConf;
private final RetryingCallerInterceptor interceptor;
private final int startLogErrorsCnt;
/* These below data members are UNUSED!!!*/
@ -50,25 +44,12 @@ public class RpcRetryingCallerFactory {
public RpcRetryingCallerFactory(Configuration conf, RetryingCallerInterceptor interceptor) {
this.conf = conf;
pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
long configuredPauseForCQTBE = conf.getLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, pause);
if (configuredPauseForCQTBE < pause) {
LOG.warn("The " + HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE + " setting: "
+ configuredPauseForCQTBE + " is smaller than " + HConstants.HBASE_CLIENT_PAUSE
+ ", will use " + pause + " instead.");
this.pauseForCQTBE = pause;
} else {
this.pauseForCQTBE = configuredPauseForCQTBE;
}
retries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
this.connectionConf = new ConnectionConfiguration(conf);
startLogErrorsCnt = conf.getInt(AsyncProcess.START_LOG_ERRORS_AFTER_COUNT_KEY,
AsyncProcess.DEFAULT_START_LOG_ERRORS_AFTER_COUNT);
this.interceptor = interceptor;
enableBackPressure = conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
rpcTimeout = conf.getInt(HConstants.HBASE_RPC_TIMEOUT_KEY,HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
}
/**
@ -84,9 +65,11 @@ public class RpcRetryingCallerFactory {
public <T> RpcRetryingCaller<T> newCaller(int rpcTimeout) {
// We store the values in the factory instance. This way, constructing new objects
// is cheap as it does not require parsing a complex structure.
RpcRetryingCaller<T> caller = new RpcRetryingCallerImpl<>(pause, pauseForCQTBE, retries,
return new RpcRetryingCallerImpl<>(
connectionConf.getPauseMillis(),
connectionConf.getPauseMillisForServerOverloaded(),
connectionConf.getRetriesNumber(),
interceptor, startLogErrorsCnt, rpcTimeout);
return caller;
}
/**
@ -95,9 +78,12 @@ public class RpcRetryingCallerFactory {
public <T> RpcRetryingCaller<T> newCaller() {
// We store the values in the factory instance. This way, constructing new objects
// is cheap as it does not require parsing a complex structure.
RpcRetryingCaller<T> caller = new RpcRetryingCallerImpl<>(pause, pauseForCQTBE, retries,
interceptor, startLogErrorsCnt, rpcTimeout);
return caller;
return new RpcRetryingCallerImpl<>(
connectionConf.getPauseMillis(),
connectionConf.getPauseMillisForServerOverloaded(),
connectionConf.getRetriesNumber(),
interceptor, startLogErrorsCnt,
connectionConf.getRpcTimeout());
}
public static RpcRetryingCallerFactory instantiate(Configuration configuration) {

View File

@ -30,8 +30,8 @@ import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.ExceptionUtil;
@ -60,7 +60,7 @@ public class RpcRetryingCallerImpl<T> implements RpcRetryingCaller<T> {
private final int startLogErrorsCnt;
private final long pause;
private final long pauseForCQTBE;
private final long pauseForServerOverloaded;
private final int maxAttempts;// how many times to try
private final int rpcTimeout;// timeout for each rpc request
private final AtomicBoolean cancelled = new AtomicBoolean(false);
@ -68,15 +68,16 @@ public class RpcRetryingCallerImpl<T> implements RpcRetryingCaller<T> {
private final RetryingCallerInterceptorContext context;
private final RetryingTimeTracker tracker;
public RpcRetryingCallerImpl(long pause, long pauseForCQTBE, int retries, int startLogErrorsCnt) {
this(pause, pauseForCQTBE, retries, RetryingCallerInterceptorFactory.NO_OP_INTERCEPTOR,
startLogErrorsCnt, 0);
public RpcRetryingCallerImpl(long pause, long pauseForServerOverloaded, int retries,
int startLogErrorsCnt) {
this(pause, pauseForServerOverloaded, retries,
RetryingCallerInterceptorFactory.NO_OP_INTERCEPTOR, startLogErrorsCnt, 0);
}
public RpcRetryingCallerImpl(long pause, long pauseForCQTBE, int retries,
public RpcRetryingCallerImpl(long pause, long pauseForServerOverloaded, int retries,
RetryingCallerInterceptor interceptor, int startLogErrorsCnt, int rpcTimeout) {
this.pause = pause;
this.pauseForCQTBE = pauseForCQTBE;
this.pauseForServerOverloaded = pauseForServerOverloaded;
this.maxAttempts = retries2Attempts(retries);
this.interceptor = interceptor;
context = interceptor.createEmptyContext();
@ -148,8 +149,10 @@ public class RpcRetryingCallerImpl<T> implements RpcRetryingCaller<T> {
// If the server is dead, we need to wait a little before retrying, to give
// a chance to the regions to be moved
// get right pause time, start by RETRY_BACKOFF[0] * pauseBase, where pauseBase might be
// special when encountering CallQueueTooBigException, see #HBASE-17114
long pauseBase = (t instanceof CallQueueTooBigException) ? pauseForCQTBE : pause;
// special when encountering an exception indicating the server is overloaded.
// see #HBASE-17114 and HBASE-26807
long pauseBase = HBaseServerException.isServerOverloaded(t)
? pauseForServerOverloaded : pause;
expectedSleep = callable.sleep(pauseBase, tries);
// If, after the planned sleep, there won't be enough time left, we stop now.

View File

@ -111,28 +111,6 @@ public final class ClientExceptionsUtil {
return null;
}
/**
* Checks if the exception is CallQueueTooBig exception (maybe wrapped
* into some RemoteException).
* @param t exception to check
* @return true if it's a CQTBE, false otherwise
*/
public static boolean isCallQueueTooBigException(Throwable t) {
t = findException(t);
return (t instanceof CallQueueTooBigException);
}
/**
* Checks if the exception is CallDroppedException (maybe wrapped
* into some RemoteException).
* @param t exception to check
* @return true if it's a CQTBE, false otherwise
*/
public static boolean isCallDroppedException(Throwable t) {
t = findException(t);
return (t instanceof CallDroppedException);
}
// This list covers most connectivity exceptions but not all.
// For example, in SocketOutputStream a plain IOException is thrown at times when the channel is
// closed.

View File

@ -40,14 +40,12 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.protobuf.CodedOutputStream;
import org.apache.hbase.thirdparty.com.google.protobuf.Message;
import org.apache.hbase.thirdparty.io.netty.buffer.ByteBuf;
import org.apache.hbase.thirdparty.io.netty.channel.EventLoop;
import org.apache.hbase.thirdparty.io.netty.util.concurrent.FastThreadLocal;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RPCProtos.CellBlockMeta;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RPCProtos.ExceptionResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RPCProtos.RequestHeader;
@ -140,11 +138,13 @@ class IPCUtil {
static RemoteException createRemoteException(final ExceptionResponse e) {
String innerExceptionClassName = e.getExceptionClassName();
boolean doNotRetry = e.getDoNotRetry();
boolean serverOverloaded = e.hasServerOverloaded() && e.getServerOverloaded();
return e.hasHostname() ?
// If a hostname then add it to the RemoteWithExtrasException
new RemoteWithExtrasException(innerExceptionClassName, e.getStackTrace(), e.getHostname(),
e.getPort(), doNotRetry)
: new RemoteWithExtrasException(innerExceptionClassName, e.getStackTrace(), doNotRetry);
e.getPort(), doNotRetry, serverOverloaded) :
new RemoteWithExtrasException(innerExceptionClassName, e.getStackTrace(), doNotRetry,
serverOverloaded);
}
/**

View File

@ -25,6 +25,7 @@ import java.security.PrivilegedAction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.util.DynamicClassLoader;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.yetus.audience.InterfaceAudience;
@ -41,6 +42,7 @@ public class RemoteWithExtrasException extends RemoteException {
private final String hostname;
private final int port;
private final boolean doNotRetry;
private final boolean serverOverloaded;
/**
* Dynamic class loader to load filter/comparators
@ -58,15 +60,26 @@ public class RemoteWithExtrasException extends RemoteException {
}
public RemoteWithExtrasException(String className, String msg, final boolean doNotRetry) {
this(className, msg, null, -1, doNotRetry);
this(className, msg, doNotRetry, false);
}
public RemoteWithExtrasException(String className, String msg, final boolean doNotRetry,
final boolean serverOverloaded) {
this(className, msg, null, -1, doNotRetry, serverOverloaded);
}
public RemoteWithExtrasException(String className, String msg, final String hostname,
final int port, final boolean doNotRetry) {
this(className, msg, hostname, port, doNotRetry, false);
}
public RemoteWithExtrasException(String className, String msg, final String hostname,
final int port, final boolean doNotRetry, final boolean serverOverloaded) {
super(className, msg);
this.hostname = hostname;
this.port = port;
this.doNotRetry = doNotRetry;
this.serverOverloaded = serverOverloaded;
}
@Override
@ -98,6 +111,17 @@ public class RemoteWithExtrasException extends RemoteException {
cn.setAccessible(true);
IOException ex = cn.newInstance(this.getMessage());
ex.initCause(this);
if (ex instanceof HBaseServerException) {
// this is a newly constructed exception.
// if an exception defaults to meaning isServerOverloaded, we use that.
// otherwise, see if the remote exception value should mean setting to true.
HBaseServerException serverException = (HBaseServerException) ex;
if (serverOverloaded && !serverException.isServerOverloaded()) {
serverException.setServerOverloaded(true);
}
}
return ex;
}
@ -121,4 +145,11 @@ public class RemoteWithExtrasException extends RemoteException {
public boolean isDoNotRetry() {
return this.doNotRetry;
}
/**
* @return True if the server was considered overloaded when the exception was thrown.
*/
public boolean isServerOverloaded() {
return serverOverloaded;
}
}

View File

@ -18,7 +18,7 @@
package org.apache.hadoop.hbase.client;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
@ -40,6 +40,23 @@ public class TestAsyncConnectionConfiguration {
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestAsyncConnectionConfiguration.class);
@Test
public void itHandlesDeprecatedPauseForCQTBE() {
Configuration conf = new Configuration();
long timeoutMs = 1000;
conf.setLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, timeoutMs);
AsyncConnectionConfiguration config = new AsyncConnectionConfiguration(conf);
assertTrue(Configuration.isDeprecated(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE));
long expected = TimeUnit.MILLISECONDS.toNanos(timeoutMs);
assertEquals(expected, config.getPauseNsForServerOverloaded());
conf = new Configuration();
conf.setLong(ConnectionConfiguration.HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED, timeoutMs);
config = new AsyncConnectionConfiguration(conf);
assertEquals(expected, config.getPauseNsForServerOverloaded());
}
@Test
public void testDefaultReadWriteRpcTimeout() {
Configuration conf = HBaseConfiguration.create();

View File

@ -48,9 +48,11 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CallDroppedException;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
@ -1038,7 +1040,12 @@ public class TestAsyncProcess {
}
private ClusterConnection createHConnection() throws IOException {
ClusterConnection hc = createHConnectionCommon();
return createHConnection(CONNECTION_CONFIG);
}
private ClusterConnection createHConnection(ConnectionConfiguration configuration)
throws IOException {
ClusterConnection hc = createHConnectionCommon(configuration);
setMockLocation(hc, DUMMY_BYTES_1, new RegionLocations(loc1));
setMockLocation(hc, DUMMY_BYTES_2, new RegionLocations(loc2));
setMockLocation(hc, DUMMY_BYTES_3, new RegionLocations(loc3));
@ -1048,8 +1055,9 @@ public class TestAsyncProcess {
return hc;
}
private ClusterConnection createHConnectionWithReplicas() throws IOException {
ClusterConnection hc = createHConnectionCommon();
private ClusterConnection createHConnectionWithReplicas(ConnectionConfiguration configuration)
throws IOException {
ClusterConnection hc = createHConnectionCommon(configuration);
setMockLocation(hc, DUMMY_BYTES_1, hrls1);
setMockLocation(hc, DUMMY_BYTES_2, hrls2);
setMockLocation(hc, DUMMY_BYTES_3, hrls3);
@ -1076,13 +1084,14 @@ public class TestAsyncProcess {
Mockito.anyBoolean(), Mockito.anyBoolean())).thenReturn(result);
}
private ClusterConnection createHConnectionCommon() {
private ClusterConnection createHConnectionCommon(
ConnectionConfiguration connectionConfiguration) {
ClusterConnection hc = Mockito.mock(ClusterConnection.class);
NonceGenerator ng = Mockito.mock(NonceGenerator.class);
Mockito.when(ng.getNonceGroup()).thenReturn(HConstants.NO_NONCE);
Mockito.when(hc.getNonceGenerator()).thenReturn(ng);
Mockito.when(hc.getConfiguration()).thenReturn(CONF);
Mockito.when(hc.getConnectionConfiguration()).thenReturn(CONNECTION_CONFIG);
Mockito.when(hc.getConnectionConfiguration()).thenReturn(connectionConfiguration);
return hc;
}
@ -1608,11 +1617,11 @@ public class TestAsyncProcess {
// TODO: this is kind of timing dependent... perhaps it should detect from createCaller
// that the replica call has happened and that way control the ordering.
Configuration conf = new Configuration();
ClusterConnection conn = createHConnectionWithReplicas();
conf.setInt(AsyncProcess.PRIMARY_CALL_TIMEOUT_KEY, replicaAfterMs * 1000);
if (retries >= 0) {
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
}
ClusterConnection conn = createHConnectionWithReplicas(new ConnectionConfiguration(conf));
MyAsyncProcessWithReplicas ap = new MyAsyncProcessWithReplicas(conn, conf);
ap.setCallDelays(primaryMs, replicaMs);
return ap;
@ -1736,20 +1745,31 @@ public class TestAsyncProcess {
}
/**
* Test and make sure we could use a special pause setting when retry with
* CallQueueTooBigException, see HBASE-17114
* @throws Exception if unexpected error happened during test
* Below tests make sure we could use a special pause setting when retry an exception
* where {@link HBaseServerException#isServerOverloaded(Throwable)} is true, see HBASE-17114
*/
@Test
public void testRetryPauseWithCallQueueTooBigException() throws Exception {
Configuration myConf = new Configuration(CONF);
public void testRetryPauseWhenServerOverloadedDueToCQTBE() throws Exception {
testRetryPauseWhenServerIsOverloaded(new CallQueueTooBigException());
}
@Test
public void testRetryPauseWhenServerOverloadedDueToCDE() throws Exception {
testRetryPauseWhenServerIsOverloaded(new CallDroppedException());
}
private void testRetryPauseWhenServerIsOverloaded(
HBaseServerException exception) throws IOException {
Configuration conf = new Configuration(CONF);
final long specialPause = 500L;
final int retries = 1;
myConf.setLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, specialPause);
myConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
ClusterConnection conn = new MyConnectionImpl(myConf);
conf.setLong(ConnectionConfiguration.HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED, specialPause);
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
ClusterConnection conn = new MyConnectionImpl(conf);
AsyncProcessWithFailure ap =
new AsyncProcessWithFailure(conn, myConf, new CallQueueTooBigException());
new AsyncProcessWithFailure(conn, conf, exception);
BufferedMutatorParams bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
BufferedMutatorImpl mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
@ -1779,8 +1799,8 @@ public class TestAsyncProcess {
// check and confirm normal IOE will use the normal pause
final long normalPause =
myConf.getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
ap = new AsyncProcessWithFailure(conn, myConf, new IOException());
conf.getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
ap = new AsyncProcessWithFailure(conn, conf, new IOException());
bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
Assert.assertNotNull(mutator.getAsyncProcess().createServerErrorTracker());
@ -1806,9 +1826,9 @@ public class TestAsyncProcess {
@Test
public void testRetryWithExceptionClearsMetaCache() throws Exception {
ClusterConnection conn = createHConnection();
Configuration myConf = conn.getConfiguration();
Configuration myConf = new Configuration(CONF);
myConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
ClusterConnection conn = createHConnection(new ConnectionConfiguration(myConf));
AsyncProcessWithFailure ap =
new AsyncProcessWithFailure(conn, myConf, new RegionOpeningException("test"));

View File

@ -0,0 +1,54 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.client;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.testclassification.ClientTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category({ ClientTests.class, SmallTests.class})
public class TestConnectionConfiguration {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestConnectionConfiguration.class);
@Test
public void itHandlesDeprecatedPauseForCQTBE() {
Configuration conf = new Configuration();
long timeoutMs = 1000;
conf.setLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, timeoutMs);
ConnectionConfiguration config = new ConnectionConfiguration(conf);
assertTrue(Configuration.isDeprecated(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE));
assertEquals(timeoutMs, config.getPauseMillisForServerOverloaded());
conf = new Configuration();
conf.setLong(ConnectionConfiguration.HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED, timeoutMs);
config = new ConnectionConfiguration(conf);
assertEquals(timeoutMs, config.getPauseMillisForServerOverloaded());
}
}

View File

@ -0,0 +1,110 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.client;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
import org.apache.hadoop.hbase.CallDroppedException;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.testclassification.ClientTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category({ ClientTests.class, SmallTests.class})
public class TestRpcRetryingCallerImpl {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestRpcRetryingCallerImpl.class);
@Test
public void itUsesSpecialPauseForCQTBE() throws Exception {
itUsesSpecialPauseForServerOverloaded(CallQueueTooBigException.class);
}
@Test
public void itUsesSpecialPauseForCDE() throws Exception {
itUsesSpecialPauseForServerOverloaded(CallDroppedException.class);
}
private void itUsesSpecialPauseForServerOverloaded(
Class<? extends HBaseServerException> exceptionClass) throws Exception {
// the actual values don't matter here as long as they're distinct.
// the ThrowingCallable will assert that the passed in pause from RpcRetryingCallerImpl
// matches the specialPauseMillis
long pauseMillis = 1;
long specialPauseMillis = 2;
RpcRetryingCallerImpl<Void> caller =
new RpcRetryingCallerImpl<>(pauseMillis, specialPauseMillis, 2, 0);
RetryingCallable<Void> callable = new ThrowingCallable(
CallQueueTooBigException.class, specialPauseMillis);
try {
caller.callWithRetries(callable, 5000);
fail("Expected " + exceptionClass.getSimpleName());
} catch (RetriesExhaustedException e) {
assertTrue(e.getCause() instanceof HBaseServerException);
}
}
private static class ThrowingCallable implements RetryingCallable<Void> {
private final Class<? extends HBaseServerException> exceptionClass;
private final long specialPauseMillis;
public ThrowingCallable(Class<? extends HBaseServerException> exceptionClass,
long specialPauseMillis) {
this.exceptionClass = exceptionClass;
this.specialPauseMillis = specialPauseMillis;
}
@Override
public void prepare(boolean reload) throws IOException {
}
@Override
public void throwable(Throwable t, boolean retrying) {
}
@Override
public String getExceptionMessageAdditionalDetail() {
return null;
}
@Override
public long sleep(long pause, int tries) {
assertEquals(pause, specialPauseMillis);
return 0;
}
@Override
public Void call(int callTimeout) throws Exception {
throw exceptionClass.getConstructor().newInstance();
}
}
}

View File

@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.ipc;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.testclassification.ClientTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category({ ClientTests.class, SmallTests.class })
public class TestRemoteWithExtrasException {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestRemoteWithExtrasException.class);
/**
* test verifies that we honor the inherent value of an exception for isServerOverloaded.
* We don't want a false value passed into RemoteWithExtrasExceptions to override the
* inherent value of an exception if it's already true. This could be due to an out of date
* server not sending the proto field we expect.
*/
@Test
public void itUsesExceptionDefaultValueForServerOverloaded() {
// pass false for server overloaded, we still expect the exception to be true due to
// the exception type
RemoteWithExtrasException ex =
new RemoteWithExtrasException(ServerOverloadedException.class.getName(),
"server is overloaded", false, false);
IOException result = ex.unwrapRemoteException();
assertEquals(result.getClass(), ServerOverloadedException.class);
assertTrue(((ServerOverloadedException) result).isServerOverloaded());
}
@Test
public void itUsesPassedServerOverloadedValue() {
String exceptionClass = HBaseServerException.class.getName();
String message = "server is overloaded";
RemoteWithExtrasException ex =
new RemoteWithExtrasException(exceptionClass, message, false, false);
IOException result = ex.unwrapRemoteException();
assertTrue(result instanceof HBaseServerException);
assertFalse(((HBaseServerException) result).isServerOverloaded());
// run again with true value passed in
ex = new RemoteWithExtrasException(exceptionClass, message, false, true);
result = ex.unwrapRemoteException();
assertTrue(result instanceof HBaseServerException);
// expect true this time
assertTrue(((HBaseServerException) result).isServerOverloaded());
}
private static class ServerOverloadedException extends HBaseServerException {
public ServerOverloadedException(String message) {
super(true, message);
}
}
}

View File

@ -831,7 +831,10 @@ public final class HConstants {
/**
* Parameter name for client pause value for special case such as call queue too big, etc.
* @deprecated Since 2.5.0, will be removed in 4.0.0. Please use
* hbase.client.pause.server.overloaded instead.
*/
@Deprecated
public static final String HBASE_CLIENT_PAUSE_FOR_CQTBE = "hbase.client.pause.cqtbe";
/**

View File

@ -493,12 +493,14 @@ possible configurations would overwhelm and obscure the important.
this initial pause amount and how this pause works w/ retries.</description>
</property>
<property>
<name>hbase.client.pause.cqtbe</name>
<name>hbase.client.pause.server.overloaded</name>
<value></value>
<description>Whether or not to use a special client pause for
CallQueueTooBigException (cqtbe). Set this property to a higher value
than hbase.client.pause if you observe frequent CQTBE from the same
RegionServer and the call queue there keeps full</description>
<description>Pause time when encountering an exception indicating a
server is overloaded, CallQueueTooBigException or CallDroppedException.
Set this property to a higher value than hbase.client.pause if you
observe frequent CallQueueTooBigException or CallDroppedException from the same
RegionServer and the call queue there keeps filling up. This config used to be
called hbase.client.pause.cqtbe, which has been deprecated as of 2.5.0.</description>
</property>
<property>
<name>hbase.client.retries.number</name>

View File

@ -119,6 +119,8 @@ message ExceptionResponse {
optional int32 port = 4;
// Set if we are NOT to retry on receipt of this exception
optional bool do_not_retry = 5;
// Set true if the server was considered to be overloaded when exception was thrown
optional bool server_overloaded = 6;
}
/**

View File

@ -30,6 +30,7 @@ import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.exceptions.RegionMovedException;
import org.apache.hadoop.hbase.io.ByteBuffAllocator;
import org.apache.hadoop.hbase.io.ByteBufferListOutputStream;
@ -320,6 +321,9 @@ public abstract class ServerCall<T extends ServerRpcConnection> implements RpcCa
RegionMovedException rme = (RegionMovedException)t;
exceptionBuilder.setHostname(rme.getHostname());
exceptionBuilder.setPort(rme.getPort());
} else if (t instanceof HBaseServerException) {
HBaseServerException hse = (HBaseServerException) t;
exceptionBuilder.setServerOverloaded(hse.isServerOverloaded());
}
// Set the exception as the result of the method invocation.
headerBuilder.setException(exceptionBuilder.build());

View File

@ -111,6 +111,8 @@ public class HConnectionTestingUtility {
throws IOException {
ConnectionImplementation c = Mockito.mock(ConnectionImplementation.class);
Mockito.when(c.getConfiguration()).thenReturn(conf);
ConnectionConfiguration connectionConfiguration = new ConnectionConfiguration(conf);
Mockito.when(c.getConnectionConfiguration()).thenReturn(connectionConfiguration);
Mockito.doNothing().when(c).close();
// Make it so we return a particular location when asked.
final HRegionLocation loc = new HRegionLocation(hri, sn);
@ -134,9 +136,10 @@ public class HConnectionTestingUtility {
}
NonceGenerator ng = Mockito.mock(NonceGenerator.class);
Mockito.when(c.getNonceGenerator()).thenReturn(ng);
Mockito.when(c.getAsyncProcess()).thenReturn(
AsyncProcess asyncProcess =
new AsyncProcess(c, conf, RpcRetryingCallerFactory.instantiate(conf),
RpcControllerFactory.instantiate(conf)));
RpcControllerFactory.instantiate(conf));
Mockito.when(c.getAsyncProcess()).thenReturn(asyncProcess);
Mockito.when(c.getNewRpcRetryingCallerFactory(conf)).thenReturn(
RpcRetryingCallerFactory.instantiate(conf,
RetryingCallerInterceptorFactory.NO_OP_INTERCEPTOR, null));

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.client;
import static org.apache.hadoop.hbase.client.ConnectionConfiguration.HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@ -36,9 +37,11 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ipc.CallRunner;
import org.apache.hadoop.hbase.ipc.PluggableBlockingQueue;
import org.apache.hadoop.hbase.ipc.PriorityFunction;
import org.apache.hadoop.hbase.ipc.RpcScheduler;
import org.apache.hadoop.hbase.ipc.SimpleRpcScheduler;
import org.apache.hadoop.hbase.ipc.TestPluggableQueueImpl;
import org.apache.hadoop.hbase.regionserver.RSRpcServices;
import org.apache.hadoop.hbase.regionserver.RpcSchedulerFactory;
import org.apache.hadoop.hbase.regionserver.SimpleRpcSchedulerFactory;
@ -56,31 +59,46 @@ import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
import org.apache.hbase.thirdparty.com.google.protobuf.Descriptors.MethodDescriptor;
@Category({ MediumTests.class, ClientTests.class })
public class TestAsyncClientPauseForCallQueueTooBig {
public class TestAsyncClientPauseForServerOverloaded {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestAsyncClientPauseForCallQueueTooBig.class);
HBaseClassTestRule.forClass(TestAsyncClientPauseForServerOverloaded.class);
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
private static TableName TABLE_NAME = TableName.valueOf("CQTBE");
private static TableName TABLE_NAME = TableName.valueOf("ServerOverloaded");
private static byte[] FAMILY = Bytes.toBytes("Family");
private static byte[] QUALIFIER = Bytes.toBytes("Qualifier");
private static long PAUSE_FOR_CQTBE_NS = TimeUnit.SECONDS.toNanos(1);
private static long PAUSE_FOR_SERVER_OVERLOADED_NANOS = TimeUnit.SECONDS.toNanos(1);
private static long PAUSE_FOR_SERVER_OVERLOADED_MILLIS = TimeUnit.NANOSECONDS.toMillis(
PAUSE_FOR_SERVER_OVERLOADED_NANOS);
private static AsyncConnection CONN;
private static boolean FAIL = false;
private static volatile FailMode MODE = null;
private static ConcurrentMap<MethodDescriptor, AtomicInteger> INVOKED = new ConcurrentHashMap<>();
enum FailMode {
CALL_QUEUE_TOO_BIG,
CALL_DROPPED;
public static final class CQTBERpcScheduler extends SimpleRpcScheduler {
private ConcurrentMap<MethodDescriptor, AtomicInteger> invoked = new ConcurrentHashMap<>();
public CQTBERpcScheduler(Configuration conf, int handlerCount, int priorityHandlerCount,
// this is for test scan, where we will send a open scanner first and then a next, and we
// expect that we hit CQTBE two times.
private boolean shouldFail(CallRunner callRunner) {
MethodDescriptor method = callRunner.getRpcCall().getMethod();
return invoked.computeIfAbsent(method,
k -> new AtomicInteger(0)).getAndIncrement() % 2 == 0;
}
}
public static final class OverloadedRpcScheduler extends SimpleRpcScheduler {
public OverloadedRpcScheduler(Configuration conf, int handlerCount, int priorityHandlerCount,
int replicationHandlerCount, int metaTransitionHandler, PriorityFunction priority,
Abortable server, int highPriorityLevel) {
super(conf, handlerCount, priorityHandlerCount, replicationHandlerCount,
@ -89,25 +107,36 @@ public class TestAsyncClientPauseForCallQueueTooBig {
@Override
public boolean dispatch(CallRunner callTask) {
if (FAIL) {
MethodDescriptor method = callTask.getRpcCall().getMethod();
// this is for test scan, where we will send a open scanner first and then a next, and we
// expect that we hit CQTBE two times.
if (INVOKED.computeIfAbsent(method, k -> new AtomicInteger(0)).getAndIncrement() % 2 == 0) {
if (MODE == FailMode.CALL_QUEUE_TOO_BIG && MODE.shouldFail(callTask)) {
return false;
}
}
return super.dispatch(callTask);
}
}
public static final class CQTBERpcSchedulerFactory extends SimpleRpcSchedulerFactory {
public static final class OverloadedQueue extends TestPluggableQueueImpl {
public OverloadedQueue(int maxQueueLength, PriorityFunction priority, Configuration conf) {
super(maxQueueLength, priority, conf);
}
@Override
public boolean offer(CallRunner callRunner) {
if (MODE == FailMode.CALL_DROPPED && MODE.shouldFail(callRunner)) {
callRunner.drop();
return true;
}
return super.offer(callRunner);
}
}
public static final class OverloadedRpcSchedulerFactory extends SimpleRpcSchedulerFactory {
@Override
public RpcScheduler create(Configuration conf, PriorityFunction priority, Abortable server) {
int handlerCount = conf.getInt(HConstants.REGION_SERVER_HANDLER_COUNT,
HConstants.DEFAULT_REGION_SERVER_HANDLER_COUNT);
return new CQTBERpcScheduler(conf, handlerCount,
return new OverloadedRpcScheduler(conf, handlerCount,
conf.getInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT,
HConstants.DEFAULT_REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT),
conf.getInt(HConstants.REGION_SERVER_REPLICATION_HANDLER_COUNT,
@ -122,12 +151,16 @@ public class TestAsyncClientPauseForCallQueueTooBig {
@BeforeClass
public static void setUp() throws Exception {
UTIL.getConfiguration().setLong(HConstants.HBASE_CLIENT_PAUSE, 10);
UTIL.getConfiguration().setLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE,
TimeUnit.NANOSECONDS.toMillis(PAUSE_FOR_CQTBE_NS));
UTIL.getConfiguration().set("hbase.ipc.server.callqueue.type", "pluggable");
UTIL.getConfiguration().setClass("hbase.ipc.server.callqueue.pluggable.queue.class.name",
OverloadedQueue.class, PluggableBlockingQueue.class);
UTIL.getConfiguration().setClass(RSRpcServices.REGION_SERVER_RPC_SCHEDULER_FACTORY_CLASS,
CQTBERpcSchedulerFactory.class, RpcSchedulerFactory.class);
OverloadedRpcSchedulerFactory.class, RpcSchedulerFactory.class);
UTIL.startMiniCluster(1);
CONN = ConnectionFactory.createAsyncConnection(UTIL.getConfiguration()).get();
Configuration conf = new Configuration(UTIL.getConfiguration());
conf.setLong(HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED, PAUSE_FOR_SERVER_OVERLOADED_MILLIS);
CONN = ConnectionFactory.createAsyncConnection(conf).get();
}
@AfterClass
@ -143,23 +176,29 @@ public class TestAsyncClientPauseForCallQueueTooBig {
table.put(new Put(Bytes.toBytes(i)).addColumn(FAMILY, QUALIFIER, Bytes.toBytes(i)));
}
}
FAIL = true;
MODE = FailMode.CALL_QUEUE_TOO_BIG;
}
@After
public void tearDownAfterTest() throws IOException {
FAIL = false;
INVOKED.clear();
for (FailMode mode : FailMode.values()) {
mode.invoked.clear();
}
MODE = null;
UTIL.getAdmin().disableTable(TABLE_NAME);
UTIL.getAdmin().deleteTable(TABLE_NAME);
}
private void assertTime(Callable<Void> callable, long time) throws Exception {
for (FailMode mode : FailMode.values()) {
MODE = mode;
long startNs = System.nanoTime();
callable.call();
long costNs = System.nanoTime() - startNs;
assertTrue(costNs > time);
}
}
@Test
public void testGet() throws Exception {
@ -167,7 +206,7 @@ public class TestAsyncClientPauseForCallQueueTooBig {
Result result = CONN.getTable(TABLE_NAME).get(new Get(Bytes.toBytes(0))).get();
assertArrayEquals(Bytes.toBytes(0), result.getValue(FAMILY, QUALIFIER));
return null;
}, PAUSE_FOR_CQTBE_NS);
}, PAUSE_FOR_SERVER_OVERLOADED_NANOS);
}
@Test
@ -181,7 +220,7 @@ public class TestAsyncClientPauseForCallQueueTooBig {
}
}
return CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).get();
}, PAUSE_FOR_CQTBE_NS);
}, PAUSE_FOR_SERVER_OVERLOADED_NANOS);
}
@Test
@ -197,6 +236,6 @@ public class TestAsyncClientPauseForCallQueueTooBig {
assertNull(scanner.next());
}
return null;
}, PAUSE_FOR_CQTBE_NS * 2);
}, PAUSE_FOR_SERVER_OVERLOADED_NANOS * 2);
}
}

View File

@ -22,9 +22,9 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
@ -41,8 +41,11 @@ import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CallDroppedException;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseServerException;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
@ -79,7 +82,6 @@ import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.hbase.thirdparty.io.netty.util.ResourceLeakDetector;
@ -1086,6 +1088,100 @@ public class TestConnectionImplementation {
}
}
@Test
public void testLocateRegionsRetrySpecialPauseCQTBE() throws IOException {
testLocateRegionsRetrySpecialPause(CallQueueTooBigException.class);
}
@Test
public void testLocateRegionsRetrySpecialPauseCDE() throws IOException {
testLocateRegionsRetrySpecialPause(CallDroppedException.class);
}
private void testLocateRegionsRetrySpecialPause(
Class<? extends HBaseServerException> exceptionClass) throws IOException {
int regionReplication = 3;
byte[] family = Bytes.toBytes("cf");
TableName tableName = TableName.valueOf(name.getMethodName());
// Create a table with region replicas
TableDescriptorBuilder builder = TableDescriptorBuilder
.newBuilder(tableName)
.setRegionReplication(regionReplication)
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(family));
TEST_UTIL.getAdmin().createTable(builder.build());
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
conf.setClass(RpcRetryingCallerFactory.CUSTOM_CALLER_CONF_KEY,
ThrowingCallerFactory.class, RpcRetryingCallerFactory.class);
conf.setClass("testSpecialPauseException", exceptionClass, HBaseServerException.class);
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 2);
// normal pause very short, 10 millis
conf.setInt(HConstants.HBASE_CLIENT_PAUSE, 10);
// special pause 10x longer, so we can detect it
long specialPauseMillis = 1000;
conf.setLong(ConnectionConfiguration.HBASE_CLIENT_PAUSE_FOR_SERVER_OVERLOADED,
specialPauseMillis);
try (ConnectionImplementation con =
(ConnectionImplementation) ConnectionFactory.createConnection(conf)) {
// Get locations of the regions of the table
long start = System.nanoTime();
try {
con.locateRegion(tableName, new byte[0], false, true, 0);
} catch (HBaseServerException e) {
assertTrue(e.isServerOverloaded());
// pass: expected
}
assertTrue(System.nanoTime() - start > TimeUnit.MILLISECONDS.toNanos(specialPauseMillis));
} finally {
TEST_UTIL.deleteTable(tableName);
}
}
private static class ThrowingCallerFactory extends RpcRetryingCallerFactory {
private final Class<? extends HBaseServerException> exceptionClass;
public ThrowingCallerFactory(Configuration conf) {
super(conf);
this.exceptionClass = conf.getClass("testSpecialPauseException",
null, HBaseServerException.class);
}
@Override public <T> RpcRetryingCaller<T> newCaller(int rpcTimeout) {
return newCaller();
}
@Override public <T> RpcRetryingCaller<T> newCaller() {
return new RpcRetryingCaller<T>() {
@Override public void cancel() {
}
@Override public T callWithRetries(RetryingCallable<T> callable, int callTimeout)
throws IOException, RuntimeException {
return callWithoutRetries(null, 0);
}
@Override public T callWithoutRetries(RetryingCallable<T> callable, int callTimeout)
throws IOException, RuntimeException {
try {
throw exceptionClass.getConstructor().newInstance();
} catch (IllegalAccessException | InstantiationException
| InvocationTargetException | NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
};
}
}
@Test
public void testMetaLookupThreadPoolCreated() throws Exception {
final TableName tableName = TableName.valueOf(name.getMethodName());