HBASE-22287 infinite retries on failed server in RSProcedureDispatcher (#1800)
Adds backoff in place of retrying every 100ms.

Signed-off-by: Duo Zhang <zhangduo@apache.org>
parent da2e03bb3b
commit bda2094ae5
@@ -251,31 +251,33 @@ public class RSProcedureDispatcher
     }

     private boolean scheduleForRetry(IOException e) {
-      LOG.debug("request to {} failed, try={}", serverName, numberOfAttemptsSoFar, e);
+      LOG.debug("Request to {} failed, try={}", serverName, numberOfAttemptsSoFar, e);
       // Should we wait a little before retrying? If the server is starting it's yes.
       if (e instanceof ServerNotRunningYetException) {
         long remainingTime = getMaxWaitTime() - EnvironmentEdgeManager.currentTime();
         if (remainingTime > 0) {
-          LOG.warn("waiting a little before trying on the same server={}," +
-            " try={}, can wait up to {}ms", serverName, numberOfAttemptsSoFar, remainingTime);
+          LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
+            serverName, numberOfAttemptsSoFar, remainingTime);
           numberOfAttemptsSoFar++;
+          // Retry every 100ms up to maximum wait time.
           submitTask(this, 100, TimeUnit.MILLISECONDS);
           return true;
         }
-        LOG.warn("server {} is not up for a while; try a new one", serverName);
+        LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying another server",
+          serverName, getMaxWaitTime());
         return false;
       }
       if (e instanceof DoNotRetryIOException) {
-        LOG.warn("server {} tells us do not retry due to {}, try={}, give up", serverName,
+        LOG.warn("{} tells us DoNotRetry due to {}, try={}, give up", serverName,
           e.toString(), numberOfAttemptsSoFar);
         return false;
       }
-      // this exception is thrown in the rpc framework, where we can make sure that the call has not
+      // This exception is thrown in the rpc framework, where we can make sure that the call has not
       // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd
-      // better choose another region server
-      // notice that, it is safe to quit only if this is the first time we send request to region
-      // server. Maybe the region server has accept our request the first time, and then there is a
-      // network error which prevents we receive the response, and the second time we hit a
+      // better choose another region server.
+      // Notice that, it is safe to quit only if this is the first time we send request to region
+      // server. Maybe the region server has accepted our request the first time, and then there is
+      // a network error which prevents we receive the response, and the second time we hit a
       // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a
       // double assign...
       if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) {
@@ -285,7 +287,7 @@ public class RSProcedureDispatcher
       }
       // Always retry for other exception types if the region server is not dead yet.
       if (!master.getServerManager().isServerOnline(serverName)) {
-        LOG.warn("request to {} failed due to {}, try={}, and the server is dead, give up",
+        LOG.warn("Request to {} failed due to {}, try={} and the server is not online, give up",
          serverName, e.toString(), numberOfAttemptsSoFar);
         return false;
       }
@@ -294,14 +296,20 @@ public class RSProcedureDispatcher
       // background task to check whether the region server is dead. And if it is dead, call
       // remoteCallFailed to tell the upper layer. Keep retrying here does not lead to incorrect
       // result, but waste some resources.
-        LOG.warn("server {} is aborted or stopped, for safety we still need to" +
+        LOG.warn("{} is aborted or stopped, for safety we still need to" +
          " wait until it is fully dead, try={}", serverName, numberOfAttemptsSoFar);
       } else {
-        LOG.warn("request to server {} failed due to {}, try={}, retrying...", serverName,
+        LOG.warn("request to {} failed due to {}, try={}, retrying...", serverName,
          e.toString(), numberOfAttemptsSoFar);
       }
       numberOfAttemptsSoFar++;
-      submitTask(this, 100, TimeUnit.MILLISECONDS);
+      // Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
+      // with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
+      // -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
+      // up to max of 10 seconds (don't want to back off too much in case of situation change).
+      submitTask(this,
+        Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
+        TimeUnit.MILLISECONDS);
       return true;
     }
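
To make the new retry schedule concrete, here is a minimal, self-contained sketch of the delay calculation the patch inlines into scheduleForRetry. The RetryBackoffDemo class and the backoffMillis helper are illustrative names and not part of the patch; only the expression Math.min(100 * attempts * attempts, 10 * 1000) comes from the diff above.

// Illustrative sketch only: the patch computes this expression inline in scheduleForRetry;
// it does not introduce a helper class like this one.
public final class RetryBackoffDemo {

  /** Delay in milliseconds before the next attempt: 100ms * attempts^2, capped at 10 seconds. */
  static long backoffMillis(int attemptsSoFar) {
    return Math.min(100L * attemptsSoFar * attemptsSoFar, 10L * 1000L);
  }

  public static void main(String[] args) {
    // Prints the schedule: 100, 400, 900, ... 8100 ms, then a flat 10000 ms from the 10th attempt on.
    for (int attempt = 1; attempt <= 12; attempt++) {
      System.out.println("after failed attempt " + attempt + ": wait " + backoffMillis(attempt) + " ms");
    }
  }
}

Compared with the previous flat 100ms, a request that keeps failing now backs off quickly: the waits after the first ten failures add up to roughly 38.5 seconds, and every attempt after that waits the full 10 seconds, which keeps a stuck server from flooding the master log with retry messages.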