HBASE-21227 Implement exponential retrying backoff for Assign/UnassignRegionHandler introduced in HBASE-21217
parent: 7b2f5950ed
commit: d7e08317d2
RetryCounter.java

@@ -18,12 +18,15 @@
  */
 package org.apache.hadoop.hbase.util;
 
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
+
 /**
  * Operation retry accounting.
  * Use to calculate wait period, {@link #getBackoffTimeAndIncrementAttempts()}}, or for performing
@@ -44,6 +47,7 @@ public class RetryCounter {
     private long maxSleepTime;
     private TimeUnit timeUnit;
     private BackoffPolicy backoffPolicy;
+    private float jitter;
 
     private static final BackoffPolicy DEFAULT_BACKOFF_POLICY = new ExponentialBackoffPolicy();
 
@@ -53,6 +57,7 @@ public class RetryCounter {
       maxSleepTime = -1;
       timeUnit = TimeUnit.MILLISECONDS;
       backoffPolicy = DEFAULT_BACKOFF_POLICY;
+      jitter = 0.0f;
     }
 
     public RetryConfig(int maxAttempts, long sleepInterval, long maxSleepTime,
@@ -89,6 +94,13 @@ public class RetryCounter {
       return this;
     }
 
+    public RetryConfig setJitter(float jitter) {
+      Preconditions.checkArgument(jitter >= 0.0f && jitter < 1.0f,
+        "Invalid jitter: %s, should be in range [0.0, 1.0)", jitter);
+      this.jitter = jitter;
+      return this;
+    }
+
     public int getMaxAttempts() {
       return maxAttempts;
     }
@@ -105,17 +117,26 @@ public class RetryCounter {
       return timeUnit;
     }
 
+    public float getJitter() {
+      return jitter;
+    }
+
     public BackoffPolicy getBackoffPolicy() {
       return backoffPolicy;
     }
   }
 
+  private static long addJitter(long interval, float jitter) {
+    long jitterInterval = (long) (interval * ThreadLocalRandom.current().nextFloat() * jitter);
+    return interval + jitterInterval;
+  }
+
   /**
    * Policy for calculating sleeping intervals between retry attempts
    */
   public static class BackoffPolicy {
     public long getBackoffTime(RetryConfig config, int attempts) {
-      return config.getSleepInterval();
+      return addJitter(config.getSleepInterval(), config.getJitter());
     }
   }
 
@@ -123,7 +144,7 @@ public class RetryCounter {
     @Override
     public long getBackoffTime(RetryConfig config, int attempts) {
       long backoffTime = (long) (config.getSleepInterval() * Math.pow(2, attempts));
-      return backoffTime;
+      return addJitter(backoffTime, config.getJitter());
     }
   }
 
@@ -155,7 +176,6 @@ public class RetryCounter {
 
   /**
   * Sleep for a back off time as supplied by the backoff policy, and increases the attempts
-   * @throws InterruptedException
   */
  public void sleepUntilNextRetry() throws InterruptedException {
    int attempts = getAttemptTimes();
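For illustration only (not part of the patch): addJitter stretches a computed backoff by a random factor in [0, jitter), so handlers retrying at the same moment do not all wake up on exactly the same schedule. A minimal standalone sketch of that arithmetic, assuming nothing beyond what the hunks above show; the class name here is illustrative, not the real RetryCounter.

import java.util.concurrent.ThreadLocalRandom;

final class JitterSketch {
  // Mirrors the addJitter helper above: adds between 0 and interval * jitter extra time.
  static long addJitter(long interval, float jitter) {
    long jitterInterval = (long) (interval * ThreadLocalRandom.current().nextFloat() * jitter);
    return interval + jitterInterval;
  }

  public static void main(String[] args) {
    // With jitter = 0.01f, a 1000 ms backoff lands somewhere in [1000, 1010) ms.
    for (int i = 0; i < 5; i++) {
      System.out.println(addJitter(1000L, 0.01f));
    }
  }
}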

AssignRegionHandler.java

@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.regionserver.Region;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices.PostOpenDeployContext;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -54,12 +55,15 @@ public class AssignRegionHandler extends EventHandler {
 
   private final long masterSystemTime;
 
+  private final RetryCounter retryCounter;
+
   public AssignRegionHandler(RegionServerServices server, RegionInfo regionInfo,
       @Nullable TableDescriptor tableDesc, long masterSystemTime, EventType eventType) {
     super(server, eventType);
     this.regionInfo = regionInfo;
     this.tableDesc = tableDesc;
     this.masterSystemTime = masterSystemTime;
+    this.retryCounter = HandlerUtil.getRetryCounter();
   }
 
   private RegionServerServices getServer() {
@@ -106,10 +110,11 @@ public class AssignRegionHandler extends EventHandler {
         // calling reportRegionStateTransition, so the HMaster will think the region is offline,
         // before we actually close the region, as reportRegionStateTransition is part of the
         // closing process.
-        LOG.info("Receiving OPEN for the region:{}, which we are trying to close, try again later",
-          regionName);
-        // TODO: backoff
-        rs.getExecutorService().delayedSubmit(this, 1, TimeUnit.SECONDS);
+        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+        LOG.info(
+          "Receiving OPEN for the region:{}, which we are trying to close, try again after {}ms",
+          regionName, backoff);
+        rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
       }
       return;
     }
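The hunk above swaps the fixed delayedSubmit(this, 1, TimeUnit.SECONDS) for a delay taken from the handler's own RetryCounter, so the handler re-queues itself with a growing pause instead of polling at a constant rate. A hedged sketch of that self-rescheduling shape outside HBase; RetryingTask, tryOnce and the plain ScheduledExecutorService are hypothetical stand-ins for the event handler and the region server's executor service, not HBase APIs.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class RetryingTask implements Runnable {
  private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
  private int attempts = 0;

  @Override
  public void run() {
    if (!tryOnce()) {
      // Same shape as retryCounter.getBackoffTimeAndIncrementAttempts() followed by
      // delayedSubmit(this, backoff, TimeUnit.MILLISECONDS): compute a growing delay
      // and re-queue this same task instead of blocking a worker thread.
      long backoff = (long) (100 * Math.pow(2, attempts++));
      executor.schedule(this, backoff, TimeUnit.MILLISECONDS);
    }
  }

  private boolean tryOnce() {
    // Placeholder for "is the region still being closed/opened by another handler?".
    return true;
  }
}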

HandlerUtil.java (new file)

@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver.handler;
+
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.hbase.util.RetryCounter;
+import org.apache.hadoop.hbase.util.RetryCounterFactory;
+import org.apache.yetus.audience.InterfaceAudience;
+
+@InterfaceAudience.Private
+final class HandlerUtil {
+
+  private HandlerUtil() {
+  }
+
+  /**
+   * Get an exponential backoff retry counter. The base unit is 100 milliseconds, and the max
+   * backoff time is 30 seconds.
+   */
+  public static RetryCounter getRetryCounter() {
+    return new RetryCounterFactory(
+      new RetryCounter.RetryConfig().setBackoffPolicy(new RetryCounter.ExponentialBackoffPolicy())
+        .setSleepInterval(100).setMaxSleepTime(30000).setMaxAttempts(Integer.MAX_VALUE)
+        .setTimeUnit(TimeUnit.MILLISECONDS).setJitter(0.01f)).create();
+  }
+}
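A usage illustration only, not part of the patch: driving a counter built with the same RetryConfig that getRetryCounter() uses above. shouldRetry() is assumed from the existing RetryCounter API; the small maxAttempts is only there to keep the demo finite. The printed schedule roughly doubles from 100 ms per attempt, with at most 1% jitter added per step.

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;

public class BackoffScheduleDemo {
  public static void main(String[] args) {
    RetryCounter counter = new RetryCounterFactory(
      new RetryCounter.RetryConfig()
        .setBackoffPolicy(new RetryCounter.ExponentialBackoffPolicy())
        .setSleepInterval(100).setMaxSleepTime(30000)
        .setMaxAttempts(8) // small cap so the demo terminates
        .setTimeUnit(TimeUnit.MILLISECONDS)
        .setJitter(0.01f)).create();
    // Prints roughly 100, 200, 400, ... ms, each stretched by at most 1% jitter.
    while (counter.shouldRetry()) {
      System.out.println("next backoff: " + counter.getBackoffTimeAndIncrementAttempts() + " ms");
    }
  }
}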

UnassignRegionHandler.java

@@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.regionserver.Region;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -54,12 +55,15 @@ public class UnassignRegionHandler extends EventHandler {
 
   private final ServerName destination;
 
+  private final RetryCounter retryCounter;
+
   public UnassignRegionHandler(RegionServerServices server, String encodedName, boolean abort,
       @Nullable ServerName destination, EventType eventType) {
     super(server, eventType);
     this.encodedName = encodedName;
     this.abort = abort;
     this.destination = destination;
+    this.retryCounter = HandlerUtil.getRetryCounter();
   }
 
   private RegionServerServices getServer() {
@@ -76,10 +80,10 @@ public class UnassignRegionHandler extends EventHandler {
         // This could happen as we will update the region state to OPEN when calling
         // reportRegionStateTransition, so the HMaster will think the region is online, before we
         // actually open the region, as reportRegionStateTransition is part of the opening process.
+        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
         LOG.warn("Received CLOSE for the region: {}, which we are already " +
-          "trying to OPEN. try again later.", encodedName);
-        // TODO: backoff
-        rs.getExecutorService().delayedSubmit(this, 1, TimeUnit.SECONDS);
+          "trying to OPEN. try again after {}ms", encodedName, backoff);
+        rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
       } else {
         LOG.info("Received CLOSE for the region: {}, which we are already trying to CLOSE," +
           " but not completed yet", encodedName);