HBASE-8888 Tweak retry settings some more, *some more*

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1501651 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2013-07-10 05:48:47 +00:00
parent bab89d2e57
commit a2516d3bc0
9 changed files with 66 additions and 47 deletions

View File

@ -174,7 +174,8 @@ public abstract class ServerCallable<T> implements Callable<T> {
prepare(tries != 0); // if called with false, check table status on ZK
return call();
} catch (Throwable t) {
LOG.warn("Call exception, tries=" + tries + ", numRetries=" + numRetries, t);
LOG.warn("Call exception, tries=" + tries + ", numRetries=" + numRetries + ", retryTime=" +
(this.globalStartTime - System.currentTimeMillis()) + "ms", t);
t = translateException(t);
// translateException throws an exception when we should not retry, i.e. when it's the

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService.Blo
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Before;
import org.junit.Test;
import org.junit.Ignore;
import org.mockito.Mockito;
import com.google.protobuf.RpcController;
@ -91,6 +92,36 @@ public class TestClientNoCluster {
}
}
/**
* Remove the @Ignore to try out timeout and retry asettings
* @throws IOException
*/
@Ignore
@Test
public void testTimeoutAndRetries() throws IOException {
Configuration localConfig = HBaseConfiguration.create(this.conf);
// This override mocks up our exists/get call to throw a RegionServerStoppedException.
localConfig.set("hbase.client.connection.impl", RpcTimeoutConnection.class.getName());
HTable table = new HTable(localConfig, HConstants.META_TABLE_NAME);
Throwable t = null;
LOG.info("Start");
try {
// An exists call turns into a get w/ a flag.
table.exists(new Get(Bytes.toBytes("abc")));
} catch (SocketTimeoutException e) {
// I expect this exception.
LOG.info("Got expected exception", e);
t = e;
} catch (RetriesExhaustedException e) {
// This is the old, unwanted behavior. If we get here FAIL!!!
fail();
} finally {
table.close();
}
LOG.info("Stop");
assertTrue(t != null);
}
/**
* Test that operation timeout prevails over rpc default timeout and retries, etc.
* @throws IOException
@ -102,7 +133,7 @@ public class TestClientNoCluster {
localConfig.set("hbase.client.connection.impl", RpcTimeoutConnection.class.getName());
int pause = 10;
localConfig.setInt("hbase.client.pause", pause);
localConfig.setInt("hbase.client.retries.number", 10);
localConfig.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 10);
// Set the operation timeout to be < the pause. Expectation is that after first pause, we will
// fail out of the rpc because the rpc timeout will have been set to the operation tiemout
// and it has expired. Otherwise, if this functionality is broke, all retries will be run --
@ -263,4 +294,4 @@ public class TestClientNoCluster {
return this.stub;
}
}
}
}

View File

@ -54,13 +54,14 @@ public class TestSnapshotFromAdmin {
*/
@Test(timeout = 60000)
public void testBackoffLogic() throws Exception {
final int maxWaitTime = 7500;
final int numRetries = 10;
final int pauseTime = 500;
final int pauseTime = 100;
final int maxWaitTime =
HConstants.RETRY_BACKOFF[HConstants.RETRY_BACKOFF.length - 1] * pauseTime;
final int numRetries = HConstants.RETRY_BACKOFF.length;
// calculate the wait time, if we just do straight backoff (ignoring the expected time from
// master)
long ignoreExpectedTime = 0;
for (int i = 0; i < 6; i++) {
for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) {
ignoreExpectedTime += HConstants.RETRY_BACKOFF[i] * pauseTime;
}
// the correct wait time, capping at the maxTime/tries + fudge room

View File

@ -25,13 +25,4 @@
<name>hbase.defaults.for.version.skip</name>
<value>true</value>
</property>
<property>
<name>hbase.client.retries.number</name>
<value>5</value>
<description>Maximum retries. Used as maximum for all retryable
operations such as fetching of the root region from root region
server, getting a cell's value, starting a row update, etc.
Default: 10.
</description>
</property>
</configuration>

View File

@ -492,10 +492,13 @@ public final class HConstants {
public static final String CONFIGURATION = "CONFIGURATION";
/**
* This is a retry backoff multiplier table similar to the BSD TCP syn
* backoff table, a bit more aggressive than simple exponential backoff.
* Retrying we multiply hbase.client.pause setting by what we have in this array until we
* run out of array items. Retries beyond this use the last number in the array. So, for
* example, if hbase.client.pause is 1 second, and maximum retries count
* hbase.client.retries.number is 10, we will retry at the following intervals:
* 1, 2, 3, 10, 100, 100, 100, 100, 100, 100.
*/
public static int RETRY_BACKOFF[] = { 1, 1, 1, 2, 2, 4, 4, 8, 16, 32, 64 };
public static int RETRY_BACKOFF[] = { 1, 2, 3, 5, 10, 100 };
public static final String REGION_IMPL = "hbase.hregion.impl";
@ -574,7 +577,7 @@ public final class HConstants {
/**
* Default value of {@link #HBASE_CLIENT_RETRIES_NUMBER}.
*/
public static int DEFAULT_HBASE_CLIENT_RETRIES_NUMBER = 20;
public static int DEFAULT_HBASE_CLIENT_RETRIES_NUMBER = 31;
/**
* Parameter name for client prefetch limit, used as the maximum number of regions
@ -729,7 +732,7 @@ public final class HConstants {
public static final boolean DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG = false;
/** Conf key that specifies timeout value to wait for a region ready */
public static final String LOG_REPLAY_WAIT_REGION_TIMEOUT =
public static final String LOG_REPLAY_WAIT_REGION_TIMEOUT =
"hbase.master.log.replay.wait.region.timeout";
/**
@ -796,7 +799,7 @@ public final class HConstants {
/* Name of old snapshot directory. See HBASE-8352 for details on why it needs to be renamed */
public static final String OLD_SNAPSHOT_DIR_NAME = ".snapshot";
/** Temporary directory used for table creation and deletion */
public static final String HBASE_TEMP_DIRECTORY = ".tmp";

View File

@ -429,7 +429,7 @@ possible configurations would overwhelm and obscure the important.
</property>
<property>
<name>hbase.client.retries.number</name>
<value>14</value>
<value>35</value>
<description>Maximum retries. Used as maximum for all retryable
operations such as the getting of a cell's value, starting a row update,
etc. Retry interval is a rough function based on hbase.client.pause. At

View File

@ -68,6 +68,7 @@ public class TestClientScannerRPCTimeout {
conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, rpcTimeout);
conf.setStrings(HConstants.REGION_SERVER_IMPL, RegionServerWithScanTimeout.class.getName());
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, CLIENT_RETRIES_NUMBER);
conf.setInt(HConstants.HBASE_CLIENT_PAUSE, 1000);
TEST_UTIL.startMiniCluster(1);
}

View File

@ -18,7 +18,12 @@
*/
package org.apache.hadoop.hbase.client;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.lang.reflect.Field;
@ -48,7 +53,6 @@ import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.HConnectionManager.HConnectionImplementation;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.exceptions.RegionServerStoppedException;
import org.apache.hadoop.hbase.exceptions.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterBase;
import org.apache.hadoop.hbase.master.ClusterStatusPublisher;
@ -57,13 +61,14 @@ import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@ -796,9 +801,10 @@ public class TestHCM {
conn.close();
}
@Test
@Ignore ("Test presumes RETRY_BACKOFF will never change; it has") @Test
public void testErrorBackoffTimeCalculation() throws Exception {
final long ANY_PAUSE = 1000;
// TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it should not.
final long ANY_PAUSE = 100;
HRegionInfo ri = new HRegionInfo(TABLE_NAME);
HRegionLocation location = new HRegionLocation(ri, new ServerName("127.0.0.1", 1, 0));
HRegionLocation diffLocation = new HRegionLocation(ri, new ServerName("127.0.0.1", 2, 0));
@ -820,7 +826,7 @@ public class TestHCM {
tracker.reportServerError(location);
tracker.reportServerError(location);
tracker.reportServerError(location);
assertEqualsWithJitter(ANY_PAUSE * 2, tracker.calculateBackoffTime(location, ANY_PAUSE));
assertEqualsWithJitter(ANY_PAUSE * 5, tracker.calculateBackoffTime(location, ANY_PAUSE));
// All of this shouldn't affect backoff for different location.
@ -831,17 +837,17 @@ public class TestHCM {
// But should still work for a different region in the same location.
HRegionInfo ri2 = new HRegionInfo(TABLE_NAME2);
HRegionLocation diffRegion = new HRegionLocation(ri2, location.getServerName());
assertEqualsWithJitter(ANY_PAUSE * 2, tracker.calculateBackoffTime(diffRegion, ANY_PAUSE));
assertEqualsWithJitter(ANY_PAUSE * 5, tracker.calculateBackoffTime(diffRegion, ANY_PAUSE));
// Check with different base.
assertEqualsWithJitter(ANY_PAUSE * 4,
assertEqualsWithJitter(ANY_PAUSE * 10,
tracker.calculateBackoffTime(location, ANY_PAUSE * 2));
// See that time from last error is taken into account. Time shift is applied after jitter,
// so pass the original expected backoff as the base for jitter.
long timeShift = (long)(ANY_PAUSE * 0.5);
timeMachine.setValue(timeBase + timeShift);
assertEqualsWithJitter(ANY_PAUSE * 2 - timeShift,
assertEqualsWithJitter((ANY_PAUSE * 5) - timeShift,
tracker.calculateBackoffTime(location, ANY_PAUSE), ANY_PAUSE * 2);
// However we should not go into negative.

View File

@ -29,25 +29,10 @@
tests to be responsive.
</description>
</property>
<property>
<name>hbase.client.pause</name>
<value>1000</value>
<description>General client pause value. Used mostly as value to wait
before running a retry of a failed get, region lookup, etc.</description>
</property>
<property>
<name>hbase.defaults.for.version.skip</name>
<value>true</value>
</property>
<property>
<name>hbase.client.retries.number</name>
<value>20</value>
<description>Maximum retries. Used as maximum for all retryable
operations such as fetching of the root region from root region
server, getting a cell's value, starting a row update, etc.
Default: 20.
</description>
</property>
<property>
<name>hbase.server.thread.wakefrequency</name>
<value>1000</value>