From 0042d6d4c877b3adecbf5247f200d2e25be128f9 Mon Sep 17 00:00:00 2001 From: Mikhail Antonov Date: Sun, 15 May 2016 20:49:00 -0700 Subject: [PATCH] HBASE-15615 Wrong sleep time when RegionServerCallable need retry (Guanghao Zhang) --- .../hadoop/hbase/client/ConnectionUtils.java | 3 + .../client/RegionAdminServiceCallable.java | 2 +- .../hbase/client/RegionServerCallable.java | 3 +- .../hbase/client/RpcRetryingCaller.java | 4 +- .../hbase/client/TestConnectionUtils.java | 20 +++ .../apache/hadoop/hbase/client/TestHCM.java | 137 ++++++++++++++---- 6 files changed, 138 insertions(+), 31 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java index 63861beebab..d9e460bf914 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java @@ -54,6 +54,9 @@ public class ConnectionUtils { if (ntries >= HConstants.RETRY_BACKOFF.length) { ntries = HConstants.RETRY_BACKOFF.length - 1; } + if (ntries < 0) { + ntries = 0; + } long normalPause = pause * HConstants.RETRY_BACKOFF[ntries]; long jitter = (long)(normalPause * RANDOM.nextFloat() * 0.01f); // 1% possible jitter diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java index 305cb93271e..675a2f94fbb 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java @@ -138,7 +138,7 @@ public abstract class RegionAdminServiceCallable implements RetryingCallable< @Override public long sleep(long pause, int tries) { - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || connection.isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java index 45acd160c48..0997b10e2c5 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java @@ -134,8 +134,7 @@ public abstract class RegionServerCallable implements RetryingCallable { @Override public long sleep(long pause, int tries) { - // Tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || getConnection().isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java index c29fc814eef..fd74161f321 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java @@ -159,8 +159,8 @@ public class RpcRetryingCaller { } // If the server is dead, we need to wait a little before retrying, to give // a chance to the regions to be - // tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - expectedSleep = callable.sleep(pause, tries + 1); + // get right pause time, start by RETRY_BACKOFF[0] * pause + expectedSleep = callable.sleep(pause, tries); // If, after the planned sleep, there won't be enough time left, we stop now. long duration = singleCallDuration(expectedSleep); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java index 649d6746275..3d449ae93e1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java @@ -19,6 +19,7 @@ */ package org.apache.hadoop.hbase.client; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -53,4 +54,23 @@ public class TestConnectionUtils { assertTrue(retyTimeSet.size() > (retries.length * 0.80)); } + @Test + public void testGetPauseTime() { + long pauseTime; + long baseTime = 100; + pauseTime = ConnectionUtils.getPauseTime(baseTime, -1); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[0])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[0] * 1.01f)); + + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = ConnectionUtils.getPauseTime(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + int length = HConstants.RETRY_BACKOFF.length; + pauseTime = ConnectionUtils.getPauseTime(baseTime, length); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[length - 1])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[length - 1] * 1.01f)); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java index a8c7113885f..e592fa83031 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java @@ -67,6 +67,7 @@ import org.apache.hadoop.hbase.exceptions.RegionMovedException; import org.apache.hadoop.hbase.filter.Filter; import org.apache.hadoop.hbase.filter.FilterBase; import org.apache.hadoop.hbase.ipc.RpcClient; +import org.apache.hadoop.hbase.ipc.RpcControllerFactory; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.Region; @@ -120,14 +121,25 @@ public class TestHCM { */ public static class SleepAndFailFirstTime extends BaseRegionObserver { static final AtomicLong ct = new AtomicLong(0); + static final String SLEEP_TIME_CONF_KEY = + "hbase.coprocessor.SleepAndFailFirstTime.sleepTime"; + static final long DEFAULT_SLEEP_TIME = 20000; + static final AtomicLong sleepTime = new AtomicLong(DEFAULT_SLEEP_TIME); public SleepAndFailFirstTime() { } + @Override + public void postOpen(ObserverContext c) { + RegionCoprocessorEnvironment env = c.getEnvironment(); + Configuration conf = env.getConfiguration(); + sleepTime.set(conf.getLong(SLEEP_TIME_CONF_KEY, DEFAULT_SLEEP_TIME)); + } + @Override public void preGetOp(final ObserverContext e, final Get get, final List results) throws IOException { - Threads.sleep(20000); + Threads.sleep(sleepTime.get()); if (ct.incrementAndGet() == 1){ throw new IOException("first call I fail"); } @@ -357,6 +369,96 @@ public class TestHCM { } } + /** + * Test starting from 0 index when RpcRetryingCaller calculate the backoff time. + */ + @Test + public void testRpcRetryingCallerSleep() throws Exception { + HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testRpcRetryingCallerSleep"); + hdt.addCoprocessorWithSpec("|" + SleepAndFailFirstTime.class.getName() + "||" + + SleepAndFailFirstTime.SLEEP_TIME_CONF_KEY + "=2000"); + TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }).close(); + + Configuration c = new Configuration(TEST_UTIL.getConfiguration()); + c.setInt(HConstants.HBASE_CLIENT_PAUSE, 3000); + c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 4000); + + Connection connection = ConnectionFactory.createConnection(c); + Table t = connection.getTable(TableName.valueOf("HCM-testRpcRetryingCallerSleep")); + if (t instanceof HTable) { + HTable table = (HTable) t; + table.setOperationTimeout(8000); + // Check that it works. Because 2s + 3s * RETRY_BACKOFF[0] + 2s < 8s + table.get(new Get(FAM_NAM)); + + // Resetting and retrying. + SleepAndFailFirstTime.ct.set(0); + try { + table.setOperationTimeout(6000); + // Will fail this time. After sleep, there are not enough time for second retry + // Beacuse 2s + 3s + 2s > 6s + table.get(new Get(FAM_NAM)); + Assert.fail("We expect an exception here"); + } catch (SocketTimeoutException e) { + LOG.info("We received an exception, as expected ", e); + } catch (IOException e) { + Assert.fail("Wrong exception:" + e.getMessage()); + } finally { + table.close(); + connection.close(); + } + } + } + + @Test + public void testCallableSleep() throws Exception { + long pauseTime; + long baseTime = 100; + TableName tableName = TableName.valueOf("HCM-testCallableSleep"); + HTable table = TEST_UTIL.createTable(tableName, FAM_NAM); + RegionServerCallable regionServerCallable = new RegionServerCallable( + TEST_UTIL.getConnection(), tableName, ROW) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + regionServerCallable.prepare(false); + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = regionServerCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + RegionAdminServiceCallable regionAdminServiceCallable = + new RegionAdminServiceCallable( + (ClusterConnection) TEST_UTIL.getConnection(), new RpcControllerFactory( + TEST_UTIL.getConfiguration()), tableName, ROW) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + regionAdminServiceCallable.prepare(false); + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = regionAdminServiceCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + MasterCallable masterCallable = new MasterCallable((HConnection) TEST_UTIL.getConnection()) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = masterCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + } + private void testConnectionClose(boolean allowsInterrupt) throws Exception { TableName tableName = TableName.valueOf("HCM-testConnectionClose" + allowsInterrupt); TEST_UTIL.createTable(tableName, FAM_NAM).close(); @@ -1139,7 +1241,7 @@ public class TestHCM { } } - @Ignore ("Test presumes RETRY_BACKOFF will never change; it has") @Test + @Test public void testErrorBackoffTimeCalculation() throws Exception { // TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it should not. final long ANY_PAUSE = 100; @@ -1159,40 +1261,23 @@ public class TestHCM { // Check some backoff values from HConstants sequence. tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(location, ANY_PAUSE)); tracker.reportServerError(location); tracker.reportServerError(location); tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE * 5, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[3], + tracker.calculateBackoffTime(location, ANY_PAUSE)); // All of this shouldn't affect backoff for different location. assertEquals(0, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); tracker.reportServerError(diffLocation); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); // Check with different base. - assertEqualsWithJitter(ANY_PAUSE * 10, + assertEqualsWithJitter(ANY_PAUSE * 2 * HConstants.RETRY_BACKOFF[3], tracker.calculateBackoffTime(location, ANY_PAUSE * 2)); - - // See that time from last error is taken into account. Time shift is applied after jitter, - // so pass the original expected backoff as the base for jitter. - long timeShift = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + timeShift); - assertEqualsWithJitter((ANY_PAUSE * 5) - timeShift, - tracker.calculateBackoffTime(location, ANY_PAUSE), ANY_PAUSE * 2); - - // However we should not go into negative. - timeMachine.setValue(timeBase + ANY_PAUSE * 100); - assertEquals(0, tracker.calculateBackoffTime(location, ANY_PAUSE)); - - // We also should not go over the boundary; last retry would be on it. - long timeLeft = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + largeAmountOfTime - timeLeft); - assertTrue(tracker.canRetryMore(1)); - tracker.reportServerError(location); - assertEquals(timeLeft, tracker.calculateBackoffTime(location, ANY_PAUSE)); - timeMachine.setValue(timeBase + largeAmountOfTime); - assertFalse(tracker.canRetryMore(1)); } finally { EnvironmentEdgeManager.reset(); }