From 049120cddeda3680a16df4dcd55a935fd562a4c2 Mon Sep 17 00:00:00 2001 From: Jean-Daniel Cryans Date: Wed, 23 Jun 2010 20:33:48 +0000 Subject: [PATCH] HBASE-2772 Scan doesn't recover from region server failure HBASE-2775 Update of hadoop jar in HBASE-2771 broke TestMultiClusters git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@957333 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 2 +- .../apache/hadoop/hbase/client/HTable.java | 31 +++++----- .../apache/hadoop/hbase/MiniHBaseCluster.java | 14 ++++- .../hbase/client/TestScannerTimeout.java | 56 ++++++++++++------- 4 files changed, 68 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 108f2f9b84c..46bc035aaa5 100644 --- a/pom.xml +++ b/pom.xml @@ -443,7 +443,7 @@ 1.6 UTF-8 0.21.0-SNAPSHOT - 0.20.3-append-r956776 + 0.20.3-append-r956776+1240 1.2 1.1.1 diff --git a/src/main/java/org/apache/hadoop/hbase/client/HTable.java b/src/main/java/org/apache/hadoop/hbase/client/HTable.java index e2ffeec938b..23b6e494748 100644 --- a/src/main/java/org/apache/hadoop/hbase/client/HTable.java +++ b/src/main/java/org/apache/hadoop/hbase/client/HTable.java @@ -942,19 +942,24 @@ public class HTable implements HTableInterface { values = getConnection().getRegionServerWithRetries(callable); } } catch (DoNotRetryIOException e) { - long timeout = lastNext + scannerTimeout; - if (e instanceof UnknownScannerException && - timeout < System.currentTimeMillis()) { - long elapsed = System.currentTimeMillis() - lastNext; - ScannerTimeoutException ex = new ScannerTimeoutException( - elapsed + "ms passed since the last invocation, " + - "timeout is currently set to " + scannerTimeout); - ex.initCause(e); - throw ex; - } - Throwable cause = e.getCause(); - if (cause == null || !(cause instanceof NotServingRegionException)) { - throw e; + if (e instanceof UnknownScannerException) { + long timeout = lastNext + scannerTimeout; + // If we are over the timeout, throw this exception to the client + // Else, it's because the 
region moved and we used the old id + // against the new region server; reset the scanner. + if (timeout < System.currentTimeMillis()) { + long elapsed = System.currentTimeMillis() - lastNext; + ScannerTimeoutException ex = new ScannerTimeoutException( + elapsed + "ms passed since the last invocation, " + + "timeout is currently set to " + scannerTimeout); + ex.initCause(e); + throw ex; + } + } else { + Throwable cause = e.getCause(); + if (cause == null || !(cause instanceof NotServingRegionException)) { + throw e; + } } // Else, its signal from depths of ScannerCallable that we got an // NSRE on a next and that we need to reset the scanner. diff --git a/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 68f334653cd..e0865d58f30 100644 --- a/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -386,15 +386,25 @@ public class MiniHBaseCluster { /** * @return Index into List of {@link MiniHBaseCluster#getRegionServerThreads()} - * of HRS carrying .META. Returns -1 if none found. + * of HRS carrying .META. Returns -1 if none found. */ public int getServerWithMeta() { + return getServerWith(HRegionInfo.FIRST_META_REGIONINFO.getRegionName()); + } + + /** + * Get the location of the specified region + * @param regionName Name of the region in bytes + * @return Index into List of {@link MiniHBaseCluster#getRegionServerThreads()} + * of HRS carrying regionName. Returns -1 if none found. 
+ */ + public int getServerWith(byte[] regionName) { int index = -1; int count = 0; for (JVMClusterUtil.RegionServerThread rst: getRegionServerThreads()) { HRegionServer hrs = rst.getRegionServer(); HRegion metaRegion = - hrs.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName()); + hrs.getOnlineRegion(regionName); if (metaRegion != null) { index = count; break; diff --git a/src/test/java/org/apache/hadoop/hbase/client/TestScannerTimeout.java b/src/test/java/org/apache/hadoop/hbase/client/TestScannerTimeout.java index 56fbbf98e0a..39ad3f4d6a7 100644 --- a/src/test/java/org/apache/hadoop/hbase/client/TestScannerTimeout.java +++ b/src/test/java/org/apache/hadoop/hbase/client/TestScannerTimeout.java @@ -23,7 +23,11 @@ public class TestScannerTimeout { TEST_UTIL = new HBaseTestingUtility(); final Log LOG = LogFactory.getLog(getClass()); - private final byte[] someBytes = Bytes.toBytes("f"); + private final static byte[] SOME_BYTES = Bytes.toBytes("f"); + private final static byte[] TABLE_NAME = Bytes.toBytes("t"); + private final static int NB_ROWS = 10; + private final static int SCANNER_TIMEOUT = 6000; + private static HTable table; /** * @throws java.lang.Exception @@ -31,8 +35,14 @@ public class TestScannerTimeout { @BeforeClass public static void setUpBeforeClass() throws Exception { Configuration c = TEST_UTIL.getConfiguration(); - c.setInt("hbase.regionserver.lease.period", 1000); - TEST_UTIL.startMiniCluster(1); + c.setInt("hbase.regionserver.lease.period", SCANNER_TIMEOUT); + TEST_UTIL.startMiniCluster(2); + table = TEST_UTIL.createTable(Bytes.toBytes("t"), SOME_BYTES); + for (int i = 0; i < NB_ROWS; i++) { + Put put = new Put(Bytes.toBytes(i)); + put.add(SOME_BYTES, SOME_BYTES, SOME_BYTES); + table.put(put); + } } /** @@ -48,13 +58,7 @@ public class TestScannerTimeout { */ @Before public void setUp() throws Exception { - } - - /** - * @throws java.lang.Exception - */ - @After - public void tearDown() throws Exception { + 
TEST_UTIL.ensureSomeRegionServersAvailable(2); } /** @@ -63,22 +67,16 @@ public class TestScannerTimeout { */ @Test public void test2481() throws Exception { - int initialCount = 10; - HTable t = TEST_UTIL.createTable(Bytes.toBytes("t"), someBytes); - for (int i = 0; i < initialCount; i++) { - Put put = new Put(Bytes.toBytes(i)); - put.add(someBytes, someBytes, someBytes); - t.put(put); - } Scan scan = new Scan(); - ResultScanner r = t.getScanner(scan); + ResultScanner r = table.getScanner(scan); int count = 0; try { Result res = r.next(); while (res != null) { count++; if (count == 5) { - Thread.sleep(1500); + // Sleep just a bit more to be sure + Thread.sleep(SCANNER_TIMEOUT+100); } res = r.next(); } @@ -88,4 +86,24 @@ public class TestScannerTimeout { } fail("We should be timing out"); } + + /** + * Test that scanner can continue even if the region server it was reading + * from failed. Before 2772, it reused the same scanner id. + * @throws Exception + */ + @Test + public void test2772() throws Exception { + int rs = TEST_UTIL.getHBaseCluster().getServerWith( + TEST_UTIL.getHBaseCluster().getRegions( + TABLE_NAME).get(0).getRegionName()); + Scan scan = new Scan(); + ResultScanner r = table.getScanner(scan); + // This takes exactly 5 seconds + TEST_UTIL.getHBaseCluster().getRegionServer(rs).abort("die!"); + Result[] results = r.next(NB_ROWS); + assertEquals(NB_ROWS, results.length); + r.close(); + + } }