From c925dd3d765485f0f8a2514dfc558a28e47eb3ed Mon Sep 17 00:00:00 2001 From: Zhihong Yu Date: Wed, 4 Sep 2013 16:04:04 +0000 Subject: [PATCH] HBASE-9387 Region could get lost during assignment (Ted Yu) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1520058 13f79535-47bb-0310-9956-ffa450edef68 --- .../handler/OpenRegionHandler.java | 22 ++++++-- .../hbase/MockRegionServerServices.java | 6 +- .../hbase/TestFullLogReconstruction.java | 4 +- .../handler/TestOpenRegionHandler.java | 56 +++++++++++++++++++ 4 files changed, 78 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java index ec2dc2b5c9b..df3f6d37d8a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.regionserver.RegionServerAccounting; import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.zookeeper.ZKAssign; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.zookeeper.KeeperException; /** @@ -359,7 +360,7 @@ public class OpenRegionHandler extends EventHandler { * @return whether znode is successfully transitioned to OPENED state. 
* @throws IOException */ - private boolean transitionToOpened(final HRegion r) throws IOException { + boolean transitionToOpened(final HRegion r) throws IOException { boolean result = false; HRegionInfo hri = r.getRegionInfo(); final String name = hri.getRegionNameAsString(); @@ -367,11 +368,20 @@ public class OpenRegionHandler extends EventHandler { try { if (ZKAssign.transitionNodeOpened(this.server.getZooKeeper(), hri, this.server.getServerName(), this.version) == -1) { - LOG.warn("Completed the OPEN of region " + name + - " but when transitioning from " + - " OPENING to OPENED got a version mismatch, someone else clashed " + - "so now unassigning -- closing region on server: " + - this.server.getServerName()); + String warnMsg = "Completed the OPEN of region " + name + + " but when transitioning from " + " OPENING to OPENED "; + try { + String node = ZKAssign.getNodeName(this.server.getZooKeeper(), hri.getEncodedName()); + if (ZKUtil.checkExists(this.server.getZooKeeper(), node) < 0) { + // if the znode no longer exists, abort the region server + rsServices.abort(warnMsg + "the znode disappeared", null); + } else { + LOG.warn(warnMsg + "got a version mismatch, someone else clashed; " + + "so now unassigning -- closing region on server: " + this.server.getServerName()); + } + } catch (KeeperException ke) { + rsServices.abort(warnMsg, ke); + } } else { LOG.debug("Transitioned " + r.getRegionInfo().getEncodedName() + " to OPENED in zk on " + this.server.getServerName()); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java index 440f29988dd..b2c434cf352 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java @@ -55,6 +55,7 @@ class MockRegionServerServices implements RegionServerServices { private ZooKeeperWatcher zkw = null; private ServerName serverName = 
null; private RpcServerInterface rpcServer = null; + private volatile boolean abortRequested; MockRegionServerServices(ZooKeeperWatcher zkw) { this.zkw = zkw; @@ -154,7 +155,8 @@ class MockRegionServerServices implements RegionServerServices { @Override public void abort(String why, Throwable e) { - //no-op + this.abortRequested = true; + stop(why); } @Override @@ -169,7 +171,7 @@ class MockRegionServerServices implements RegionServerServices { @Override public boolean isAborted() { - return false; + return this.abortRequested; } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java index 01009596311..95253ebb3c8 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java @@ -55,7 +55,7 @@ public class TestFullLogReconstruction { c.setInt("ipc.client.connect.max.retries", 1); c.setInt("dfs.client.block.recovery.retries", 1); c.setInt(HConstants.ZK_SESSION_TIMEOUT, 1000); - TEST_UTIL.startMiniCluster(2); + TEST_UTIL.startMiniCluster(3); } /** @@ -109,4 +109,4 @@ public class TestFullLogReconstruction { assertEquals(count, newCount); table.close(); } -} \ No newline at end of file +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java index 1fb038e08df..7c7cf5a4ed1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java @@ -129,6 +129,62 @@ public class TestOpenRegionHandler { } } + /** + * Test the openregionhandler can deal with perceived failure of transitioning to OPENED state + * due to 
intermittent zookeeper malfunctioning. + * @see <a href="https://issues.apache.org/jira/browse/HBASE-9387">HBASE-9387</a> + * @throws IOException + * @throws NodeExistsException + * @throws KeeperException + */ + @Test + public void testRegionServerAbortionDueToFailureTransitioningToOpened() + throws IOException, NodeExistsException, KeeperException { + final Server server = new MockServer(HTU); + final RegionServerServices rss = HTU.createMockRegionServerService(); + + HTableDescriptor htd = TEST_HTD; + final HRegionInfo hri = TEST_HRI; + HRegion region = + HRegion.createHRegion(hri, HTU.getDataTestDir(), HTU + .getConfiguration(), htd); + assertNotNull(region); + try { + OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) { + boolean transitionToOpened(final HRegion r) throws IOException { + // remove znode simulating intermittent zookeeper connection issue + ZooKeeperWatcher zkw = this.server.getZooKeeper(); + String node = ZKAssign.getNodeName(zkw, hri.getEncodedName()); + try { + ZKUtil.deleteNodeFailSilent(zkw, node); + } catch (KeeperException e) { + throw new RuntimeException("Ugh failed delete of " + node, e); + } + // then try to transition to OPENED + return super.transitionToOpened(r); + } + }; + rss.getRegionsInTransitionInRS().put( + hri.getEncodedNameAsBytes(), Boolean.TRUE); + // Call process without first creating OFFLINE region in zk, see if + // exception or just quiet return (expected). + handler.process(); + rss.getRegionsInTransitionInRS().put( + hri.getEncodedNameAsBytes(), Boolean.TRUE); + ZKAssign.createNodeOffline(server.getZooKeeper(), hri, server.getServerName()); + // Call process again but this time yank the zk znode out from under it + // post OPENING; again will expect it to come back w/o NPE or exception. 
+ handler.process(); + } catch (IOException ioe) { + } finally { + HRegion.closeHRegion(region); + } + // Region server is expected to abort due to OpenRegionHandler perceiving transitioning + // to OPENED as failed + // This corresponds to the second handler.process() call above. + assertTrue("region server should have aborted", rss.isAborted()); + } + @Test public void testFailedOpenRegion() throws Exception { Server server = new MockServer(HTU);