diff --git a/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java b/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
index 7defac0d046..ea9a04f9cc4 100644
--- a/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
+++ b/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java
@@ -71,7 +71,6 @@ public class LocalHBaseCluster {
   /** 'local:' */
   public static final String LOCAL_COLON = LOCAL + ":";
   private final Configuration conf;
-  private final Class masterClass;
   private final Class regionServerClass;
 
   /**
@@ -145,9 +144,6 @@ public class LocalHBaseCluster {
     conf.set(HConstants.MASTER_PORT, "0");
     conf.set(HConstants.REGIONSERVER_PORT, "0");
     // Start the HMasters.
-    this.masterClass =
-      (Class)conf.getClass(HConstants.MASTER_IMPL,
-      masterClass);
     for (int i = 0; i < noMasters; i++) {
       addMaster(new Configuration(conf), i);
     }
@@ -199,9 +195,8 @@ public class LocalHBaseCluster {
     // Create each master with its own Configuration instance so each has
     // its HConnection instance rather than share (see HBASE_INSTANCES down in
     // the guts of HConnectionManager.
-    JVMClusterUtil.MasterThread mt =
-      JVMClusterUtil.createMasterThread(c,
-        this.masterClass, index);
+    JVMClusterUtil.MasterThread mt = JVMClusterUtil.createMasterThread(c,
+        (Class) c.getClass(HConstants.MASTER_IMPL, HMaster.class), index);
     this.masterThreads.add(mt);
     return mt;
   }
diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 97fa18dcd2c..bf93970186e 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -609,9 +609,14 @@ public class AssignmentManager extends ZooKeeperListener {
         }
         failoverProcessedRegions.put(encodedRegionName, regionInfo);
         break;
-
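+      // A region znode may legitimately be in SPLITTING or SPLIT state here; just log it.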
+      case RS_ZK_REGION_SPLITTING:
+        LOG.debug("Processed region in state: " + et);
+        break;
+      case RS_ZK_REGION_SPLIT:
+        LOG.debug("Processed region in state: " + et);
+        break;
       default:
-        throw new IllegalStateException("Received event is not valid.");
+        throw new IllegalStateException("Received region in state: " + et + " is not valid");
     }
   }
 }
@@ -2547,6 +2552,19 @@ public class AssignmentManager extends ZooKeeperListener {
         enableTableIfNotDisabledOrDisablingOrEnabling(disabled,
             disablingOrEnabling, tableName);
       } else {
+        // If the region is offline and split, check its znode.
+        if (regionInfo.isOffline() && regionInfo.isSplit()) {
+          String node = ZKAssign.getNodeName(this.watcher, regionInfo
+              .getEncodedName());
+          Stat stat = new Stat();
+          byte[] data = ZKUtil.getDataNoWatch(this.watcher, node, stat);
+          // If the znode does not exist, don't consider this region.
+          if (data == null) {
+            LOG.debug("Region " + regionInfo.getRegionNameAsString()
+                + " split is completed. Hence need not add to regions list");
+            continue;
+          }
+        }
         // Region is being served and on an active server
         // add only if region not in disabled and enabling table
         if (false == checkIfRegionBelongsToDisabled(regionInfo)
diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 95f51288720..947ec5f80ad 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -611,7 +611,7 @@ Server {
     status.setStatus("Starting balancer and catalog janitor");
     this.balancerChore = getAndStartBalancerChore(this);
     this.catalogJanitorChore = new CatalogJanitor(this, this);
-    Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+    startCatalogJanitorChore();
 
     registerMBean();
 
@@ -628,6 +628,14 @@ Server {
       }
     }
   }
+
+  /**
+   * Starts the catalog janitor chore. Broken out into its own method so that
+   * tests, e.g. master restart scenarios, can override it.
+   */
+  protected void startCatalogJanitorChore() {
+    Threads.setDaemonThreadRunning(catalogJanitorChore.getThread());
+  }
 
   /**
    * Override to change master's splitLogAfterStartup. Used testing
diff --git a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
index 33cfb090b0d..06da2ce679e 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
@@ -269,7 +269,7 @@ public class ServerShutdownHandler extends EventHandler {
 
     // Skip regions that were in transition unless CLOSING or PENDING_CLOSE
     for (RegionState rit : regionsInTransition) {
-      if (!rit.isClosing() && !rit.isPendingClose()) {
+      if (!rit.isClosing() && !rit.isPendingClose() && !rit.isSplitting()) {
         LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
           " from list of regions to assign because in RIT; region state: " +
           rit.getState());
diff --git a/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index 4093b7d12c0..a24f937698f 100644
--- a/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ b/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -29,17 +29,20 @@ import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.*;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.executor.EventHandler.EventType;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.data.Stat;
@@ -354,6 +357,178 @@ public class TestSplitTransactionOnCluster {
       cluster.getMaster().setCatalogJanitorEnabled(true);
     }
   }
+
+  /**
+   * Verifies HBASE-5806: splitting is only partially done and the master goes
+   * down while the split znode is still in either SPLIT or SPLITTING state.
+   *
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   * @throws DeserializationException
+   */
+  @Test(timeout = 300000)
+  public void testMasterRestartWhenSplittingIsPartial()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException, DeserializationException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartWhenSplittingIsPartial");
+
+    // Create table then get the single region for our new table.
+    HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+    List<HRegion> regions = cluster.getRegions(tableName);
+    HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+    int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+    // Turn off balancer so it doesn't cut in and mess up our placements.
+    this.admin.balanceSwitch(false);
+    // Turn off the meta scanner so it doesn't remove the parent on us.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    try {
+      // Add a bit of load up into the table so splittable.
+      TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+      // Get region pre-split.
+      HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+      printOutRegions(server, "Initial regions: ");
+      // Now, before we split, set special flag in master, a flag that has
+      // it FAIL the processing of split.
+      SplitRegionHandler.TEST_SKIP = true;
+      // Now try splitting and it should work.
+
+      this.admin.split(hri.getRegionNameAsString());
+      while (!(cluster.getRegions(tableName).size() >= 2)) {
+        LOG.debug("Waiting on region to split");
+        Thread.sleep(100);
+      }
+
+      // Get daughters
+      List<HRegion> daughters = cluster.getRegions(tableName);
+      assertTrue(daughters.size() >= 2);
+      // Assert the ephemeral node is up in zk.
+      String path = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stats = t.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(path, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+          + stats);
+      byte[] bytes = ZKAssign.getData(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      RegionTransition rtd = RegionTransition.parseFrom(bytes);
+      // State could be SPLIT or SPLITTING.
+      assertTrue(rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLIT)
+          || rtd.getEventType().equals(EventType.RS_ZK_REGION_SPLITTING));
+
+      // Abort and wait for the new master.
+      MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      // Update the hri to be offlined and split.
+      hri.setOffline(true);
+      hri.setSplit(true);
+      ServerName regionServerOfRegion = master.getAssignmentManager()
+          .getRegionServerOfRegion(hri);
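+      // The split znode is still present, so the restarted master should still
+      // have the parent region assigned to a server.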
+      assertTrue(regionServerOfRegion != null);
+
+    } finally {
+      // Set this flag back.
+      SplitRegionHandler.TEST_SKIP = false;
+      admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+
+  /**
+   * Verifies HBASE-5806: splitting completes, but the master is killed and
+   * restarted before the CatalogJanitor can remove the parent region.
+   *
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws NodeExistsException
+   * @throws KeeperException
+   */
+  @Test (timeout = 300000)
+  public void testMasterRestartAtRegionSplitPendingCatalogJanitor()
+      throws IOException, InterruptedException, NodeExistsException,
+      KeeperException {
+    final byte[] tableName = Bytes.toBytes("testMasterRestartAtRegionSplitPendingCatalogJanitor");
+
+    // Create table then get the single region for our new table.
+    HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
+
+    List<HRegion> regions = cluster.getRegions(tableName);
+    HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+
+    int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
+
+    // Turn off balancer so it doesn't cut in and mess up our placements.
+    this.admin.balanceSwitch(false);
+    // Turn off the meta scanner so it doesn't remove the parent on us.
+    cluster.getMaster().setCatalogJanitorEnabled(false);
+    try {
+      // Add a bit of load up into the table so splittable.
+      TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
+      // Get region pre-split.
+      HRegionServer server = cluster.getRegionServer(tableRegionIndex);
+      printOutRegions(server, "Initial regions: ");
+
+      this.admin.split(hri.getRegionNameAsString());
+      while (!(cluster.getRegions(tableName).size() >= 2)) {
+        LOG.debug("Waiting on region to split");
+        Thread.sleep(100);
+      }
+
+      // Get daughters
+      List<HRegion> daughters = cluster.getRegions(tableName);
+      assertTrue(daughters.size() >= 2);
+      // Assert the ephemeral node is up in zk.
+      String path = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stats = t.getConnection().getZooKeeperWatcher()
+          .getRecoverableZooKeeper().exists(path, false);
+      LOG.info("EPHEMERAL NODE BEFORE SERVER ABORT, path=" + path + ", stats="
+          + stats);
+      String node = ZKAssign.getNodeName(t.getConnection()
+          .getZooKeeperWatcher(), hri.getEncodedName());
+      Stat stat = new Stat();
+      byte[] data = ZKUtil.getDataNoWatch(t.getConnection()
+          .getZooKeeperWatcher(), node, stat);
+      // Wait until the region server finishes the split and deletes the znode.
+      while (data != null) {
+        Thread.sleep(1000);
+        data = ZKUtil.getDataNoWatch(t.getConnection().getZooKeeperWatcher(),
+            node, stat);
+      }
+      MockMasterWithoutCatalogJanitor master = abortAndWaitForMaster();
+
+      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
+
+      hri.setOffline(true);
+      hri.setSplit(true);
+      ServerName regionServerOfRegion = master.getAssignmentManager()
+          .getRegionServerOfRegion(hri);
+      assertTrue(regionServerOfRegion == null);
+    } finally {
+      // Set this flag back.
+      SplitRegionHandler.TEST_SKIP = false;
+      this.admin.balanceSwitch(true);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+    }
+  }
+
+  private MockMasterWithoutCatalogJanitor abortAndWaitForMaster()
+      throws IOException, InterruptedException {
+    cluster.abortMaster(0);
+    cluster.waitOnMaster(0);
+    cluster.getConfiguration().setClass(HConstants.MASTER_IMPL,
+        MockMasterWithoutCatalogJanitor.class, HMaster.class);
+    MockMasterWithoutCatalogJanitor master =
+        (MockMasterWithoutCatalogJanitor) cluster.startMaster().getMaster();
+    cluster.waitForActiveAndReadyMaster();
+    return master;
+  }
 
   private void split(final HRegionInfo hri, final HRegionServer server,
       final int regionCount)
@@ -459,6 +634,18 @@ public class TestSplitTransactionOnCluster {
       Thread.sleep(100);
     }
   }
+
+  public static class MockMasterWithoutCatalogJanitor extends HMaster {
+
+    public MockMasterWithoutCatalogJanitor(Configuration conf) throws IOException, KeeperException,
+        InterruptedException {
+      super(conf);
+    }
+
+    protected void startCatalogJanitorChore() {
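+      // Deliberately do not start the CatalogJanitor chore, so the split parent
+      // is not cleaned up while the test is running.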
+      LOG.debug("Customised master executed.");
+    }
+  }
 
   @org.junit.Rule
   public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =