From baecf33ea6d977cdaadba055ad32b297134a5d8c Mon Sep 17 00:00:00 2001 From: Umesh Agashe Date: Thu, 7 Sep 2017 13:42:36 -0700 Subject: [PATCH] HBASE-18543 [AMv2] Fixed and re-enabled TestMasterFailover * testSimpleMasterFailover - fixed and verified * testPendingOpenOrCloseWhenMasterFailover - removed as logic is based on old code and no longer relevant. TestServerCrashProcedure tests assignments with crashing master and region servers * testMetaInTransitionWhenMasterFailover - verified that it is fixed by patch for HBASE-18511. Signed-off-by: Michael Stack --- .../hbase/master/TestMasterFailover.java | 177 +----------------- 1 file changed, 10 insertions(+), 167 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index cf57aa30d8a..9cbc1973de3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -1,4 +1,4 @@ -/** +/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -28,23 +28,13 @@ import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ClusterStatus; -import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.RegionLocator; -import org.apache.hadoop.hbase.client.Table; -import org.apache.hadoop.hbase.master.assignment.RegionStates; -import org.apache.hadoop.hbase.master.assignment.RegionStateStore; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; @@ -53,32 +43,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.testclassification.FlakeyTests; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.FSTableDescriptors; -import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; @Category({FlakeyTests.class, LargeTests.class}) -@Ignore // Needs to be rewritten for AMv2. Uses tricks not ordained when up on AMv2. public class TestMasterFailover { private static final Log LOG = LogFactory.getLog(TestMasterFailover.class); - HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c, - final HTableDescriptor htd) - throws IOException { - HRegion r = HBaseTestingUtility.createRegionAndWAL(hri, rootdir, c, htd); - // The above call to create a region will create an wal file. Each - // log file create will also create a running thread to do syncing. We need - // to close out this log else we will have a running thread trying to sync - // the file system continuously which is ugly when dfs is taken away at the - // end of the test. - HBaseTestingUtility.closeRegionAndWAL(r); - return r; - } - // TODO: Next test to add is with testing permutations of the RIT or the RS // killed are hosting ROOT and hbase:meta regions. @@ -92,7 +65,6 @@ public class TestMasterFailover { * Starts with three masters. Kills a backup master. Then kills the active * master. Ensures the final master becomes active and we can still contact * the cluster. - * @throws Exception */ @Test (timeout=240000) public void testSimpleMasterFailover() throws Exception { @@ -157,7 +129,7 @@ public class TestMasterFailover { assertEquals(2, masterThreads.size()); int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize(); LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers"); - assertEquals(4, rsCount); + assertEquals(3, rsCount); // Check that ClusterStatus reports the correct active and backup masters assertNotNull(active); @@ -190,142 +162,12 @@ public class TestMasterFailover { int rss = status.getServersSize(); LOG.info("Active master " + mastername.getServerName() + " managing " + rss + " region servers"); - assertEquals(4, rss); + assertEquals(3, rss); // Stop the cluster TEST_UTIL.shutdownMiniCluster(); } - /** - * Test region in pending_open/close when master failover - */ - @Test (timeout=180000) - public void testPendingOpenOrCloseWhenMasterFailover() throws Exception { - final int NUM_MASTERS = 1; - final int NUM_RS = 1; - - // Create config to use for this cluster - Configuration conf = HBaseConfiguration.create(); - - // Start the cluster - HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); - TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS); - MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); - log("Cluster started"); - - // get all the master threads - List masterThreads = cluster.getMasterThreads(); - assertEquals(1, masterThreads.size()); - - // only one master thread, let's wait for it to be initialized - assertTrue(cluster.waitForActiveAndReadyMaster()); - HMaster master = masterThreads.get(0).getMaster(); - assertTrue(master.isActiveMaster()); - assertTrue(master.isInitialized()); - - // Create a table with a region online - Table onlineTable = TEST_UTIL.createTable(TableName.valueOf("onlineTable"), "family"); - onlineTable.close(); - // Create a table in META, so it has a region offline - HTableDescriptor offlineTable = new HTableDescriptor( - TableName.valueOf(Bytes.toBytes("offlineTable"))); - offlineTable.addFamily(new HColumnDescriptor(Bytes.toBytes("family"))); - - FileSystem filesystem = FileSystem.get(conf); - Path rootdir = FSUtils.getRootDir(conf); - FSTableDescriptors fstd = new FSTableDescriptors(conf, filesystem, rootdir); - fstd.createTableDescriptor(offlineTable); - - HRegionInfo hriOffline = new HRegionInfo(offlineTable.getTableName(), null, null); - createRegion(hriOffline, rootdir, conf, offlineTable); - MetaTableAccessor.addRegionToMeta(master.getConnection(), hriOffline); - - log("Regions in hbase:meta and namespace have been created"); - - // at this point we only expect 3 regions to be assigned out - // (catalogs and namespace, + 1 online region) - assertEquals(3, cluster.countServedRegions()); - HRegionInfo hriOnline = null; - try (RegionLocator locator = - TEST_UTIL.getConnection().getRegionLocator(TableName.valueOf("onlineTable"))) { - hriOnline = locator.getRegionLocation(HConstants.EMPTY_START_ROW).getRegionInfo(); - } - RegionStates regionStates = master.getAssignmentManager().getRegionStates(); - RegionStateStore stateStore = master.getAssignmentManager().getRegionStateStore(); - - // Put the online region in pending_close. It is actually already opened. - // This is to simulate that the region close RPC is not sent out before failover - RegionState oldState = regionStates.getRegionState(hriOnline); - RegionState newState = new RegionState(hriOnline, State.CLOSING, oldState.getServerName()); - stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState); - - // Put the offline region in pending_open. It is actually not opened yet. - // This is to simulate that the region open RPC is not sent out before failover - oldState = new RegionState(hriOffline, State.OFFLINE); - newState = new RegionState(hriOffline, State.OPENING, newState.getServerName()); - stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState); - - HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null); - createRegion(failedClose, rootdir, conf, offlineTable); - MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose); - - oldState = new RegionState(failedClose, State.CLOSING); - newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName()); - stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState); - - HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null); - createRegion(failedOpen, rootdir, conf, offlineTable); - MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen); - - // Simulate a region transitioning to failed open when the region server reports the - // transition as FAILED_OPEN - oldState = new RegionState(failedOpen, State.OPENING); - newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName()); - stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState); - - HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null); - LOG.info("Failed open NUll server " + failedOpenNullServer.getEncodedName()); - createRegion(failedOpenNullServer, rootdir, conf, offlineTable); - MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer); - - // Simulate a region transitioning to failed open when the master couldn't find a plan for - // the region - oldState = new RegionState(failedOpenNullServer, State.OFFLINE); - newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null); - stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState); - - // Stop the master - log("Aborting master"); - cluster.abortMaster(0); - cluster.waitOnMaster(0); - log("Master has aborted"); - - // Start up a new master - log("Starting up a new master"); - master = cluster.startMaster().getMaster(); - log("Waiting for master to be ready"); - cluster.waitForActiveAndReadyMaster(); - log("Master is ready"); - - // Wait till no region in transition any more - TEST_UTIL.waitUntilNoRegionsInTransition(60000); - - // Get new region states since master restarted - regionStates = master.getAssignmentManager().getRegionStates(); - - // Both pending_open (RPC sent/not yet) regions should be online - assertTrue(regionStates.isRegionOnline(hriOffline)); - assertTrue(regionStates.isRegionOnline(hriOnline)); - assertTrue(regionStates.isRegionOnline(failedClose)); - assertTrue(regionStates.isRegionOnline(failedOpenNullServer)); - assertTrue(regionStates.isRegionOnline(failedOpen)); - - log("Done with verification, shutting down cluster"); - - // Done, shutdown the cluster - TEST_UTIL.shutdownMiniCluster(); - } - /** * Test meta in transition when master failover */ @@ -361,9 +203,9 @@ public class TestMasterFailover { // meta should remain where it was RegionState metaState = MetaTableLocator.getMetaRegionState(rs.getZooKeeper()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getServerName(), rs.getServerName()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getState(), State.OPEN); // Start up a new master @@ -376,9 +218,9 @@ public class TestMasterFailover { // ensure meta is still deployed on RS metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getServerName(), rs.getServerName()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getState(), State.OPEN); // Update meta state as OPENING, then kill master @@ -408,9 +250,9 @@ public class TestMasterFailover { metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getServerName(), rs.getServerName()); - assertEquals("hbase:meta should be onlined on RS", + assertEquals("hbase:meta should be online on RS", metaState.getState(), State.OPEN); // Update meta state as CLOSING, then kill master @@ -431,6 +273,7 @@ public class TestMasterFailover { // Start up a new master log("Starting up a new master"); activeMaster = cluster.startMaster().getMaster(); + assertNotNull(activeMaster); log("Waiting for master to be ready"); cluster.waitForActiveAndReadyMaster(); log("Master is ready");