HBASE-18543 [AMv2] Fixed and re-enabled TestMasterFailover

* testSimpleMasterFailover - fixed and verified
  * testPendingOpenOrCloseWhenMasterFailover - removed as logic is based on old code and no longer relevant. TestServerCrashProcedure tests assignments with crashing master and region servers
  * testMetaInTransitionWhenMasterFailover - verified that it is fixed by patch for HBASE-18511.

Signed-off-by: Michael Stack <stack@apache.org>
This commit is contained in:
Umesh Agashe 2017-09-07 13:42:36 -07:00 committed by Michael Stack
parent 56cba5e450
commit baecf33ea6
1 changed files with 10 additions and 167 deletions

View File

@ -1,4 +1,4 @@
/**
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
@ -28,23 +28,13 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
@ -53,32 +43,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category({FlakeyTests.class, LargeTests.class})
@Ignore // Needs to be rewritten for AMv2. Uses tricks not ordained when up on AMv2.
public class TestMasterFailover {
private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
final HTableDescriptor htd)
throws IOException {
HRegion r = HBaseTestingUtility.createRegionAndWAL(hri, rootdir, c, htd);
// The above call to create a region will create an wal file. Each
// log file create will also create a running thread to do syncing. We need
// to close out this log else we will have a running thread trying to sync
// the file system continuously which is ugly when dfs is taken away at the
// end of the test.
HBaseTestingUtility.closeRegionAndWAL(r);
return r;
}
// TODO: Next test to add is with testing permutations of the RIT or the RS
// killed are hosting ROOT and hbase:meta regions.
@ -92,7 +65,6 @@ public class TestMasterFailover {
* Starts with three masters. Kills a backup master. Then kills the active
* master. Ensures the final master becomes active and we can still contact
* the cluster.
* @throws Exception
*/
@Test (timeout=240000)
public void testSimpleMasterFailover() throws Exception {
@ -157,7 +129,7 @@ public class TestMasterFailover {
assertEquals(2, masterThreads.size());
int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
assertEquals(4, rsCount);
assertEquals(3, rsCount);
// Check that ClusterStatus reports the correct active and backup masters
assertNotNull(active);
@ -190,142 +162,12 @@ public class TestMasterFailover {
int rss = status.getServersSize();
LOG.info("Active master " + mastername.getServerName() + " managing " +
rss + " region servers");
assertEquals(4, rss);
assertEquals(3, rss);
// Stop the cluster
TEST_UTIL.shutdownMiniCluster();
}
/**
* Test region in pending_open/close when master failover
*/
@Test (timeout=180000)
public void testPendingOpenOrCloseWhenMasterFailover() throws Exception {
final int NUM_MASTERS = 1;
final int NUM_RS = 1;
// Create config to use for this cluster
Configuration conf = HBaseConfiguration.create();
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
log("Cluster started");
// get all the master threads
List<MasterThread> masterThreads = cluster.getMasterThreads();
assertEquals(1, masterThreads.size());
// only one master thread, let's wait for it to be initialized
assertTrue(cluster.waitForActiveAndReadyMaster());
HMaster master = masterThreads.get(0).getMaster();
assertTrue(master.isActiveMaster());
assertTrue(master.isInitialized());
// Create a table with a region online
Table onlineTable = TEST_UTIL.createTable(TableName.valueOf("onlineTable"), "family");
onlineTable.close();
// Create a table in META, so it has a region offline
HTableDescriptor offlineTable = new HTableDescriptor(
TableName.valueOf(Bytes.toBytes("offlineTable")));
offlineTable.addFamily(new HColumnDescriptor(Bytes.toBytes("family")));
FileSystem filesystem = FileSystem.get(conf);
Path rootdir = FSUtils.getRootDir(conf);
FSTableDescriptors fstd = new FSTableDescriptors(conf, filesystem, rootdir);
fstd.createTableDescriptor(offlineTable);
HRegionInfo hriOffline = new HRegionInfo(offlineTable.getTableName(), null, null);
createRegion(hriOffline, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), hriOffline);
log("Regions in hbase:meta and namespace have been created");
// at this point we only expect 3 regions to be assigned out
// (catalogs and namespace, + 1 online region)
assertEquals(3, cluster.countServedRegions());
HRegionInfo hriOnline = null;
try (RegionLocator locator =
TEST_UTIL.getConnection().getRegionLocator(TableName.valueOf("onlineTable"))) {
hriOnline = locator.getRegionLocation(HConstants.EMPTY_START_ROW).getRegionInfo();
}
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
RegionStateStore stateStore = master.getAssignmentManager().getRegionStateStore();
// Put the online region in pending_close. It is actually already opened.
// This is to simulate that the region close RPC is not sent out before failover
RegionState oldState = regionStates.getRegionState(hriOnline);
RegionState newState = new RegionState(hriOnline, State.CLOSING, oldState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
// Put the offline region in pending_open. It is actually not opened yet.
// This is to simulate that the region open RPC is not sent out before failover
oldState = new RegionState(hriOffline, State.OFFLINE);
newState = new RegionState(hriOffline, State.OPENING, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null);
createRegion(failedClose, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose);
oldState = new RegionState(failedClose, State.CLOSING);
newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null);
createRegion(failedOpen, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen);
// Simulate a region transitioning to failed open when the region server reports the
// transition as FAILED_OPEN
oldState = new RegionState(failedOpen, State.OPENING);
newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null);
LOG.info("Failed open NUll server " + failedOpenNullServer.getEncodedName());
createRegion(failedOpenNullServer, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer);
// Simulate a region transitioning to failed open when the master couldn't find a plan for
// the region
oldState = new RegionState(failedOpenNullServer, State.OFFLINE);
newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null);
stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
// Stop the master
log("Aborting master");
cluster.abortMaster(0);
cluster.waitOnMaster(0);
log("Master has aborted");
// Start up a new master
log("Starting up a new master");
master = cluster.startMaster().getMaster();
log("Waiting for master to be ready");
cluster.waitForActiveAndReadyMaster();
log("Master is ready");
// Wait till no region in transition any more
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
// Get new region states since master restarted
regionStates = master.getAssignmentManager().getRegionStates();
// Both pending_open (RPC sent/not yet) regions should be online
assertTrue(regionStates.isRegionOnline(hriOffline));
assertTrue(regionStates.isRegionOnline(hriOnline));
assertTrue(regionStates.isRegionOnline(failedClose));
assertTrue(regionStates.isRegionOnline(failedOpenNullServer));
assertTrue(regionStates.isRegionOnline(failedOpen));
log("Done with verification, shutting down cluster");
// Done, shutdown the cluster
TEST_UTIL.shutdownMiniCluster();
}
/**
* Test meta in transition when master failover
*/
@ -361,9 +203,9 @@ public class TestMasterFailover {
// meta should remain where it was
RegionState metaState =
MetaTableLocator.getMetaRegionState(rs.getZooKeeper());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getServerName(), rs.getServerName());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getState(), State.OPEN);
// Start up a new master
@ -376,9 +218,9 @@ public class TestMasterFailover {
// ensure meta is still deployed on RS
metaState =
MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getServerName(), rs.getServerName());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getState(), State.OPEN);
// Update meta state as OPENING, then kill master
@ -408,9 +250,9 @@ public class TestMasterFailover {
metaState =
MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getServerName(), rs.getServerName());
assertEquals("hbase:meta should be onlined on RS",
assertEquals("hbase:meta should be online on RS",
metaState.getState(), State.OPEN);
// Update meta state as CLOSING, then kill master
@ -431,6 +273,7 @@ public class TestMasterFailover {
// Start up a new master
log("Starting up a new master");
activeMaster = cluster.startMaster().getMaster();
assertNotNull(activeMaster);
log("Waiting for master to be ready");
cluster.waitForActiveAndReadyMaster();
log("Master is ready");