HBASE-18543 [AMv2] Fixed and re-enabled TestMasterFailover

* testSimpleMasterFailover - fixed and verified.
* testPendingOpenOrCloseWhenMasterFailover - removed; its logic is based on old code and is no longer relevant. TestServerCrashProcedure covers assignments with a crashing master and crashing region servers.
* testMetaInTransitionWhenMasterFailover - verified that it is fixed by the patch for HBASE-18511.

Signed-off-by: Michael Stack <stack@apache.org>
parent 6752eba68f
commit 5847c901a7
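For context on what the re-enabled testSimpleMasterFailover exercises (per its javadoc below: start several masters, kill the active one, and confirm a backup takes over), here is a minimal, standalone sketch of that flow against the HBaseTestingUtility / MiniHBaseCluster calls that appear throughout this test. It is illustrative only, not code from this commit; the class name, the master/region-server counts, and the main() wrapper are assumptions.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;

public class MasterFailoverSketch {
  public static void main(String[] args) throws Exception {
    // Start a mini cluster with three masters and three region servers (illustrative counts).
    Configuration conf = HBaseConfiguration.create();
    HBaseTestingUtility testUtil = new HBaseTestingUtility(conf);
    testUtil.startMiniCluster(3, 3);
    MiniHBaseCluster cluster = testUtil.getHBaseCluster();

    // Wait for an active master, then find which master thread is the active one.
    cluster.waitForActiveAndReadyMaster();
    List<MasterThread> masterThreads = cluster.getMasterThreads();
    int activeIndex = -1;
    for (int i = 0; i < masterThreads.size(); i++) {
      if (masterThreads.get(i).getMaster().isActiveMaster()) {
        activeIndex = i;
      }
    }

    // Abort the active master and wait for its thread to exit.
    cluster.abortMaster(activeIndex);
    cluster.waitOnMaster(activeIndex);

    // A backup should take over; this call blocks until a master is active and initialized.
    cluster.waitForActiveAndReadyMaster();
    HMaster newActive = cluster.getMaster();
    System.out.println("New active master: " + newActive.getServerName());

    testUtil.shutdownMiniCluster();
  }
}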
@@ -1,4 +1,4 @@
-/**
+/*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
@@ -28,23 +28,13 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
@@ -53,32 +43,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category({FlakeyTests.class, LargeTests.class})
@Ignore // Needs to be rewritten for AMv2. Uses tricks not ordained when up on AMv2.
public class TestMasterFailover {
  private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);

  HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
      final HTableDescriptor htd)
  throws IOException {
    HRegion r = HBaseTestingUtility.createRegionAndWAL(hri, rootdir, c, htd);
    // The above call to create a region will create an wal file. Each
    // log file create will also create a running thread to do syncing. We need
    // to close out this log else we will have a running thread trying to sync
    // the file system continuously which is ugly when dfs is taken away at the
    // end of the test.
    HBaseTestingUtility.closeRegionAndWAL(r);
    return r;
  }

  // TODO: Next test to add is with testing permutations of the RIT or the RS
  // killed are hosting ROOT and hbase:meta regions.

@@ -92,7 +65,6 @@ public class TestMasterFailover {
   * Starts with three masters. Kills a backup master. Then kills the active
   * master. Ensures the final master becomes active and we can still contact
   * the cluster.
   * @throws Exception
   */
  @Test (timeout=240000)
  public void testSimpleMasterFailover() throws Exception {
@@ -157,7 +129,7 @@ public class TestMasterFailover {
    assertEquals(2, masterThreads.size());
    int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
    LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
-    assertEquals(4, rsCount);
+    assertEquals(3, rsCount);

    // Check that ClusterStatus reports the correct active and backup masters
    assertNotNull(active);
@@ -190,142 +162,12 @@ public class TestMasterFailover {
    int rss = status.getServersSize();
    LOG.info("Active master " + mastername.getServerName() + " managing " +
      rss + " region servers");
-    assertEquals(4, rss);
+    assertEquals(3, rss);

    // Stop the cluster
    TEST_UTIL.shutdownMiniCluster();
  }

-  /**
-   * Test region in pending_open/close when master failover
-   */
-  @Test (timeout=180000)
-  public void testPendingOpenOrCloseWhenMasterFailover() throws Exception {
-    final int NUM_MASTERS = 1;
-    final int NUM_RS = 1;
-
-    // Create config to use for this cluster
-    Configuration conf = HBaseConfiguration.create();
-
-    // Start the cluster
-    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
-    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
-    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
-    log("Cluster started");
-
-    // get all the master threads
-    List<MasterThread> masterThreads = cluster.getMasterThreads();
-    assertEquals(1, masterThreads.size());
-
-    // only one master thread, let's wait for it to be initialized
-    assertTrue(cluster.waitForActiveAndReadyMaster());
-    HMaster master = masterThreads.get(0).getMaster();
-    assertTrue(master.isActiveMaster());
-    assertTrue(master.isInitialized());
-
-    // Create a table with a region online
-    Table onlineTable = TEST_UTIL.createTable(TableName.valueOf("onlineTable"), "family");
-    onlineTable.close();
-    // Create a table in META, so it has a region offline
-    HTableDescriptor offlineTable = new HTableDescriptor(
-      TableName.valueOf(Bytes.toBytes("offlineTable")));
-    offlineTable.addFamily(new HColumnDescriptor(Bytes.toBytes("family")));
-
-    FileSystem filesystem = FileSystem.get(conf);
-    Path rootdir = FSUtils.getRootDir(conf);
-    FSTableDescriptors fstd = new FSTableDescriptors(conf, filesystem, rootdir);
-    fstd.createTableDescriptor(offlineTable);
-
-    HRegionInfo hriOffline = new HRegionInfo(offlineTable.getTableName(), null, null);
-    createRegion(hriOffline, rootdir, conf, offlineTable);
-    MetaTableAccessor.addRegionToMeta(master.getConnection(), hriOffline);
-
-    log("Regions in hbase:meta and namespace have been created");
-
-    // at this point we only expect 3 regions to be assigned out
-    // (catalogs and namespace, + 1 online region)
-    assertEquals(3, cluster.countServedRegions());
-    HRegionInfo hriOnline = null;
-    try (RegionLocator locator =
-      TEST_UTIL.getConnection().getRegionLocator(TableName.valueOf("onlineTable"))) {
-      hriOnline = locator.getRegionLocation(HConstants.EMPTY_START_ROW).getRegionInfo();
-    }
-    RegionStates regionStates = master.getAssignmentManager().getRegionStates();
-    RegionStateStore stateStore = master.getAssignmentManager().getRegionStateStore();
-
-    // Put the online region in pending_close. It is actually already opened.
-    // This is to simulate that the region close RPC is not sent out before failover
-    RegionState oldState = regionStates.getRegionState(hriOnline);
-    RegionState newState = new RegionState(hriOnline, State.CLOSING, oldState.getServerName());
-    stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
-
-    // Put the offline region in pending_open. It is actually not opened yet.
-    // This is to simulate that the region open RPC is not sent out before failover
-    oldState = new RegionState(hriOffline, State.OFFLINE);
-    newState = new RegionState(hriOffline, State.OPENING, newState.getServerName());
-    stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
-
-    HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null);
-    createRegion(failedClose, rootdir, conf, offlineTable);
-    MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose);
-
-    oldState = new RegionState(failedClose, State.CLOSING);
-    newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName());
-    stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
-
-    HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null);
-    createRegion(failedOpen, rootdir, conf, offlineTable);
-    MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen);
-
-    // Simulate a region transitioning to failed open when the region server reports the
-    // transition as FAILED_OPEN
-    oldState = new RegionState(failedOpen, State.OPENING);
-    newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName());
-    stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
-
-    HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null);
-    LOG.info("Failed open NUll server " + failedOpenNullServer.getEncodedName());
-    createRegion(failedOpenNullServer, rootdir, conf, offlineTable);
-    MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer);
-
-    // Simulate a region transitioning to failed open when the master couldn't find a plan for
-    // the region
-    oldState = new RegionState(failedOpenNullServer, State.OFFLINE);
-    newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null);
-    stateStore.updateRegionState(HConstants.NO_SEQNUM, -1, newState, oldState);
-
-    // Stop the master
-    log("Aborting master");
-    cluster.abortMaster(0);
-    cluster.waitOnMaster(0);
-    log("Master has aborted");
-
-    // Start up a new master
-    log("Starting up a new master");
-    master = cluster.startMaster().getMaster();
-    log("Waiting for master to be ready");
-    cluster.waitForActiveAndReadyMaster();
-    log("Master is ready");
-
-    // Wait till no region in transition any more
-    TEST_UTIL.waitUntilNoRegionsInTransition(60000);
-
-    // Get new region states since master restarted
-    regionStates = master.getAssignmentManager().getRegionStates();
-
-    // Both pending_open (RPC sent/not yet) regions should be online
-    assertTrue(regionStates.isRegionOnline(hriOffline));
-    assertTrue(regionStates.isRegionOnline(hriOnline));
-    assertTrue(regionStates.isRegionOnline(failedClose));
-    assertTrue(regionStates.isRegionOnline(failedOpenNullServer));
-    assertTrue(regionStates.isRegionOnline(failedOpen));
-
-    log("Done with verification, shutting down cluster");
-
-    // Done, shutdown the cluster
-    TEST_UTIL.shutdownMiniCluster();
-  }
-
  /**
   * Test meta in transition when master failover
   */
@@ -361,9 +203,9 @@ public class TestMasterFailover {
    // meta should remain where it was
    RegionState metaState =
      MetaTableLocator.getMetaRegionState(rs.getZooKeeper());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getServerName(), rs.getServerName());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getState(), State.OPEN);

    // Start up a new master
@@ -376,9 +218,9 @@ public class TestMasterFailover {
    // ensure meta is still deployed on RS
    metaState =
      MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getServerName(), rs.getServerName());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getState(), State.OPEN);

    // Update meta state as OPENING, then kill master
@@ -408,9 +250,9 @@ public class TestMasterFailover {

    metaState =
      MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getServerName(), rs.getServerName());
-    assertEquals("hbase:meta should be onlined on RS",
+    assertEquals("hbase:meta should be online on RS",
      metaState.getState(), State.OPEN);

    // Update meta state as CLOSING, then kill master
@@ -431,6 +273,7 @@ public class TestMasterFailover {
    // Start up a new master
    log("Starting up a new master");
    activeMaster = cluster.startMaster().getMaster();
+    assertNotNull(activeMaster);
    log("Waiting for master to be ready");
    cluster.waitForActiveAndReadyMaster();
    log("Master is ready");