HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in
RS_ZK_REGION_OPENED (Ramkrishna) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1172063 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cc3e3c0bc8
commit
3cc7e62571
|
@ -286,6 +286,8 @@ Release 0.91.0 - Unreleased
|
|||
IOException instead of UnknownRegionException
|
||||
HBASE-4419 Resolve build warning messages (Praveen Patibandia)
|
||||
HBASE-4428 Two methods in CacheTestUtils don't call setDaemon() on the threads
|
||||
HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in
|
||||
RS_ZK_REGION_OPENED (Ramkrishna)
|
||||
|
||||
IMPROVEMENTS
|
||||
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)
|
||||
|
|
|
@ -510,10 +510,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
LOG.warn("Region in transition " + regionInfo.getEncodedName() +
|
||||
" references a null server; letting RIT timeout so will be " +
|
||||
"assigned elsewhere");
|
||||
} else if (isOnDeadServer(regionInfo, deadServers) &&
|
||||
!serverManager.isServerOnline(sn)) {
|
||||
// If it was on a dead server, then it's not open any more; needs
|
||||
// handling.
|
||||
} else if (!serverManager.isServerOnline(sn)
|
||||
&& (isOnDeadServer(regionInfo, deadServers)
|
||||
|| regionInfo.isMetaRegion() || regionInfo.isRootRegion())) {
|
||||
forceOffline(regionInfo, data);
|
||||
} else {
|
||||
new OpenedRegionHandler(master, this, regionInfo, sn).process();
|
||||
|
|
|
@ -63,6 +63,7 @@ import org.apache.hadoop.hbase.util.FSUtils;
|
|||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.hadoop.hbase.util.Writables;
|
||||
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.hadoop.hdfs.DFSClient;
|
||||
|
@ -72,7 +73,9 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
|||
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||
import org.apache.hadoop.mapred.MiniMRCluster;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.ZooKeeper;
|
||||
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||
|
||||
/**
|
||||
* Facility for testing HBase. Replacement for
|
||||
|
@ -1557,4 +1560,37 @@ public class HBaseTestingUtility {
|
|||
|
||||
return getFromStoreFile(store,get);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an znode with OPENED state.
|
||||
* @param TEST_UTIL
|
||||
* @param metaRegion
|
||||
* @param regionServer
|
||||
* @return
|
||||
* @throws IOException
|
||||
* @throws ZooKeeperConnectionException
|
||||
* @throws KeeperException
|
||||
* @throws NodeExistsException
|
||||
*/
|
||||
public static ZooKeeperWatcher createAndForceNodeToOpenedState(
|
||||
HBaseTestingUtility TEST_UTIL, HRegion metaRegion,
|
||||
HRegionServer regionServer) throws ZooKeeperConnectionException,
|
||||
IOException, KeeperException, NodeExistsException {
|
||||
ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
|
||||
"unittest", new Abortable() {
|
||||
@Override
|
||||
public void abort(String why, Throwable e) {
|
||||
throw new RuntimeException("Fatal ZK error, why=" + why, e);
|
||||
}
|
||||
});
|
||||
|
||||
ZKAssign.createNodeOffline(zkw, metaRegion.getRegionInfo(), regionServer
|
||||
.getServerName());
|
||||
int version = ZKAssign.transitionNodeOpening(zkw, metaRegion
|
||||
.getRegionInfo(), regionServer.getServerName());
|
||||
ZKAssign.transitionNodeOpened(zkw, metaRegion.getRegionInfo(), regionServer
|
||||
.getServerName(), version);
|
||||
return zkw;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import static org.junit.Assert.assertEquals;
|
|||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
@ -44,9 +45,12 @@ import org.apache.hadoop.hbase.util.Bytes;
|
|||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKTable;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestMasterFailover {
|
||||
|
@ -127,6 +131,83 @@ public class TestMasterFailover {
|
|||
// Stop the cluster
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
|
||||
throws Exception {
|
||||
final int NUM_MASTERS = 1;
|
||||
final int NUM_RS = 2;
|
||||
|
||||
Configuration conf = HBaseConfiguration.create();
|
||||
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
|
||||
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000);
|
||||
// Start the cluster
|
||||
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
|
||||
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
|
||||
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||
|
||||
// get all the master threads
|
||||
List<MasterThread> masterThreads = cluster.getMasterThreads();
|
||||
|
||||
// wait for each to come online
|
||||
for (MasterThread mt : masterThreads) {
|
||||
assertTrue(mt.isAlive());
|
||||
}
|
||||
|
||||
// verify only one is the active master and we have right number
|
||||
int numActive = 0;
|
||||
ServerName activeName = null;
|
||||
for (int i = 0; i < masterThreads.size(); i++) {
|
||||
if (masterThreads.get(i).getMaster().isActiveMaster()) {
|
||||
numActive++;
|
||||
activeName = masterThreads.get(i).getMaster().getServerName();
|
||||
}
|
||||
}
|
||||
assertEquals(1, numActive);
|
||||
assertEquals(NUM_MASTERS, masterThreads.size());
|
||||
|
||||
// verify still one active master and it's the same
|
||||
for (int i = 0; i < masterThreads.size(); i++) {
|
||||
if (masterThreads.get(i).getMaster().isActiveMaster()) {
|
||||
assertTrue(activeName.equals(masterThreads.get(i).getMaster()
|
||||
.getServerName()));
|
||||
}
|
||||
}
|
||||
assertEquals(1, numActive);
|
||||
assertEquals(1, masterThreads.size());
|
||||
|
||||
List<RegionServerThread> regionServerThreads = cluster
|
||||
.getRegionServerThreads();
|
||||
int count = -1;
|
||||
HRegion metaRegion = null;
|
||||
for (RegionServerThread regionServerThread : regionServerThreads) {
|
||||
HRegionServer regionServer = regionServerThread.getRegionServer();
|
||||
metaRegion = regionServer
|
||||
.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
|
||||
count++;
|
||||
regionServer.abort("");
|
||||
if (null != metaRegion) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
HRegionServer regionServer = cluster.getRegionServer(count);
|
||||
|
||||
cluster.shutdown();
|
||||
// Create a ZKW to use in the test
|
||||
ZooKeeperWatcher zkw =
|
||||
HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
|
||||
metaRegion, regionServer);
|
||||
|
||||
TEST_UTIL.startMiniHBaseCluster(1, 1);
|
||||
|
||||
// Failover should be completed, now wait for no RIT
|
||||
log("Waiting for no more RIT");
|
||||
ZKAssign.blockUntilNoRIT(zkw);
|
||||
|
||||
// Stop the cluster
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Complex test of master failover that tests as many permutations of the
|
||||
|
|
Loading…
Reference in New Issue