HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in RS_ZK_REGION_OPENED (Ramkrishna)


git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1172063 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Zhihong Yu 2011-09-17 20:27:00 +00:00
parent cc3e3c0bc8
commit 3cc7e62571
4 changed files with 122 additions and 4 deletions

View File

@ -286,6 +286,8 @@ Release 0.91.0 - Unreleased
IOException instead of UnknownRegionException IOException instead of UnknownRegionException
HBASE-4419 Resolve build warning messages (Praveen Patibandia) HBASE-4419 Resolve build warning messages (Praveen Patibandia)
HBASE-4428 Two methods in CacheTestUtils don't call setDaemon() on the threads HBASE-4428 Two methods in CacheTestUtils don't call setDaemon() on the threads
HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in
RS_ZK_REGION_OPENED (Ramkrishna)
IMPROVEMENTS IMPROVEMENTS
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack) HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)

View File

@ -510,10 +510,9 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.warn("Region in transition " + regionInfo.getEncodedName() + LOG.warn("Region in transition " + regionInfo.getEncodedName() +
" references a null server; letting RIT timeout so will be " + " references a null server; letting RIT timeout so will be " +
"assigned elsewhere"); "assigned elsewhere");
} else if (isOnDeadServer(regionInfo, deadServers) && } else if (!serverManager.isServerOnline(sn)
!serverManager.isServerOnline(sn)) { && (isOnDeadServer(regionInfo, deadServers)
// If was on a dead server, then its not open any more; needs || regionInfo.isMetaRegion() || regionInfo.isRootRegion())) {
// handling.
forceOffline(regionInfo, data); forceOffline(regionInfo, data);
} else { } else {
new OpenedRegionHandler(master, this, regionInfo, sn).process(); new OpenedRegionHandler(master, this, regionInfo, sn).process();

View File

@ -63,6 +63,7 @@ import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.Writables; import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster; import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKConfig; import org.apache.hadoop.hbase.zookeeper.ZKConfig;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.DFSClient;
@ -72,7 +73,9 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager; import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.mapred.MiniMRCluster; import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.KeeperException.NodeExistsException;
/** /**
* Facility for testing HBase. Replacement for * Facility for testing HBase. Replacement for
@ -1557,4 +1560,37 @@ public class HBaseTestingUtility {
return getFromStoreFile(store,get); return getFromStoreFile(store,get);
} }
/**
 * Creates a znode for the given region and drives it to the OPENED state.
 *
 * <p>A new {@link ZooKeeperWatcher} is opened from the utility's configuration,
 * then the region's znode is created via {@code ZKAssign.createNodeOffline},
 * moved on with {@code ZKAssign.transitionNodeOpening} and finally with
 * {@code ZKAssign.transitionNodeOpened}, each time on behalf of the supplied
 * region server's name.
 *
 * <p>If any of those steps throws, the watcher is closed before the exception
 * propagates so a failed setup does not leave a ZooKeeper connection behind.
 *
 * @param TEST_UTIL utility whose configuration is used to create the watcher
 * @param metaRegion region whose region info identifies the znode
 * @param regionServer server whose name is passed to every ZKAssign call
 * @return the watcher that was used to create the znode; on success it is
 *         left open and the caller is responsible for closing it
 * @throws IOException declared by the {@link ZooKeeperWatcher} constructor
 * @throws ZooKeeperConnectionException declared by the
 *         {@link ZooKeeperWatcher} constructor
 * @throws KeeperException if one of the ZKAssign operations fails
 * @throws NodeExistsException presumably when the znode already exists
 *         (declared by the ZKAssign calls; confirm against ZKAssign)
 */
public static ZooKeeperWatcher createAndForceNodeToOpenedState(
    HBaseTestingUtility TEST_UTIL, HRegion metaRegion,
    HRegionServer regionServer) throws ZooKeeperConnectionException,
    IOException, KeeperException, NodeExistsException {
  ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
      "unittest", new Abortable() {
        @Override
        public void abort(String why, Throwable e) {
          throw new RuntimeException("Fatal ZK error, why=" + why, e);
        }
      });
  boolean succeeded = false;
  try {
    ZKAssign.createNodeOffline(zkw, metaRegion.getRegionInfo(),
        regionServer.getServerName());
    // NOTE(review): the returned version is forwarded unchecked; confirm
    // whether transitionNodeOpening can signal failure through its return
    // value rather than an exception.
    int version = ZKAssign.transitionNodeOpening(zkw,
        metaRegion.getRegionInfo(), regionServer.getServerName());
    ZKAssign.transitionNodeOpened(zkw, metaRegion.getRegionInfo(),
        regionServer.getServerName(), version);
    succeeded = true;
    return zkw;
  } finally {
    // Only release the watcher on failure; on success ownership passes to
    // the caller.
    if (!succeeded) {
      zkw.close();
    }
  }
}
} }

View File

@ -23,6 +23,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
@ -44,9 +45,12 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil; import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTable; import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.Test; import org.junit.Test;
public class TestMasterFailover { public class TestMasterFailover {
@ -128,6 +132,83 @@ public class TestMasterFailover {
TEST_UTIL.shutdownMiniCluster(); TEST_UTIL.shutdownMiniCluster();
} }
/**
 * Checks that a restarted cluster gets out of "region in transition" when the
 * .META. znode was left in the OPENED state on behalf of a region server that
 * has been aborted.
 *
 * <p>Steps: start a mini cluster, confirm exactly one active master, abort
 * region servers in turn until the one hosting .META. has been aborted, shut
 * the HBase cluster down, force the .META. znode to OPENED for that server,
 * restart HBase with one master and one region server, then block until no
 * region is in transition.
 *
 * @throws Exception if cluster setup, the ZooKeeper operations or teardown fail
 */
@Test
public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
    throws Exception {
  final int NUM_MASTERS = 1;
  final int NUM_RS = 2;

  // Configure the assignment timeout monitor for this test.
  Configuration conf = HBaseConfiguration.create();
  conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
  conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000);

  // Start the cluster
  HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
  TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
  ZooKeeperWatcher zkw = null;
  try {
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

    // All master threads must be running.
    List<MasterThread> masterThreads = cluster.getMasterThreads();
    for (MasterThread mt : masterThreads) {
      assertTrue(mt.isAlive());
    }

    // Exactly one of them is the active master and the count is as requested.
    int numActive = 0;
    for (MasterThread mt : masterThreads) {
      if (mt.getMaster().isActiveMaster()) {
        numActive++;
      }
    }
    assertEquals(1, numActive);
    assertEquals(NUM_MASTERS, masterThreads.size());

    // Abort region servers one by one, stopping after the one that hosts
    // .META. so that metaHostIndex points at it.
    List<RegionServerThread> regionServerThreads = cluster
        .getRegionServerThreads();
    int metaHostIndex = -1;
    HRegion metaRegion = null;
    for (RegionServerThread regionServerThread : regionServerThreads) {
      HRegionServer candidate = regionServerThread.getRegionServer();
      metaRegion = candidate
          .getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
      metaHostIndex++;
      candidate.abort("");
      if (null != metaRegion) {
        break;
      }
    }
    // Fail with a clear message instead of an NPE further down.
    assertTrue("No region server was hosting .META.", metaRegion != null);

    HRegionServer regionServer = cluster.getRegionServer(metaHostIndex);
    cluster.shutdown();

    // Leave the .META. znode in OPENED state for the aborted server.
    zkw = HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
        metaRegion, regionServer);

    TEST_UTIL.startMiniHBaseCluster(1, 1);

    // Failover should be completed, now wait for no RIT
    log("Waiting for no more RIT");
    ZKAssign.blockUntilNoRIT(zkw);
  } finally {
    // Release the watcher and stop the cluster even when the test fails.
    if (zkw != null) {
      zkw.close();
    }
    TEST_UTIL.shutdownMiniCluster();
  }
}
/** /**
* Complex test of master failover that tests as many permutations of the * Complex test of master failover that tests as many permutations of the
* different possible states that regions in transition could be in within ZK. * different possible states that regions in transition could be in within ZK.