HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in
RS_ZK_REGION_OPENED (Ramkrishna) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1172063 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cc3e3c0bc8
commit
3cc7e62571
|
@ -286,6 +286,8 @@ Release 0.91.0 - Unreleased
|
||||||
IOException instead of UnknownRegionException
|
IOException instead of UnknownRegionException
|
||||||
HBASE-4419 Resolve build warning messages (Praveen Patibandia)
|
HBASE-4419 Resolve build warning messages (Praveen Patibandia)
|
||||||
HBASE-4428 Two methods in CacheTestUtils don't call setDaemon() on the threads
|
HBASE-4428 Two methods in CacheTestUtils don't call setDaemon() on the threads
|
||||||
|
HBASE-4400 .META. getting stuck if RS hosting it is dead and znode state is in
|
||||||
|
RS_ZK_REGION_OPENED (Ramkrishna)
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)
|
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)
|
||||||
|
|
|
@ -510,10 +510,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
LOG.warn("Region in transition " + regionInfo.getEncodedName() +
|
LOG.warn("Region in transition " + regionInfo.getEncodedName() +
|
||||||
" references a null server; letting RIT timeout so will be " +
|
" references a null server; letting RIT timeout so will be " +
|
||||||
"assigned elsewhere");
|
"assigned elsewhere");
|
||||||
} else if (isOnDeadServer(regionInfo, deadServers) &&
|
} else if (!serverManager.isServerOnline(sn)
|
||||||
!serverManager.isServerOnline(sn)) {
|
&& (isOnDeadServer(regionInfo, deadServers)
|
||||||
// If was on a dead server, then its not open any more; needs
|
|| regionInfo.isMetaRegion() || regionInfo.isRootRegion())) {
|
||||||
// handling.
|
|
||||||
forceOffline(regionInfo, data);
|
forceOffline(regionInfo, data);
|
||||||
} else {
|
} else {
|
||||||
new OpenedRegionHandler(master, this, regionInfo, sn).process();
|
new OpenedRegionHandler(master, this, regionInfo, sn).process();
|
||||||
|
|
|
@ -63,6 +63,7 @@ import org.apache.hadoop.hbase.util.FSUtils;
|
||||||
import org.apache.hadoop.hbase.util.Threads;
|
import org.apache.hadoop.hbase.util.Threads;
|
||||||
import org.apache.hadoop.hbase.util.Writables;
|
import org.apache.hadoop.hbase.util.Writables;
|
||||||
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
|
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
|
||||||
|
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||||
import org.apache.hadoop.hdfs.DFSClient;
|
import org.apache.hadoop.hdfs.DFSClient;
|
||||||
|
@ -72,7 +73,9 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
|
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.mapred.MiniMRCluster;
|
import org.apache.hadoop.mapred.MiniMRCluster;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.apache.zookeeper.ZooKeeper;
|
import org.apache.zookeeper.ZooKeeper;
|
||||||
|
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Facility for testing HBase. Replacement for
|
* Facility for testing HBase. Replacement for
|
||||||
|
@ -1557,4 +1560,37 @@ public class HBaseTestingUtility {
|
||||||
|
|
||||||
return getFromStoreFile(store,get);
|
return getFromStoreFile(store,get);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a znode for the given region and forces it into the OPENED state.
|
||||||
|
* @param TEST_UTIL testing utility whose configuration is used to create the ZooKeeper watcher
|
||||||
|
* @param metaRegion region whose znode is created and transitioned to OPENED
|
||||||
|
* @param regionServer region server whose server name is passed to each znode transition
|
||||||
|
* @return the ZooKeeper watcher that was used to create and transition the znode
|
||||||
|
* @throws IOException
|
||||||
|
* @throws ZooKeeperConnectionException
|
||||||
|
* @throws KeeperException
|
||||||
|
* @throws NodeExistsException
|
||||||
|
*/
|
||||||
|
public static ZooKeeperWatcher createAndForceNodeToOpenedState(
|
||||||
|
HBaseTestingUtility TEST_UTIL, HRegion metaRegion,
|
||||||
|
HRegionServer regionServer) throws ZooKeeperConnectionException,
|
||||||
|
IOException, KeeperException, NodeExistsException {
|
||||||
|
ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
|
||||||
|
"unittest", new Abortable() {
|
||||||
|
@Override
|
||||||
|
public void abort(String why, Throwable e) {
|
||||||
|
throw new RuntimeException("Fatal ZK error, why=" + why, e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
ZKAssign.createNodeOffline(zkw, metaRegion.getRegionInfo(), regionServer
|
||||||
|
.getServerName());
|
||||||
|
int version = ZKAssign.transitionNodeOpening(zkw, metaRegion
|
||||||
|
.getRegionInfo(), regionServer.getServerName());
|
||||||
|
ZKAssign.transitionNodeOpened(zkw, metaRegion.getRegionInfo(), regionServer
|
||||||
|
.getServerName(), version);
|
||||||
|
return zkw;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -44,9 +45,12 @@ import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.hadoop.hbase.util.FSUtils;
|
import org.apache.hadoop.hbase.util.FSUtils;
|
||||||
import org.apache.hadoop.hbase.util.JVMClusterUtil;
|
import org.apache.hadoop.hbase.util.JVMClusterUtil;
|
||||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||||
|
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZKTable;
|
import org.apache.hadoop.hbase.zookeeper.ZKTable;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
|
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestMasterFailover {
|
public class TestMasterFailover {
|
||||||
|
@ -127,6 +131,83 @@ public class TestMasterFailover {
|
||||||
// Stop the cluster
|
// Stop the cluster
|
||||||
TEST_UTIL.shutdownMiniCluster();
|
TEST_UTIL.shutdownMiniCluster();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
|
||||||
|
throws Exception {
|
||||||
|
final int NUM_MASTERS = 1;
|
||||||
|
final int NUM_RS = 2;
|
||||||
|
|
||||||
|
Configuration conf = HBaseConfiguration.create();
|
||||||
|
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
|
||||||
|
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000);
|
||||||
|
// Start the cluster
|
||||||
|
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
|
||||||
|
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
|
||||||
|
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||||
|
|
||||||
|
// get all the master threads
|
||||||
|
List<MasterThread> masterThreads = cluster.getMasterThreads();
|
||||||
|
|
||||||
|
// verify each master thread is alive
|
||||||
|
for (MasterThread mt : masterThreads) {
|
||||||
|
assertTrue(mt.isAlive());
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify only one is the active master and we have right number
|
||||||
|
int numActive = 0;
|
||||||
|
ServerName activeName = null;
|
||||||
|
for (int i = 0; i < masterThreads.size(); i++) {
|
||||||
|
if (masterThreads.get(i).getMaster().isActiveMaster()) {
|
||||||
|
numActive++;
|
||||||
|
activeName = masterThreads.get(i).getMaster().getServerName();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(1, numActive);
|
||||||
|
assertEquals(NUM_MASTERS, masterThreads.size());
|
||||||
|
|
||||||
|
// verify still one active master and it's the same
|
||||||
|
for (int i = 0; i < masterThreads.size(); i++) {
|
||||||
|
if (masterThreads.get(i).getMaster().isActiveMaster()) {
|
||||||
|
assertTrue(activeName.equals(masterThreads.get(i).getMaster()
|
||||||
|
.getServerName()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(1, numActive);
|
||||||
|
assertEquals(1, masterThreads.size());
|
||||||
|
|
||||||
|
List<RegionServerThread> regionServerThreads = cluster
|
||||||
|
.getRegionServerThreads();
|
||||||
|
int count = -1;
|
||||||
|
HRegion metaRegion = null;
|
||||||
|
for (RegionServerThread regionServerThread : regionServerThreads) {
|
||||||
|
HRegionServer regionServer = regionServerThread.getRegionServer();
|
||||||
|
metaRegion = regionServer
|
||||||
|
.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
|
||||||
|
count++;
|
||||||
|
regionServer.abort("");
|
||||||
|
if (null != metaRegion) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HRegionServer regionServer = cluster.getRegionServer(count);
|
||||||
|
|
||||||
|
cluster.shutdown();
|
||||||
|
// Create a ZKW to use in the test
|
||||||
|
ZooKeeperWatcher zkw =
|
||||||
|
HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
|
||||||
|
metaRegion, regionServer);
|
||||||
|
|
||||||
|
TEST_UTIL.startMiniHBaseCluster(1, 1);
|
||||||
|
|
||||||
|
// Failover should be completed, now wait for no RIT
|
||||||
|
log("Waiting for no more RIT");
|
||||||
|
ZKAssign.blockUntilNoRIT(zkw);
|
||||||
|
|
||||||
|
// Stop the cluster
|
||||||
|
TEST_UTIL.shutdownMiniCluster();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Complex test of master failover that tests as many permutations of the
|
* Complex test of master failover that tests as many permutations of the
|
||||||
|
|
Loading…
Reference in New Issue