HBASE-10101 testOfflineRegionReAssginedAfterMasterRestart times out sometimes

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1549708 13f79535-47bb-0310-9956-ffa450edef68
jxiang 2013-12-09 23:38:02 +00:00
parent 08bd55aab2
commit 97191c921b
7 changed files with 114 additions and 95 deletions


@ -613,14 +613,31 @@ public class AssignmentManager extends ZooKeeperListener {
return true;
}
if (!serverManager.isServerOnline(sn)) {
// It was on a dead server, it's closed now. Force to OFFLINE and put
// it in transition. Try to re-assign it, but it will fail most likely,
// since we have not done log splitting for the dead server yet.
// It was transitioning on a dead server, so it's closed now.
// Force it to OFFLINE and put it in transition, but do not assign it
// since log splitting for the dead server is not done yet.
LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
" was on deadserver; forcing offline");
ZKAssign.createOrForceNodeOffline(this.watcher, regionInfo, sn);
if (regionStates.isRegionOnline(regionInfo)) {
// Meta could still show the region as assigned to the previous
// server. If that server is online, the region is put back online
// when we reload meta, so we need to offline it.
regionStates.regionOffline(regionInfo);
}
// Put it back in transition so that SSH can re-assign it
regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
invokeAssign(regionInfo);
// No matter whether the previous server is online or offline,
// we need to reset the last region server of the region.
regionStates.setLastRegionServerOfRegion(sn, encodedName);
if (regionInfo.isMetaRegion()) {
// If it's the meta region, reset the meta location
// so that the master knows the right meta region server.
MetaRegionTracker.setMetaLocation(watcher, sn);
}
// Make sure we know the server is dead.
if (!serverManager.isServerDead(sn)) {
serverManager.expireServer(sn);
}
return false;
}
switch (et) {
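
For context on the AssignmentManager hunk above: the old path forced the znode OFFLINE and then called invokeAssign() right away, which would usually fail because the dead server's logs had not been split yet. The new path records the dead server as the region's last host and expires it, so the ServerShutdownHandler (SSH) re-assigns the region once log splitting is done. Below is a condensed sketch of the new flow, using only calls that appear in the hunk; the method name is made up, and fields such as watcher, regionStates and serverManager are assumed from the surrounding class.

// Condensed sketch of the dead-server RIT handling; not a drop-in
// replacement for the real method, just the control flow in one place.
private boolean offlineRegionOnDeadServer(final HRegionInfo regionInfo,
    final ServerName sn, final String encodedName)
    throws KeeperException, IOException {
  // Force the znode to OFFLINE so no stale transition state survives.
  ZKAssign.createOrForceNodeOffline(this.watcher, regionInfo, sn);
  if (regionStates.isRegionOnline(regionInfo)) {
    // Meta may still claim the old server; take the region offline first.
    regionStates.regionOffline(regionInfo);
  }
  // Put it back in transition so SSH can pick it up later.
  regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
  // Record the dead server as the last host, online or not, so SSH
  // considers this region when it processes the expired server.
  regionStates.setLastRegionServerOfRegion(sn, encodedName);
  if (regionInfo.isMetaRegion()) {
    // Point the meta location at the dead server so its logs are split
    // before meta is re-assigned.
    MetaRegionTracker.setMetaLocation(watcher, sn);
  }
  if (!serverManager.isServerDead(sn)) {
    serverManager.expireServer(sn); // triggers SSH, which does the re-assign
  }
  return false; // not handled here; SSH finishes the job
}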


@ -1006,7 +1006,9 @@ MasterServices, Server {
if (!rit) {
// Assign meta since not already in transition
if (currentMetaServer != null) {
if (expireIfOnline(currentMetaServer)) {
if (!serverManager.isServerDead(currentMetaServer)) {
LOG.info("Forcing expire of " + currentMetaServer);
serverManager.expireServer(currentMetaServer);
splitMetaLogBeforeAssignment(currentMetaServer);
if (this.distributedLogReplay) {
logReplayFailedMetaServer = currentMetaServer;
@ -1087,22 +1089,6 @@ MasterServices, Server {
}
}
/**
* Expire a server if we find it is one of the online servers.
* @param sn ServerName to check.
* @return true when server <code>sn</code> is being expired by the function.
* @throws IOException
*/
private boolean expireIfOnline(final ServerName sn)
throws IOException {
if (sn == null || !serverManager.isServerOnline(sn)) {
return false;
}
LOG.info("Forcing expire of " + sn);
serverManager.expireServer(sn);
return true;
}
/**
* This function returns a set of region server names under hbase:meta recovering region ZK node
* @return Set of meta server names which were recorded in ZK
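
On the HMaster change above: the removed expireIfOnline() helper expired the previous meta server only when it was still listed as online, while the inlined check expires it unless it is already known to be dead. After a master restart the old meta host can be neither online nor marked dead yet, and only the new guard makes sure its logs are split before meta is re-assigned. A minimal sketch of the two guards side by side, assuming the currentMetaServer and serverManager references from the method shown above; the boolean names are just for illustration.

// Old guard: expire only if the server is still in the online list.
boolean expireOld = currentMetaServer != null
    && serverManager.isServerOnline(currentMetaServer);
// New guard: expire unless the server is already being handled as dead,
// which also covers a server that is neither online nor dead yet.
boolean expireNew = currentMetaServer != null
    && !serverManager.isServerDead(currentMetaServer);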


@ -511,8 +511,9 @@ public class RegionStates {
// region is not open on this server. So the region must be
// moving to this server from another one (i.e. opening or
// pending open on this server, was open on another one.
// It could be in failed_close state too if tried several times
// to open it while the server is not reachable.
// Offline state is also a kind of pending open if the region is in
// transition. The region could be in failed_close state too if we have
// tried several times to open it while this region server is not reachable.)
if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
rits.add(hri);
@ -623,10 +624,15 @@ public class RegionStates {
synchronized void setLastRegionServerOfRegions(
final ServerName serverName, final List<HRegionInfo> regionInfos) {
for (HRegionInfo hri: regionInfos) {
lastAssignments.put(hri.getEncodedName(), serverName);
setLastRegionServerOfRegion(serverName, hri.getEncodedName());
}
}
synchronized void setLastRegionServerOfRegion(
final ServerName serverName, final String encodedName) {
lastAssignments.put(encodedName, serverName);
}
/**
* Compute the average load across all region servers.
* Currently, this uses a very naive computation - just uses the number of
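
The RegionStates hunk above adds a single-region variant of the last-assignment setter so the AssignmentManager can record the dead server for one region without building a list; the existing bulk method now delegates to it. A minimal usage sketch, where serverName, hri and regionInfoList are placeholder values and regionStates is a RegionStates instance reachable from the same package.

// Bulk form, used when recording all regions hosted by a server.
regionStates.setLastRegionServerOfRegions(serverName, regionInfoList);
// New single-region form, as called from the AssignmentManager hunk above.
regionStates.setLastRegionServerOfRegion(serverName, hri.getEncodedName());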


@ -832,8 +832,8 @@ public class TestAssignmentManager {
/**
* Test the scenario when the master is in failover and trying to process a
* region which is in Opening state on a dead RS. Master should immediately
* assign the region and not wait for Timeout Monitor.(Hbase-5882).
* region which is in Opening state on a dead RS. Master will force offline the
* region and put it in transition. AM relies on SSH to reassign it.
*/
@Test(timeout = 60000)
public void testRegionInOpeningStateOnDeadRSWhileMasterFailover() throws IOException,
@ -851,7 +851,10 @@ public class TestAssignmentManager {
am.getRegionStates().logSplit(SERVERNAME_A); // Assume log splitting is done
am.getRegionStates().createRegionState(REGIONINFO);
am.gate.set(false);
am.processRegionsInTransition(rt, REGIONINFO, version);
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
assertFalse(am.processRegionsInTransition(rt, REGIONINFO, version));
am.getZKTable().setEnabledTable(REGIONINFO.getTable());
processServerShutdownHandler(ct, am, false);
// Wait for the assignment to complete.
while (!am.gate.get()) {
Thread.sleep(10);
@ -1159,6 +1162,8 @@ public class TestAssignmentManager {
public void assign(List<HRegionInfo> regions)
throws IOException, InterruptedException {
assignInvoked = (regions != null && regions.size() > 0);
super.assign(regions);
this.gate.set(true);
}
/** reset the watcher */


@ -29,8 +29,6 @@ import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@ -41,11 +39,9 @@ import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
@ -64,7 +60,6 @@ import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@ -75,7 +70,6 @@ import org.junit.experimental.categories.Category;
*/
@Category(MediumTests.class)
public class TestAssignmentManagerOnCluster {
private static final Log LOG = LogFactory.getLog(TestAssignmentManagerOnCluster.class);
private final static byte[] FAMILY = Bytes.toBytes("FAMILY");
private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private final static Configuration conf = TEST_UTIL.getConfiguration();
@ -766,66 +760,6 @@ public class TestAssignmentManagerOnCluster {
}
}
/**
* This tests that a RIT in offline state gets re-assigned after a master restart
*/
@Test(timeout = 60000)
public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
final HRegionInfo hri = createTableAndGetOneRegion(table);
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
ServerName dstName = null;
for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
if (!tmpServer.equals(serverName)) {
dstName = tmpServer;
break;
}
}
// find a different server
assertTrue(dstName != null);
// shutdown HBase cluster
TEST_UTIL.shutdownMiniHBaseCluster();
// create a RIT node in offline state
ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
ZKAssign.createNodeOffline(zkw, hri, dstName);
Stat stat = new Stat();
byte[] data =
ZKAssign.getDataNoWatch(TEST_UTIL.getZooKeeperWatcher(), hri.getEncodedName(), stat);
assertTrue(data != null);
RegionTransition rt = RegionTransition.parseFrom(data);
assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);
LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
+ " and dst server=" + dstName);
// start HBase cluster
TEST_UTIL.startMiniHBaseCluster(1, 4, MyMaster.class, null);
// wait for the region to be re-assigned.
TEST_UTIL.waitFor(30000, 200, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() throws Exception {
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
if (master != null && master.isInitialized()) {
ServerManager serverManager = master.getServerManager();
return !serverManager.areDeadServersInProgress();
}
return false;
}
});
// verify the region is assigned
master = TEST_UTIL.getHBaseCluster().getMaster();
master.getAssignmentManager().waitForAssignment(hri);
regionStates = master.getAssignmentManager().getRegionStates();
RegionState newState = regionStates.getRegionState(hri);
assertTrue(newState.isOpened());
}
static class MyLoadBalancer extends StochasticLoadBalancer {
// For this region, if specified, always assign to nowhere
static volatile String controledRegion = null;


@ -64,6 +64,7 @@ import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.data.Stat;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@ -972,6 +973,76 @@ public class TestMasterFailover {
TEST_UTIL.shutdownMiniCluster();
}
/**
* This tests that a RIT in offline state gets re-assigned after a master restart
*/
@Test(timeout=240000)
public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
final int NUM_MASTERS = 1;
final int NUM_RS = 2;
// Create config to use for this cluster
Configuration conf = HBaseConfiguration.create();
// Start the cluster
final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
log("Cluster started");
TEST_UTIL.createTable(table, Bytes.toBytes("family"));
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
HRegionInfo hri = regionStates.getRegionsOfTable(table).get(0);
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
ServerName dstName = null;
for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
if (!tmpServer.equals(serverName)) {
dstName = tmpServer;
break;
}
}
// find a different server
assertTrue(dstName != null);
// shutdown HBase cluster
TEST_UTIL.shutdownMiniHBaseCluster();
// create a RIT node in offline state
ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
ZKAssign.createNodeOffline(zkw, hri, dstName);
Stat stat = new Stat();
byte[] data =
ZKAssign.getDataNoWatch(zkw, hri.getEncodedName(), stat);
assertTrue(data != null);
RegionTransition rt = RegionTransition.parseFrom(data);
assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);
LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
+ " and dst server=" + dstName);
// start HBase cluster
TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
while (true) {
master = TEST_UTIL.getHBaseCluster().getMaster();
if (master != null && master.isInitialized()) {
ServerManager serverManager = master.getServerManager();
if (!serverManager.areDeadServersInProgress()) {
break;
}
}
Thread.sleep(200);
}
// verify the region is assigned
master = TEST_UTIL.getHBaseCluster().getMaster();
master.getAssignmentManager().waitForAssignment(hri);
regionStates = master.getAssignmentManager().getRegionStates();
RegionState newState = regionStates.getRegionState(hri);
assertTrue(newState.isOpened());
}
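
The new test waits for dead-server processing with a plain polling loop bounded only by the test timeout; the version removed from TestAssignmentManagerOnCluster earlier in this commit expressed the same wait with a bounded Waiter.Predicate, shown here for comparison (adapted to drop the local ServerManager variable; it would also need an import of org.apache.hadoop.hbase.Waiter).

// Equivalent bounded wait, adapted from the test removed from
// TestAssignmentManagerOnCluster in this same commit.
TEST_UTIL.waitFor(30000, 200, new Waiter.Predicate<Exception>() {
  @Override
  public boolean evaluate() throws Exception {
    HMaster m = TEST_UTIL.getHBaseCluster().getMaster();
    return m != null && m.isInitialized()
        && !m.getServerManager().areDeadServersInProgress();
  }
});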
/**
* Simple test of master failover.
* <p>


@ -66,7 +66,7 @@ public class TestRestartCluster {
String unassignedZNode = zooKeeper.assignmentZNode;
ZKUtil.createAndFailSilent(zooKeeper, unassignedZNode);
ServerName sn = ServerName.valueOf(HMaster.MASTER, -1, System.currentTimeMillis());
ServerName sn = ServerName.valueOf(HMaster.MASTER, 1, System.currentTimeMillis());
ZKAssign.createNodeOffline(zooKeeper, HRegionInfo.FIRST_META_REGIONINFO, sn);