HBASE-10101 testOfflineRegionReAssginedAfterMasterRestart times out sometimes
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1549708 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
08bd55aab2
commit
97191c921b
|
@ -613,14 +613,31 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
return true;
|
||||
}
|
||||
if (!serverManager.isServerOnline(sn)) {
|
||||
// It was on a dead server, it's closed now. Force to OFFLINE and put
|
||||
// it in transition. Try to re-assign it, but it will fail most likely,
|
||||
// since we have not done log splitting for the dead server yet.
|
||||
// It was transitioning on a dead server, so it's closed now.
|
||||
// Force to OFFLINE and put it in transition, but not assign it
|
||||
// since log splitting for the dead server is not done yet.
|
||||
LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
|
||||
" was on deadserver; forcing offline");
|
||||
ZKAssign.createOrForceNodeOffline(this.watcher, regionInfo, sn);
|
||||
if (regionStates.isRegionOnline(regionInfo)) {
|
||||
// Meta could still show the region is assigned to the previous
|
||||
// server. If that server is online, when we reload the meta, the
|
||||
// region is put back to online, we need to offline it.
|
||||
regionStates.regionOffline(regionInfo);
|
||||
}
|
||||
// Put it back in transition so that SSH can re-assign it
|
||||
regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
|
||||
invokeAssign(regionInfo);
|
||||
// No matter whether the previous server is online or offline,
|
||||
// we need to reset the last region server of the region.
|
||||
regionStates.setLastRegionServerOfRegion(sn, encodedName);
|
||||
if (regionInfo.isMetaRegion()) {
|
||||
// If it's meta region, reset the meta location.
|
||||
// So that master knows the right meta region server.
|
||||
MetaRegionTracker.setMetaLocation(watcher, sn);
|
||||
}
|
||||
// Make sure we know the server is dead.
|
||||
if (!serverManager.isServerDead(sn)) {
|
||||
serverManager.expireServer(sn);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
switch (et) {
|
||||
|
|
|
@ -1006,7 +1006,9 @@ MasterServices, Server {
|
|||
if (!rit) {
|
||||
// Assign meta since not already in transition
|
||||
if (currentMetaServer != null) {
|
||||
if (expireIfOnline(currentMetaServer)) {
|
||||
if (!serverManager.isServerDead(currentMetaServer)) {
|
||||
LOG.info("Forcing expire of " + currentMetaServer);
|
||||
serverManager.expireServer(currentMetaServer);
|
||||
splitMetaLogBeforeAssignment(currentMetaServer);
|
||||
if (this.distributedLogReplay) {
|
||||
logReplayFailedMetaServer = currentMetaServer;
|
||||
|
@ -1087,22 +1089,6 @@ MasterServices, Server {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expire a server if we find it is one of the online servers.
|
||||
* @param sn ServerName to check.
|
||||
* @return true when server <code>sn</code> is being expired by the function.
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean expireIfOnline(final ServerName sn)
|
||||
throws IOException {
|
||||
if (sn == null || !serverManager.isServerOnline(sn)) {
|
||||
return false;
|
||||
}
|
||||
LOG.info("Forcing expire of " + sn);
|
||||
serverManager.expireServer(sn);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function returns a set of region server names under hbase:meta recovering region ZK node
|
||||
* @return Set of meta server names which were recorded in ZK
|
||||
|
|
|
@ -511,8 +511,9 @@ public class RegionStates {
|
|||
// region is not open on this server. So the region must be
|
||||
// moving to this server from another one (i.e. opening or
|
||||
// pending open on this server, was open on another one.
|
||||
// It could be in failed_close state too if tried several times
|
||||
// to open it while the server is not reachable.
|
||||
// Offline state is also kind of pending open if the region is in
|
||||
// transition. The region could be in failed_close state too if we have
|
||||
// tried several times to open it while this region server is not reachable)
|
||||
if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
|
||||
LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
|
||||
rits.add(hri);
|
||||
|
@ -623,10 +624,15 @@ public class RegionStates {
|
|||
synchronized void setLastRegionServerOfRegions(
|
||||
final ServerName serverName, final List<HRegionInfo> regionInfos) {
|
||||
for (HRegionInfo hri: regionInfos) {
|
||||
lastAssignments.put(hri.getEncodedName(), serverName);
|
||||
setLastRegionServerOfRegion(serverName, hri.getEncodedName());
|
||||
}
|
||||
}
|
||||
|
||||
synchronized void setLastRegionServerOfRegion(
|
||||
final ServerName serverName, final String encodedName) {
|
||||
lastAssignments.put(encodedName, serverName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the average load across all region servers.
|
||||
* Currently, this uses a very naive computation - just uses the number of
|
||||
|
|
|
@ -832,8 +832,8 @@ public class TestAssignmentManager {
|
|||
|
||||
/**
|
||||
* Test the scenario when the master is in failover and trying to process a
|
||||
* region which is in Opening state on a dead RS. Master should immediately
|
||||
* assign the region and not wait for Timeout Monitor (HBASE-5882).
|
||||
* region which is in Opening state on a dead RS. Master will force offline the
|
||||
* region and put it in transition. AM relies on SSH to reassign it.
|
||||
*/
|
||||
@Test(timeout = 60000)
|
||||
public void testRegionInOpeningStateOnDeadRSWhileMasterFailover() throws IOException,
|
||||
|
@ -851,7 +851,10 @@ public class TestAssignmentManager {
|
|||
am.getRegionStates().logSplit(SERVERNAME_A); // Assume log splitting is done
|
||||
am.getRegionStates().createRegionState(REGIONINFO);
|
||||
am.gate.set(false);
|
||||
am.processRegionsInTransition(rt, REGIONINFO, version);
|
||||
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
|
||||
assertFalse(am.processRegionsInTransition(rt, REGIONINFO, version));
|
||||
am.getZKTable().setEnabledTable(REGIONINFO.getTable());
|
||||
processServerShutdownHandler(ct, am, false);
|
||||
// Waiting for the assignment to get completed.
|
||||
while (!am.gate.get()) {
|
||||
Thread.sleep(10);
|
||||
|
@ -1159,6 +1162,8 @@ public class TestAssignmentManager {
|
|||
public void assign(List<HRegionInfo> regions)
|
||||
throws IOException, InterruptedException {
|
||||
assignInvoked = (regions != null && regions.size() > 0);
|
||||
super.assign(regions);
|
||||
this.gate.set(true);
|
||||
}
|
||||
|
||||
/** reset the watcher */
|
||||
|
|
|
@ -29,8 +29,6 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -41,11 +39,9 @@ import org.apache.hadoop.hbase.HRegionInfo;
|
|||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.MediumTests;
|
||||
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||
import org.apache.hadoop.hbase.RegionTransition;
|
||||
import org.apache.hadoop.hbase.ServerLoad;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.Waiter;
|
||||
import org.apache.hadoop.hbase.catalog.MetaEditor;
|
||||
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
|
@ -64,7 +60,6 @@ import org.apache.hadoop.hbase.util.FSUtils;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.data.Stat;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
@ -75,7 +70,6 @@ import org.junit.experimental.categories.Category;
|
|||
*/
|
||||
@Category(MediumTests.class)
|
||||
public class TestAssignmentManagerOnCluster {
|
||||
private static final Log LOG = LogFactory.getLog(TestAssignmentManagerOnCluster.class);
|
||||
private final static byte[] FAMILY = Bytes.toBytes("FAMILY");
|
||||
private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
|
||||
private final static Configuration conf = TEST_UTIL.getConfiguration();
|
||||
|
@ -766,66 +760,6 @@ public class TestAssignmentManagerOnCluster {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This tests that a RIT in offline state will get re-assigned after a master restart.
|
||||
*/
|
||||
@Test(timeout = 60000)
|
||||
public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
|
||||
final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
|
||||
final HRegionInfo hri = createTableAndGetOneRegion(table);
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
|
||||
ServerName dstName = null;
|
||||
for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
|
||||
if (!tmpServer.equals(serverName)) {
|
||||
dstName = tmpServer;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// find a different server
|
||||
assertTrue(dstName != null);
|
||||
// shutdown HBase cluster
|
||||
TEST_UTIL.shutdownMiniHBaseCluster();
|
||||
// create a RIT node in offline state
|
||||
ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
|
||||
ZKAssign.createNodeOffline(zkw, hri, dstName);
|
||||
Stat stat = new Stat();
|
||||
byte[] data =
|
||||
ZKAssign.getDataNoWatch(TEST_UTIL.getZooKeeperWatcher(), hri.getEncodedName(), stat);
|
||||
assertTrue(data != null);
|
||||
RegionTransition rt = RegionTransition.parseFrom(data);
|
||||
assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);
|
||||
|
||||
LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
|
||||
+ " and dst server=" + dstName);
|
||||
|
||||
// start HBase cluster
|
||||
TEST_UTIL.startMiniHBaseCluster(1, 4, MyMaster.class, null);
|
||||
|
||||
// wait for the region to be re-assigned.
|
||||
TEST_UTIL.waitFor(30000, 200, new Waiter.Predicate<Exception>() {
|
||||
@Override
|
||||
public boolean evaluate() throws Exception {
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
if (master != null && master.isInitialized()) {
|
||||
ServerManager serverManager = master.getServerManager();
|
||||
return !serverManager.areDeadServersInProgress();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
});
|
||||
|
||||
// verify the region is assigned
|
||||
master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
master.getAssignmentManager().waitForAssignment(hri);
|
||||
regionStates = master.getAssignmentManager().getRegionStates();
|
||||
RegionState newState = regionStates.getRegionState(hri);
|
||||
assertTrue(newState.isOpened());
|
||||
}
|
||||
|
||||
static class MyLoadBalancer extends StochasticLoadBalancer {
|
||||
// For this region, if specified, always assign to nowhere
|
||||
static volatile String controledRegion = null;
|
||||
|
|
|
@ -64,6 +64,7 @@ import org.apache.hadoop.hbase.util.Threads;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKTable;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.zookeeper.data.Stat;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
|
@ -972,6 +973,76 @@ public class TestMasterFailover {
|
|||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
/**
|
||||
* This tests that a RIT in offline state will get re-assigned after a master restart.
|
||||
*/
|
||||
@Test(timeout=240000)
|
||||
public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
|
||||
final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
|
||||
final int NUM_MASTERS = 1;
|
||||
final int NUM_RS = 2;
|
||||
|
||||
// Create config to use for this cluster
|
||||
Configuration conf = HBaseConfiguration.create();
|
||||
|
||||
// Start the cluster
|
||||
final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
|
||||
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
|
||||
log("Cluster started");
|
||||
|
||||
TEST_UTIL.createTable(table, Bytes.toBytes("family"));
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
|
||||
HRegionInfo hri = regionStates.getRegionsOfTable(table).get(0);
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
|
||||
ServerName dstName = null;
|
||||
for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
|
||||
if (!tmpServer.equals(serverName)) {
|
||||
dstName = tmpServer;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// find a different server
|
||||
assertTrue(dstName != null);
|
||||
// shutdown HBase cluster
|
||||
TEST_UTIL.shutdownMiniHBaseCluster();
|
||||
// create a RIT node in offline state
|
||||
ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
|
||||
ZKAssign.createNodeOffline(zkw, hri, dstName);
|
||||
Stat stat = new Stat();
|
||||
byte[] data =
|
||||
ZKAssign.getDataNoWatch(zkw, hri.getEncodedName(), stat);
|
||||
assertTrue(data != null);
|
||||
RegionTransition rt = RegionTransition.parseFrom(data);
|
||||
assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);
|
||||
|
||||
LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
|
||||
+ " and dst server=" + dstName);
|
||||
|
||||
// start HBase cluster
|
||||
TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
|
||||
|
||||
while (true) {
|
||||
master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
if (master != null && master.isInitialized()) {
|
||||
ServerManager serverManager = master.getServerManager();
|
||||
if (!serverManager.areDeadServersInProgress()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Thread.sleep(200);
|
||||
}
|
||||
|
||||
// verify the region is assigned
|
||||
master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
master.getAssignmentManager().waitForAssignment(hri);
|
||||
regionStates = master.getAssignmentManager().getRegionStates();
|
||||
RegionState newState = regionStates.getRegionState(hri);
|
||||
assertTrue(newState.isOpened());
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple test of master failover.
|
||||
* <p>
|
||||
|
|
|
@ -66,7 +66,7 @@ public class TestRestartCluster {
|
|||
String unassignedZNode = zooKeeper.assignmentZNode;
|
||||
ZKUtil.createAndFailSilent(zooKeeper, unassignedZNode);
|
||||
|
||||
ServerName sn = ServerName.valueOf(HMaster.MASTER, -1, System.currentTimeMillis());
|
||||
ServerName sn = ServerName.valueOf(HMaster.MASTER, 1, System.currentTimeMillis());
|
||||
|
||||
ZKAssign.createNodeOffline(zooKeeper, HRegionInfo.FIRST_META_REGIONINFO, sn);
|
||||
|
||||
|
|
Loading…
Reference in New Issue