HBASE-8545 Meta stuck in transition when it is assigned to a just restarted dead region server

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1484875 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
jxiang 2013-05-21 17:19:02 +00:00
parent 634f63ef3c
commit aa12c8ac72
4 changed files with 80 additions and 4 deletions

View File

@ -798,7 +798,6 @@ public class ZKAssign {
// Node no longer exists. Return -1. It means unsuccessful transition.
return -1;
}
RegionTransition rt = getRegionTransition(existingBytes);
// Verify it is the expected version
if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
@ -808,7 +807,9 @@ public class ZKAssign {
"the node existed but was version " + stat.getVersion() +
" not the expected version " + expectedVersion));
return -1;
} else if (beginState.equals(EventType.M_ZK_REGION_OFFLINE)
}
if (beginState.equals(EventType.M_ZK_REGION_OFFLINE)
&& endState.equals(EventType.RS_ZK_REGION_OPENING)
&& expectedVersion == -1 && stat.getVersion() != 0) {
// The check below ensures that double assignment does not happen.
@ -822,6 +823,18 @@ public class ZKAssign {
return -1;
}
RegionTransition rt = getRegionTransition(existingBytes);
// Verify that the server the transition happens on has not changed
if (!rt.getServerName().equals(serverName)) {
LOG.warn(zkw.prefix("Attempt to transition the " +
"unassigned node for " + encoded +
" from " + beginState + " to " + endState + " failed, " +
"the server that tried to transition was " + serverName +
" not the expected " + rt.getServerName()));
return -1;
}
// Verify it is in expected state
EventType et = rt.getEventType();
if (!et.equals(beginState)) {

View File

@ -534,7 +534,7 @@ public class AssignmentManager extends ZooKeeperListener {
EventType et = rt.getEventType();
// Get ServerName. Could not be null.
final ServerName sn = rt.getServerName();
String encodedRegionName = regionInfo.getEncodedName();
final String encodedRegionName = regionInfo.getEncodedName();
LOG.info("Processing region " + regionInfo.getRegionNameAsString() + " in state " + et);
@ -592,6 +592,8 @@ public class AssignmentManager extends ZooKeeperListener {
public void process() throws IOException {
ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
try {
RegionPlan plan = new RegionPlan(regionInfo, null, sn);
addPlan(encodedRegionName, plan);
assign(rs, false, false);
} finally {
lock.unlock();

View File

@ -399,7 +399,7 @@ public class TestAssignmentManager {
assertNotSame(-1, versionid);
// This ugliness below is what the openregionhandler on RS side does.
versionid = ZKAssign.transitionNode(server.getZooKeeper(), REGIONINFO,
SERVERNAME_A, EventType.M_ZK_REGION_OFFLINE,
SERVERNAME_B, EventType.M_ZK_REGION_OFFLINE,
EventType.RS_ZK_REGION_OPENING, versionid);
assertNotSame(-1, versionid);
// Move znode from OPENING to OPENED as RS does on successful open.

View File

@ -24,6 +24,7 @@ import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@ -34,6 +35,7 @@ import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
@ -43,6 +45,7 @@ import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;
@ -110,6 +113,64 @@ public class TestAssignmentManagerOnCluster {
}
}
/**
 * This tests region assignment on a simulated restarted server.
 *
 * <p>The test registers a fake "dead" server that shares the host name and port of a
 * live region server but carries an older start code, plans a region onto that fake
 * server, and then checks that:
 * <ul>
 *   <li>the live server cannot move the unassigned znode from
 *       {@code M_ZK_REGION_OFFLINE} to {@code RS_ZK_REGION_OPENING}
 *       ({@code ZKAssign.transitionNode} returns -1);</li>
 *   <li>the region is still in transition after a short wait; and</li>
 *   <li>the assignment znode is still at version 0.</li>
 * </ul>
 *
 * @throws Exception if cluster setup, the assignment calls, or cleanup fail
 */
@Test
public void testAssignRegionOnRestartedServer() throws Exception {
  String table = "testAssignRegionOnRestartedServer";
  ServerName deadServer = null;
  HMaster master = null;
  try {
    HTableDescriptor desc = new HTableDescriptor(table);
    desc.addFamily(new HColumnDescriptor(FAMILY));
    admin.createTable(desc);

    HRegionInfo hri = new HRegionInfo(
      desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
    // The meta table handle is only needed to register the region; close it right
    // after use so the test does not leak the table's resources.
    HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
    try {
      MetaEditor.addRegionToMeta(meta, hri);
    } finally {
      meta.close();
    }

    master = TEST_UTIL.getHBaseCluster().getMaster();
    Set<ServerName> onlineServers = master.serverManager.getOnlineServers().keySet();
    assertFalse("There should be some servers online", onlineServers.isEmpty());

    // Use the first server as the destination server
    ServerName destServer = onlineServers.iterator().next();

    // Create a fake dead server: same host and port as the destination server,
    // but with a smaller start code.
    deadServer = new ServerName(destServer.getHostname(),
      destServer.getPort(), destServer.getStartcode() - 100L);
    master.serverManager.recordNewServer(deadServer, ServerLoad.EMPTY_SERVERLOAD);

    // Plan the region onto the fake dead server and kick off the assignment.
    AssignmentManager am = master.getAssignmentManager();
    RegionPlan plan = new RegionPlan(hri, null, deadServer);
    am.addPlan(hri.getEncodedName(), plan);
    master.assignRegion(hri);

    // The live server tries to claim the znode the way a region server would when
    // opening the region. NOTE(review): presumably this is rejected because the
    // znode names the fake dead server rather than destServer -- confirm against
    // ZKAssign.transitionNode.
    int version = ZKAssign.transitionNode(master.getZooKeeper(), hri,
      destServer, EventType.M_ZK_REGION_OFFLINE,
      EventType.RS_ZK_REGION_OPENING, 0);
    assertEquals("TansitionNode should fail", -1, version);

    // Give region 2 seconds to assign, which may not be enough.
    // However, if HBASE-8545 is broken, this test will be flaky.
    // Otherwise, this test should never be flaky.
    Thread.sleep(2000);

    assertTrue("Region should still be in transition",
      am.getRegionStates().isRegionInTransition(hri));
    assertEquals("Assign node should still be in version 0", 0,
      ZKAssign.getVersion(master.getZooKeeper(), hri));
  } finally {
    // deadServer is only assigned after master, so master is non-null here.
    if (deadServer != null) {
      master.serverManager.expireServer(deadServer);
    }

    TEST_UTIL.deleteTable(Bytes.toBytes(table));
  }
}
/**
* This tests offlining a region
*/