HBASE-12440 Region may remain offline on clean startup under certain race condition (Virag Kothari)
This commit is contained in:
parent
df3ba6ea4b
commit
87fb974765
|
@ -1659,6 +1659,10 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
getLong("hbase.regionserver.rpc.startup.waittime", 60000);
|
||||
for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
|
||||
try {
|
||||
// regionOpenInfos is empty if all regions are in failedToOpenRegions list
|
||||
if (regionOpenInfos.isEmpty()) {
|
||||
break;
|
||||
}
|
||||
List<RegionOpeningState> regionOpeningStateList = serverManager
|
||||
.sendRegionOpen(destination, regionOpenInfos);
|
||||
if (regionOpeningStateList == null) {
|
||||
|
@ -1920,8 +1924,12 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
if (useZKForAssignment
|
||||
&& regionStates.isServerDeadAndNotProcessed(sn)
|
||||
&& wasRegionOnDeadServerByMeta(region, sn)) {
|
||||
if (!regionStates.isRegionInTransition(region)) {
|
||||
LOG.info("Updating the state to " + State.OFFLINE + " to allow to be reassigned by SSH");
|
||||
regionStates.updateRegionState(region, State.OFFLINE);
|
||||
}
|
||||
LOG.info("Skip assigning " + region.getRegionNameAsString()
|
||||
+ ", it is on a dead but not processed yet server: " + sn);
|
||||
+ ", it is on a dead but not processed yet server: " + sn);
|
||||
return null;
|
||||
}
|
||||
case CLOSED:
|
||||
|
|
|
@ -243,7 +243,7 @@ public class ServerShutdownHandler extends EventHandler {
|
|||
Lock lock = am.acquireRegionLock(encodedName);
|
||||
try {
|
||||
RegionState rit = regionStates.getRegionTransitionState(hri);
|
||||
if (processDeadRegion(hri, am)) {
|
||||
if (processDeadRegion(hri, am)) {
|
||||
ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
|
||||
if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
|
||||
// If this region is in transition on the dead server, it must be
|
||||
|
@ -273,7 +273,7 @@ public class ServerShutdownHandler extends EventHandler {
|
|||
}
|
||||
toAssignRegions.add(hri);
|
||||
} else if (rit != null) {
|
||||
if (rit.isPendingCloseOrClosing()
|
||||
if ((rit.isPendingCloseOrClosing() || rit.isOffline())
|
||||
&& am.getTableStateManager().isTableState(hri.getTable(),
|
||||
ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
|
||||
// If the table was partially disabled and the RS went down, we should clear the RIT
|
||||
|
|
|
@ -81,6 +81,7 @@ import org.junit.BeforeClass;
|
|||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
|
||||
/**
|
||||
* This tests AssignmentManager with a testing cluster.
|
||||
*/
|
||||
|
@ -188,7 +189,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
|
||||
RegionStates regionStates = am.getRegionStates();
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
|
||||
// Region is assigned now. Let's assign it again.
|
||||
// Master should not abort, and region should be assigned.
|
||||
|
@ -203,6 +204,58 @@ public class TestAssignmentManagerOnCluster {
|
|||
}
|
||||
}
|
||||
|
||||
// Simulate a scenario where the AssignCallable and SSH are trying to assign a region
|
||||
@Test (timeout=60000)
|
||||
public void testAssignRegionBySSH() throws Exception {
|
||||
if (!conf.getBoolean("hbase.assignment.usezk", true)) {
|
||||
return;
|
||||
}
|
||||
String table = "testAssignRegionBySSH";
|
||||
MyMaster master = (MyMaster) TEST_UTIL.getHBaseCluster().getMaster();
|
||||
try {
|
||||
HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table));
|
||||
desc.addFamily(new HColumnDescriptor(FAMILY));
|
||||
admin.createTable(desc);
|
||||
|
||||
HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
|
||||
HRegionInfo hri = new HRegionInfo(
|
||||
desc.getTableName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
|
||||
MetaTableAccessor.addRegionToMeta(meta, hri);
|
||||
// Add some dummy server for the region entry
|
||||
MetaTableAccessor.updateRegionLocation(TEST_UTIL.getHBaseCluster().getMaster().getShortCircuitConnection(), hri,
|
||||
ServerName.valueOf("example.org", 1234, System.currentTimeMillis()), 0);
|
||||
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
|
||||
int i = TEST_UTIL.getHBaseCluster().getServerWithMeta();
|
||||
HRegionServer rs = TEST_UTIL.getHBaseCluster().getRegionServer(i == 0 ? 1 : 0);
|
||||
// Choose a server other than meta to kill
|
||||
ServerName controlledServer = rs.getServerName();
|
||||
master.enableSSH(false);
|
||||
TEST_UTIL.getHBaseCluster().killRegionServer(controlledServer);
|
||||
TEST_UTIL.getHBaseCluster().waitForRegionServerToStop(controlledServer, -1);
|
||||
AssignmentManager am = master.getAssignmentManager();
|
||||
|
||||
// Simulate the AssignCallable trying to assign the region. Have the region in OFFLINE state,
|
||||
// but not in transition and the server is the dead 'controlledServer'
|
||||
regionStates.createRegionState(hri, State.OFFLINE, controlledServer, null);
|
||||
am.assign(hri, true, true);
|
||||
// Region should remain OFFLINE and go to transition
|
||||
assertEquals(State.OFFLINE, regionStates.getRegionState(hri).getState());
|
||||
assertTrue (regionStates.isRegionInTransition(hri));
|
||||
|
||||
master.enableSSH(true);
|
||||
am.waitForAssignment(hri);
|
||||
assertTrue (regionStates.getRegionState(hri).isOpened());
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnlyOnServer(hri, serverName, 6000);
|
||||
} finally {
|
||||
if (master != null) {
|
||||
master.enableSSH(true);
|
||||
}
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
TEST_UTIL.getHBaseCluster().startRegionServer();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This tests region assignment on a simulated restarted server
|
||||
*/
|
||||
|
@ -277,7 +330,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
RegionStates regionStates = TEST_UTIL.getHBaseCluster().
|
||||
getMaster().getAssignmentManager().getRegionStates();
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
admin.offline(hri.getRegionName());
|
||||
|
||||
long timeoutTime = System.currentTimeMillis() + 800;
|
||||
|
@ -333,7 +386,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
while (true) {
|
||||
ServerName sn = regionStates.getRegionServerOfRegion(hri);
|
||||
if (sn != null && sn.equals(destServerName)) {
|
||||
TEST_UTIL.assertRegionOnServer(hri, sn, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, sn, 6000);
|
||||
break;
|
||||
}
|
||||
long now = System.currentTimeMillis();
|
||||
|
@ -511,7 +564,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
assertTrue(am.waitForAssignment(hri));
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
} finally {
|
||||
MyRegionObserver.preCloseEnabled.set(false);
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
|
@ -552,7 +605,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
} finally {
|
||||
MyLoadBalancer.controledRegion = null;
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
|
@ -602,7 +655,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
} finally {
|
||||
TEST_UTIL.deleteTable(table);
|
||||
}
|
||||
|
@ -634,7 +687,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
if (ConfigUtil.useZKForAssignment(conf)) {
|
||||
ZKAssign.createNodeOffline(zkw, hri, destServerName);
|
||||
ZKAssign.transitionNodeOpening(zkw, hri, destServerName);
|
||||
|
||||
|
||||
// Wait till the event is processed and the region is in transition
|
||||
long timeoutTime = System.currentTimeMillis() + 20000;
|
||||
while (!am.getRegionStates().isRegionInTransition(hri)) {
|
||||
|
@ -705,7 +758,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
assertTrue(am.waitForAssignment(hri));
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
} finally {
|
||||
MyRegionObserver.postCloseEnabled.set(false);
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
|
@ -1106,7 +1159,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
cluster.startRegionServer();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test that region state transition call is idempotent
|
||||
*/
|
||||
|
@ -1129,7 +1182,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
RegionStates regionStates = am.getRegionStates();
|
||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||
// Assert the region is actually open on the server
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||
// Closing region should just work fine
|
||||
admin.disableTable(TableName.valueOf(table));
|
||||
assertTrue(regionStates.isRegionOffline(hri));
|
||||
|
|
Loading…
Reference in New Issue