HBASE-12440 Region may remain offline on clean startup under certain race condition (Virag Kothari)
This commit is contained in:
parent
df3ba6ea4b
commit
87fb974765
|
@ -1659,6 +1659,10 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
getLong("hbase.regionserver.rpc.startup.waittime", 60000);
|
getLong("hbase.regionserver.rpc.startup.waittime", 60000);
|
||||||
for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
|
for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
|
||||||
try {
|
try {
|
||||||
|
// regionOpenInfos is empty if all regions are in failedToOpenRegions list
|
||||||
|
if (regionOpenInfos.isEmpty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
List<RegionOpeningState> regionOpeningStateList = serverManager
|
List<RegionOpeningState> regionOpeningStateList = serverManager
|
||||||
.sendRegionOpen(destination, regionOpenInfos);
|
.sendRegionOpen(destination, regionOpenInfos);
|
||||||
if (regionOpeningStateList == null) {
|
if (regionOpeningStateList == null) {
|
||||||
|
@ -1920,8 +1924,12 @@ public class AssignmentManager extends ZooKeeperListener {
|
||||||
if (useZKForAssignment
|
if (useZKForAssignment
|
||||||
&& regionStates.isServerDeadAndNotProcessed(sn)
|
&& regionStates.isServerDeadAndNotProcessed(sn)
|
||||||
&& wasRegionOnDeadServerByMeta(region, sn)) {
|
&& wasRegionOnDeadServerByMeta(region, sn)) {
|
||||||
|
if (!regionStates.isRegionInTransition(region)) {
|
||||||
|
LOG.info("Updating the state to " + State.OFFLINE + " to allow to be reassigned by SSH");
|
||||||
|
regionStates.updateRegionState(region, State.OFFLINE);
|
||||||
|
}
|
||||||
LOG.info("Skip assigning " + region.getRegionNameAsString()
|
LOG.info("Skip assigning " + region.getRegionNameAsString()
|
||||||
+ ", it is on a dead but not processed yet server: " + sn);
|
+ ", it is on a dead but not processed yet server: " + sn);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
case CLOSED:
|
case CLOSED:
|
||||||
|
|
|
@ -243,7 +243,7 @@ public class ServerShutdownHandler extends EventHandler {
|
||||||
Lock lock = am.acquireRegionLock(encodedName);
|
Lock lock = am.acquireRegionLock(encodedName);
|
||||||
try {
|
try {
|
||||||
RegionState rit = regionStates.getRegionTransitionState(hri);
|
RegionState rit = regionStates.getRegionTransitionState(hri);
|
||||||
if (processDeadRegion(hri, am)) {
|
if (processDeadRegion(hri, am)) {
|
||||||
ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
|
ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
|
||||||
if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
|
if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
|
||||||
// If this region is in transition on the dead server, it must be
|
// If this region is in transition on the dead server, it must be
|
||||||
|
@ -273,7 +273,7 @@ public class ServerShutdownHandler extends EventHandler {
|
||||||
}
|
}
|
||||||
toAssignRegions.add(hri);
|
toAssignRegions.add(hri);
|
||||||
} else if (rit != null) {
|
} else if (rit != null) {
|
||||||
if (rit.isPendingCloseOrClosing()
|
if ((rit.isPendingCloseOrClosing() || rit.isOffline())
|
||||||
&& am.getTableStateManager().isTableState(hri.getTable(),
|
&& am.getTableStateManager().isTableState(hri.getTable(),
|
||||||
ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
|
ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
|
||||||
// If the table was partially disabled and the RS went down, we should clear the RIT
|
// If the table was partially disabled and the RS went down, we should clear the RIT
|
||||||
|
|
|
@ -81,6 +81,7 @@ import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.experimental.categories.Category;
|
import org.junit.experimental.categories.Category;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This tests AssignmentManager with a testing cluster.
|
* This tests AssignmentManager with a testing cluster.
|
||||||
*/
|
*/
|
||||||
|
@ -188,7 +189,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
|
|
||||||
RegionStates regionStates = am.getRegionStates();
|
RegionStates regionStates = am.getRegionStates();
|
||||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
|
|
||||||
// Region is assigned now. Let's assign it again.
|
// Region is assigned now. Let's assign it again.
|
||||||
// Master should not abort, and region should be assigned.
|
// Master should not abort, and region should be assigned.
|
||||||
|
@ -203,6 +204,58 @@ public class TestAssignmentManagerOnCluster {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Simulate a scenario where the AssignCallable and SSH are trying to assign a region
// whose recorded server is dead: the region is expected to stay OFFLINE and in transition
// while SSH is disabled, and to be opened after SSH is re-enabled.
|
||||||
|
@Test (timeout=60000)
|
||||||
|
public void testAssignRegionBySSH() throws Exception {
|
||||||
|
// Only applies when "hbase.assignment.usezk" is true (defaults to true here); otherwise return early
if (!conf.getBoolean("hbase.assignment.usezk", true)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
String table = "testAssignRegionBySSH";
|
||||||
|
MyMaster master = (MyMaster) TEST_UTIL.getHBaseCluster().getMaster();
|
||||||
|
try {
|
||||||
|
HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table));
|
||||||
|
desc.addFamily(new HColumnDescriptor(FAMILY));
|
||||||
|
admin.createTable(desc);
|
||||||
|
|
||||||
|
// NOTE(review): this HTable is not closed anywhere in this method -- confirm whether it should be
HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
|
||||||
|
HRegionInfo hri = new HRegionInfo(
|
||||||
|
desc.getTableName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
|
||||||
|
MetaTableAccessor.addRegionToMeta(meta, hri);
|
||||||
|
// Add some dummy server for the region entry
|
||||||
|
MetaTableAccessor.updateRegionLocation(TEST_UTIL.getHBaseCluster().getMaster().getShortCircuitConnection(), hri,
|
||||||
|
ServerName.valueOf("example.org", 1234, System.currentTimeMillis()), 0);
|
||||||
|
RegionStates regionStates = master.getAssignmentManager().getRegionStates();
|
||||||
|
int i = TEST_UTIL.getHBaseCluster().getServerWithMeta();
|
||||||
|
// Use region server 1 if meta is on server 0, otherwise region server 0
HRegionServer rs = TEST_UTIL.getHBaseCluster().getRegionServer(i == 0 ? 1 : 0);
|
||||||
|
// Choose a server other than meta to kill
|
||||||
|
ServerName controlledServer = rs.getServerName();
|
||||||
|
// SSH is switched off before the kill; presumably this leaves the dead server
// unprocessed until it is re-enabled below -- TODO confirm against MyMaster.enableSSH
master.enableSSH(false);
|
||||||
|
TEST_UTIL.getHBaseCluster().killRegionServer(controlledServer);
|
||||||
|
TEST_UTIL.getHBaseCluster().waitForRegionServerToStop(controlledServer, -1);
|
||||||
|
AssignmentManager am = master.getAssignmentManager();
|
||||||
|
|
||||||
|
// Simulate the AssignCallable trying to assign the region. Have the region in OFFLINE state,
|
||||||
|
// but not in transition and the server is the dead 'controlledServer'
|
||||||
|
regionStates.createRegionState(hri, State.OFFLINE, controlledServer, null);
|
||||||
|
am.assign(hri, true, true);
|
||||||
|
// Region should remain OFFLINE and go to transition
|
||||||
|
assertEquals(State.OFFLINE, regionStates.getRegionState(hri).getState());
|
||||||
|
assertTrue (regionStates.isRegionInTransition(hri));
|
||||||
|
|
||||||
|
// With SSH back on, wait for the assignment and verify the region is opened on exactly one server
master.enableSSH(true);
|
||||||
|
am.waitForAssignment(hri);
|
||||||
|
assertTrue (regionStates.getRegionState(hri).isOpened());
|
||||||
|
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||||
|
TEST_UTIL.assertRegionOnlyOnServer(hri, serverName, 6000);
|
||||||
|
// Cleanup: re-enable SSH, drop the test table, and start another region server
} finally {
|
||||||
|
if (master != null) {
|
||||||
|
master.enableSSH(true);
|
||||||
|
}
|
||||||
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
|
TEST_UTIL.getHBaseCluster().startRegionServer();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This tests region assignment on a simulated restarted server
|
* This tests region assignment on a simulated restarted server
|
||||||
*/
|
*/
|
||||||
|
@ -277,7 +330,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
RegionStates regionStates = TEST_UTIL.getHBaseCluster().
|
RegionStates regionStates = TEST_UTIL.getHBaseCluster().
|
||||||
getMaster().getAssignmentManager().getRegionStates();
|
getMaster().getAssignmentManager().getRegionStates();
|
||||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
admin.offline(hri.getRegionName());
|
admin.offline(hri.getRegionName());
|
||||||
|
|
||||||
long timeoutTime = System.currentTimeMillis() + 800;
|
long timeoutTime = System.currentTimeMillis() + 800;
|
||||||
|
@ -333,7 +386,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
while (true) {
|
while (true) {
|
||||||
ServerName sn = regionStates.getRegionServerOfRegion(hri);
|
ServerName sn = regionStates.getRegionServerOfRegion(hri);
|
||||||
if (sn != null && sn.equals(destServerName)) {
|
if (sn != null && sn.equals(destServerName)) {
|
||||||
TEST_UTIL.assertRegionOnServer(hri, sn, 200);
|
TEST_UTIL.assertRegionOnServer(hri, sn, 6000);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
long now = System.currentTimeMillis();
|
long now = System.currentTimeMillis();
|
||||||
|
@ -511,7 +564,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
assertTrue(am.waitForAssignment(hri));
|
assertTrue(am.waitForAssignment(hri));
|
||||||
ServerName serverName = master.getAssignmentManager().
|
ServerName serverName = master.getAssignmentManager().
|
||||||
getRegionStates().getRegionServerOfRegion(hri);
|
getRegionStates().getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
} finally {
|
} finally {
|
||||||
MyRegionObserver.preCloseEnabled.set(false);
|
MyRegionObserver.preCloseEnabled.set(false);
|
||||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
|
@ -552,7 +605,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
|
|
||||||
ServerName serverName = master.getAssignmentManager().
|
ServerName serverName = master.getAssignmentManager().
|
||||||
getRegionStates().getRegionServerOfRegion(hri);
|
getRegionStates().getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
} finally {
|
} finally {
|
||||||
MyLoadBalancer.controledRegion = null;
|
MyLoadBalancer.controledRegion = null;
|
||||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
|
@ -602,7 +655,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
|
|
||||||
ServerName serverName = master.getAssignmentManager().
|
ServerName serverName = master.getAssignmentManager().
|
||||||
getRegionStates().getRegionServerOfRegion(hri);
|
getRegionStates().getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
} finally {
|
} finally {
|
||||||
TEST_UTIL.deleteTable(table);
|
TEST_UTIL.deleteTable(table);
|
||||||
}
|
}
|
||||||
|
@ -634,7 +687,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
if (ConfigUtil.useZKForAssignment(conf)) {
|
if (ConfigUtil.useZKForAssignment(conf)) {
|
||||||
ZKAssign.createNodeOffline(zkw, hri, destServerName);
|
ZKAssign.createNodeOffline(zkw, hri, destServerName);
|
||||||
ZKAssign.transitionNodeOpening(zkw, hri, destServerName);
|
ZKAssign.transitionNodeOpening(zkw, hri, destServerName);
|
||||||
|
|
||||||
// Wait till the event is processed and the region is in transition
|
// Wait till the event is processed and the region is in transition
|
||||||
long timeoutTime = System.currentTimeMillis() + 20000;
|
long timeoutTime = System.currentTimeMillis() + 20000;
|
||||||
while (!am.getRegionStates().isRegionInTransition(hri)) {
|
while (!am.getRegionStates().isRegionInTransition(hri)) {
|
||||||
|
@ -705,7 +758,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
assertTrue(am.waitForAssignment(hri));
|
assertTrue(am.waitForAssignment(hri));
|
||||||
ServerName serverName = master.getAssignmentManager().
|
ServerName serverName = master.getAssignmentManager().
|
||||||
getRegionStates().getRegionServerOfRegion(hri);
|
getRegionStates().getRegionServerOfRegion(hri);
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
} finally {
|
} finally {
|
||||||
MyRegionObserver.postCloseEnabled.set(false);
|
MyRegionObserver.postCloseEnabled.set(false);
|
||||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||||
|
@ -1106,7 +1159,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
cluster.startRegionServer();
|
cluster.startRegionServer();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test that region state transition call is idempotent
|
* Test that region state transition call is idempotent
|
||||||
*/
|
*/
|
||||||
|
@ -1129,7 +1182,7 @@ public class TestAssignmentManagerOnCluster {
|
||||||
RegionStates regionStates = am.getRegionStates();
|
RegionStates regionStates = am.getRegionStates();
|
||||||
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
ServerName serverName = regionStates.getRegionServerOfRegion(hri);
|
||||||
// Assert that the region is actually open on the server
|
// Assert that the region is actually open on the server
|
||||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
TEST_UTIL.assertRegionOnServer(hri, serverName, 6000);
|
||||||
// Closing region should just work fine
|
// Closing region should just work fine
|
||||||
admin.disableTable(TableName.valueOf(table));
|
admin.disableTable(TableName.valueOf(table));
|
||||||
assertTrue(regionStates.isRegionOffline(hri));
|
assertTrue(regionStates.isRegionOffline(hri));
|
||||||
|
|
Loading…
Reference in New Issue