HBASE-12480 Regions in FAILED_OPEN/FAILED_CLOSE should be processed on master failover

This commit is contained in:
Virag Kothari 2015-01-13 11:03:16 -08:00
parent 72a6a670ac
commit 4ac457a7bc
2 changed files with 49 additions and 7 deletions

View File

@ -450,8 +450,9 @@ public class AssignmentManager {
Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
if (!regionsInTransition.isEmpty()) {
for (RegionState regionState: regionsInTransition.values()) {
ServerName serverName = regionState.getServerName();
if (!regionState.getRegion().isMetaRegion()
&& onlineServers.contains(regionState.getServerName())) { && serverName != null && onlineServers.contains(serverName)) {
LOG.debug("Found " + regionState + " in RITs");
failover = true;
break;
@ -1694,18 +1695,23 @@ public class AssignmentManager {
/**
* Processes list of regions in transition at startup
*/
void processRegionsInTransition(Collection<RegionState> regionStates) { void processRegionsInTransition(Collection<RegionState> regionsInTransition) {
// We need to send RPC call again for PENDING_OPEN/PENDING_CLOSE regions
// in case the RPC call is not sent out yet before the master was shut down
// since we update the state before we send the RPC call. We can't update
// the state after the RPC call. Otherwise, we don't know what's happened
// to the region if the master dies right after the RPC call is out.
for (RegionState regionState: regionStates) { for (RegionState regionState: regionsInTransition) {
if (!serverManager.isServerOnline(regionState.getServerName())) { LOG.info("Processing " + regionState);
ServerName serverName = regionState.getServerName();
// Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that
// case, try assigning it here.
if (serverName != null && !serverManager.getOnlineServers().containsKey(serverName)) {
LOG.info("Server " + serverName + " isn't online. SSH will handle this");
continue; // SSH will handle it
}
HRegionInfo regionInfo = regionState.getRegion();
RegionState.State state = regionState.getState();
LOG.info("Processing " + regionState);
switch (state) {
case CLOSED:
invokeAssign(regionState.getRegion());
@ -1716,6 +1722,10 @@ public class AssignmentManager {
case PENDING_CLOSE:
retrySendRegionClose(regionState);
break;
case FAILED_CLOSE:
case FAILED_OPEN:
invokeUnAssign(regionInfo);
break;
default:
// No process for other states
}

View File

@ -261,6 +261,35 @@ public class TestMasterFailover {
newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null);
createRegion(failedClose, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose);
oldState = new RegionState(failedClose, State.PENDING_CLOSE);
newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null);
createRegion(failedOpen, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen);
// Simulate a region transitioning to failed open when the region server reports the
// transition as FAILED_OPEN
oldState = new RegionState(failedOpen, State.PENDING_OPEN);
newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName());
stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null);
LOG.info("Failed open NUll server " + failedOpenNullServer.getEncodedName());
createRegion(failedOpenNullServer, rootdir, conf, offlineTable);
MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer);
// Simulate a region transitioning to failed open when the master couldn't find a plan for
// the region
oldState = new RegionState(failedOpenNullServer, State.OFFLINE);
newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null);
stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
// Stop the master
log("Aborting master");
cluster.abortMaster(0);
@ -283,6 +312,9 @@ public class TestMasterFailover {
// Both pending_open (RPC sent/not yet) regions should be online
assertTrue(regionStates.isRegionOnline(hriOffline));
assertTrue(regionStates.isRegionOnline(hriOnline));
assertTrue(regionStates.isRegionOnline(failedClose));
assertTrue(regionStates.isRegionOnline(failedOpenNullServer));
assertTrue(regionStates.isRegionOnline(failedOpen));
log("Done with verification, shutting down cluster");