diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index e39adc8bf18..262ffeedae0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -550,8 +550,9 @@ public class AssignmentManager extends ZooKeeperListener { if (!regionsInTransition.isEmpty()) { Set onlineServers = serverManager.getOnlineServers().keySet(); for (RegionState regionState: regionsInTransition.values()) { + ServerName serverName = regionState.getServerName(); if (!regionState.getRegion().isMetaRegion() - && onlineServers.contains(regionState.getServerName())) { + && serverName != null && onlineServers.contains(serverName)) { LOG.debug("Found " + regionState + " in RITs"); failover = true; break; @@ -2986,15 +2987,22 @@ public class AssignmentManager extends ZooKeeperListener { // the state after the RPC call. Otherwise, we don't know what's happened // to the region if the master dies right after the RPC call is out. Map rits = regionStates.getRegionsInTransition(); - for (RegionState regionState: rits.values()) { - if (!serverManager.isServerOnline(regionState.getServerName())) { - continue; // SSH will handle it - } - State state = regionState.getState(); + for (RegionState regionState : rits.values()) { LOG.info("Processing " + regionState); + ServerName serverName = regionState.getServerName(); + // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that + // case, try assigning it here. + if (serverName != null + && !serverManager.getOnlineServers().containsKey(serverName)) { + LOG.info("Server " + serverName + " isn't online. SSH will handle this"); + continue; + } + HRegionInfo regionInfo = regionState.getRegion(); + State state = regionState.getState(); + switch (state) { case CLOSED: - invokeAssign(regionState.getRegion()); + invokeAssign(regionInfo); break; case PENDING_OPEN: retrySendRegionOpen(regionState); @@ -3002,6 +3010,10 @@ public class AssignmentManager extends ZooKeeperListener { case PENDING_CLOSE: retrySendRegionClose(regionState); break; + case FAILED_CLOSE: + case FAILED_OPEN: + invokeUnAssign(regionInfo); + break; default: // No process for other states } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index 26e46c68e76..8ae26a350b7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -1064,8 +1064,8 @@ public class TestMasterFailover { RegionState newState = regionStates.getRegionState(hri); assertTrue(newState.isOpened()); } - - /** + + /** * Simple test of master failover. *

* Starts with three masters. Kills a backup master. Then kills the active @@ -1176,7 +1176,7 @@ public class TestMasterFailover { } /** - * Test region in pending_open/close when master failover + * Test region in pending_open/close and failed_open/close when master failover */ @Test (timeout=180000) @SuppressWarnings("deprecation") @@ -1246,6 +1246,37 @@ public class TestMasterFailover { oldState = new RegionState(hriOffline, State.OFFLINE); newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName()); stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedClose, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose); + + oldState = new RegionState(failedClose, State.PENDING_CLOSE); + newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + + HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpen, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen); + + // Simulate a region transitioning to failed open when the region server reports the + // transition as FAILED_OPEN + oldState = new RegionState(failedOpen, State.PENDING_OPEN); + newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpenNullServer, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer); + + // Simulate a region transitioning to failed open when the master couldn't find a plan for + // the region + oldState = new RegionState(failedOpenNullServer, State.OFFLINE); + newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + // Stop the master log("Aborting master"); @@ -1269,7 +1300,10 @@ public class TestMasterFailover { // Both pending_open (RPC sent/not yet) regions should be online assertTrue(regionStates.isRegionOnline(hriOffline)); assertTrue(regionStates.isRegionOnline(hriOnline)); - + assertTrue(regionStates.isRegionOnline(failedClose)); + assertTrue(regionStates.isRegionOnline(failedOpenNullServer)); + assertTrue(regionStates.isRegionOnline(failedOpen)); + log("Done with verification, shutting down cluster"); // Done, shutdown the cluster