HBASE-13330. Region left unassigned due to AM & SSH each thinking the assignment would be done by the other

This commit is contained in:
Devaraj Das 2015-10-16 13:49:18 -07:00
parent 6dc33cad75
commit 2c301d535b
2 changed files with 45 additions and 0 deletions

View File

@@ -829,6 +829,7 @@ public class AssignmentManager extends ZooKeeperListener {
case RS_ZK_REGION_CLOSED:
case RS_ZK_REGION_FAILED_OPEN:
// Region is closed, insert into RIT and handle it
regionStates.setLastRegionServerOfRegion(sn, encodedName);
regionStates.updateRegionState(regionInfo, State.CLOSED, sn);
if (!replicasToClose.contains(regionInfo)) {
invokeAssign(regionInfo);

View File

@@ -968,6 +968,50 @@ public class TestAssignmentManager {
}
}
/**
 * Tests the following scenario:
 * <ul>
 * <li>a regionserver (SERVERNAME_DEAD) owns a region (hence the meta would have
 * SERVERNAME_DEAD as the host for the region),</li>
 * <li>SERVERNAME_DEAD goes down,</li>
 * <li>one of the affected regions is assigned to a live regionserver (SERVERNAME_LIVE) but
 * that assignment somehow fails. The region ends up in the FAILED_OPEN state on ZK,</li>
 * <li>[Issue that the patch on HBASE-13330 fixes] when the master is restarted, the SSH for
 * SERVERNAME_DEAD rightly thinks that the region is now in transition on SERVERNAME_LIVE.
 * But the owner of the region is still SERVERNAME_DEAD in the AM's states. The AM thinks
 * that the SSH for SERVERNAME_DEAD will assign the region, so the region remains
 * unassigned forever.</li>
 * </ul>
 * The test passes when, after the AM joins the cluster, the region is in PENDING_OPEN.
 */
@Test(timeout = 60000)
public void testAssignmentOfRegionInSSHAndInFailedOpenState() throws IOException,
    KeeperException, ServiceException, CoordinatedStateException, InterruptedException {
  AssignmentManagerWithExtrasForTesting am = setUpMockedAssignmentManager(
      this.server, this.serverManager);
  try {
    // Leave the region's znode in FAILED_OPEN, attributed to SERVERNAME_LIVE:
    // create it OFFLINE first, then transition it using the version just read.
    ZKAssign.createNodeOffline(this.watcher, REGIONINFO, SERVERNAME_LIVE);
    int v = ZKAssign.getVersion(this.watcher, REGIONINFO);
    ZKAssign.transitionNode(this.watcher, REGIONINFO, SERVERNAME_LIVE,
        EventType.M_ZK_REGION_OFFLINE, EventType.RS_ZK_REGION_FAILED_OPEN, v);

    // SERVERNAME_LIVE is online and reachable; SERVERNAME_DEAD is offline and is
    // the only entry in the dead-server list.
    Mockito.when(this.serverManager.isServerOnline(SERVERNAME_LIVE)).thenReturn(true);
    Mockito.when(this.serverManager.isServerReachable(SERVERNAME_LIVE)).thenReturn(true);
    Mockito.when(this.serverManager.isServerOnline(SERVERNAME_DEAD)).thenReturn(false);
    DeadServer deadServers = new DeadServer();
    deadServers.add(SERVERNAME_DEAD);
    Mockito.when(this.serverManager.getDeadServers()).thenReturn(deadServers);

    // The only online server the mocked ServerManager reports is SERVERNAME_LIVE.
    final Map<ServerName, ServerLoad> onlineServers = new HashMap<ServerName, ServerLoad>();
    onlineServers.put(SERVERNAME_LIVE, ServerLoad.EMPTY_SERVERLOAD);
    Mockito.when(this.serverManager.getOnlineServersList()).thenReturn(
        new ArrayList<ServerName>(onlineServers.keySet()));
    Mockito.when(this.serverManager.getOnlineServers()).thenReturn(onlineServers);

    am.gate.set(false);
    // join the cluster - that's when the AM is really kicking in after a restart
    am.joinCluster();
    // NOTE(review): presumably the testing AM flips the gate once it has acted on the
    // region - confirm against AssignmentManagerWithExtrasForTesting. The @Test timeout
    // bounds this wait if the gate is never set.
    while (!am.gate.get()) {
      Thread.sleep(10);
    }

    RegionState regionState = am.getRegionStates().getRegionState(REGIONINFO);
    assertTrue("Expected region to be PENDING_OPEN but region state was " + regionState,
        regionState.getState() == RegionState.State.PENDING_OPEN);
  } finally {
    // Shut the AM down even when setup, joinCluster() or the assertion fails, so a
    // failing run does not leave it running.
    am.shutdown();
  }
}
/**
* Test the scenario when the master is in failover and trying to process a
* region which is in Opening state on a dead RS. Master will force offline the