HBASE-12196 SSH should retry if it fails to assign regions

This commit is contained in:
Jimmy Xiang 2014-10-07 15:07:36 -07:00
parent cab0819327
commit f2fc311b19
2 changed files with 79 additions and 0 deletions

View File

@ -273,6 +273,12 @@ public class ServerShutdownHandler extends EventHandler {
} catch (InterruptedException ie) {
LOG.error("Caught " + ie + " during round-robin assignment");
throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
} catch (IOException ioe) {
LOG.info("Caught " + ioe + " during region assignment, will retry");
// Only do HLog splitting if shouldSplitHlog and in DLR mode
serverManager.processDeadServer(serverName,
this.shouldSplitHlog && distributedLogReplay);
return;
}
if (this.shouldSplitHlog && distributedLogReplay) {

View File

@ -28,8 +28,10 @@ import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@ -79,6 +81,7 @@ import org.junit.experimental.categories.Category;
/**
* This tests AssignmentManager with a testing cluster.
*/
@SuppressWarnings("deprecation")
@Category({MasterTests.class, MediumTests.class})
public class TestAssignmentManagerOnCluster {
private final static byte[] FAMILY = Bytes.toBytes("FAMILY");
@ -830,6 +833,58 @@ public class TestAssignmentManagerOnCluster {
}
}
/**
 * Test SSH (ServerShutdownHandler) waiting for an extra region server
 * before it can assign the regions of a dead server.
 *
 * The test records the current number of region servers in
 * {@code MyLoadBalancer.countRegionServers}, so that
 * {@code MyLoadBalancer.roundRobinAssignment} refuses to produce a plan
 * while fewer servers than that are online. It then kills the server
 * hosting the table's first region, waits until at least five assignment
 * attempts have been refused, starts a replacement server, and verifies
 * that the region ends up on a server other than the killed one.
 */
@Test (timeout=300000)
public void testSSHWaitForServerToAssignRegion() throws Exception {
  TableName table = TableName.valueOf("testSSHWaitForServerToAssignRegion");
  MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
  // True while a region server has been killed but not yet replaced.
  boolean startAServer = false;
  try {
    HTableDescriptor desc = new HTableDescriptor(table);
    desc.addFamily(new HColumnDescriptor(FAMILY));
    admin.createTable(desc);

    HMaster master = cluster.getMaster();
    final ServerManager serverManager = master.getServerManager();
    // From now on the balancer fails round-robin assignment whenever
    // fewer region servers than the current count are online.
    MyLoadBalancer.countRegionServers = Integer.valueOf(
      serverManager.countOfRegionServers());
    HRegionServer rs = TEST_UTIL.getRSForFirstRegionInTable(table);
    assertNotNull("First region should be assigned", rs);

    final ServerName serverName = rs.getServerName();
    // Wait till SSH has tried to assign regions several times
    int counter = MyLoadBalancer.counter.get() + 5;
    cluster.killRegionServer(serverName);
    startAServer = true;
    cluster.waitForRegionServerToStop(serverName, -1);
    while (counter > MyLoadBalancer.counter.get()) {
      Thread.sleep(1000);
    }
    cluster.startRegionServer();
    startAServer = false;

    // Wait till the dead server is processed by SSH
    TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return serverManager.isServerDead(serverName)
          && !serverManager.areDeadServersInProgress();
      }
    });
    TEST_UTIL.waitUntilNoRegionsInTransition(300000);

    rs = TEST_UTIL.getRSForFirstRegionInTable(table);
    assertTrue("First region should be re-assigned to a different server",
      rs != null && !serverName.equals(rs.getServerName()));
  } finally {
    // Let the balancer assign normally again before cleaning up.
    MyLoadBalancer.countRegionServers = null;
    try {
      TEST_UTIL.deleteTable(table);
    } finally {
      // Restore the killed server even when table deletion fails, so the
      // shared cluster is not left one region server short.
      if (startAServer) {
        cluster.startRegionServer();
      }
    }
  }
}
/**
* Test force unassign/assign a region of a disabled table
*/
@ -1121,6 +1176,9 @@ public class TestAssignmentManagerOnCluster {
// For this region, if specified, always assign to nowhere
static volatile String controledRegion = null;
// If specified, roundRobinAssignment fails (returns null) while the number
// of region servers reported by the server manager is below this value
static volatile Integer countRegionServers = null;
// Incremented each time roundRobinAssignment fails an assignment because
// of countRegionServers
static AtomicInteger counter = new AtomicInteger(0);
@Override
public ServerName randomAssignment(HRegionInfo regionInfo,
List<ServerName> servers) {
@ -1129,6 +1187,21 @@ public class TestAssignmentManagerOnCluster {
}
return super.randomAssignment(regionInfo, servers);
}
/**
 * Round-robin assignment that can be forced to fail while the cluster has
 * too few region servers.
 *
 * When {@code countRegionServers} is set and the server manager reports
 * fewer region servers than that value, {@code counter} is incremented and
 * {@code null} is returned instead of a plan. Otherwise the call is
 * delegated to the superclass.
 *
 * @param regions regions to assign
 * @param servers candidate servers
 * @return the superclass's assignment plan, or {@code null} while waiting
 *         for more region servers to join
 */
@Override
public Map<ServerName, List<HRegionInfo>> roundRobinAssignment(
    List<HRegionInfo> regions, List<ServerName> servers) {
  // Read the volatile field once: a test thread may reset it to null at
  // any time, and a second read after the null check could then throw a
  // NullPointerException.
  final Integer requiredServers = countRegionServers;
  if (requiredServers != null && services != null) {
    int regionServers = services.getServerManager().countOfRegionServers();
    if (regionServers < requiredServers.intValue()) {
      // Let's wait till more region servers join in.
      // Before that, fail region assignments.
      counter.incrementAndGet();
      return null;
    }
  }
  return super.roundRobinAssignment(regions, servers);
}
}
public static class MyMaster extends HMaster {