diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java index 1691c9df9da..c44396896c7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java @@ -273,6 +273,12 @@ public class ServerShutdownHandler extends EventHandler { } catch (InterruptedException ie) { LOG.error("Caught " + ie + " during round-robin assignment"); throw (InterruptedIOException)new InterruptedIOException().initCause(ie); + } catch (IOException ioe) { + LOG.info("Caught " + ioe + " during region assignment, will retry"); + // Only do HLog splitting if shouldSplitHlog and in DLR mode + serverManager.processDeadServer(serverName, + this.shouldSplitHlog && distributedLogReplay); + return; } if (this.shouldSplitHlog && distributedLogReplay) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index 4d2426889f7..27c7073cd2e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -28,8 +28,10 @@ import static org.junit.Assert.fail; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -79,6 +81,7 @@ import org.junit.experimental.categories.Category; /** * This tests AssignmentManager with a testing cluster. */ +@SuppressWarnings("deprecation") @Category({MasterTests.class, MediumTests.class}) public class TestAssignmentManagerOnCluster { private final static byte[] FAMILY = Bytes.toBytes("FAMILY"); @@ -830,6 +833,58 @@ public class TestAssignmentManagerOnCluster { } } + /** + * Test SSH waiting for extra region server for assignment + */ + @Test (timeout=300000) + public void testSSHWaitForServerToAssignRegion() throws Exception { + TableName table = TableName.valueOf("testSSHWaitForServerToAssignRegion"); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + boolean startAServer = false; + try { + HTableDescriptor desc = new HTableDescriptor(table); + desc.addFamily(new HColumnDescriptor(FAMILY)); + admin.createTable(desc); + + HMaster master = cluster.getMaster(); + final ServerManager serverManager = master.getServerManager(); + MyLoadBalancer.countRegionServers = Integer.valueOf( + serverManager.countOfRegionServers()); + HRegionServer rs = TEST_UTIL.getRSForFirstRegionInTable(table); + assertNotNull("First region should be assigned", rs); + final ServerName serverName = rs.getServerName(); + // Wait till SSH tried to assign regions a several times + int counter = MyLoadBalancer.counter.get() + 5; + cluster.killRegionServer(serverName); + startAServer = true; + cluster.waitForRegionServerToStop(serverName, -1); + while (counter > MyLoadBalancer.counter.get()) { + Thread.sleep(1000); + } + cluster.startRegionServer(); + startAServer = false; + // Wait till the dead server is processed by SSH + TEST_UTIL.waitFor(120000, 1000, new Waiter.Predicate() { + @Override + public boolean evaluate() throws Exception { + return serverManager.isServerDead(serverName) + && !serverManager.areDeadServersInProgress(); + } + }); + TEST_UTIL.waitUntilNoRegionsInTransition(300000); + + rs = TEST_UTIL.getRSForFirstRegionInTable(table); + assertTrue("First region should be re-assigned to a different server", + rs != null && !serverName.equals(rs.getServerName())); + } finally { + MyLoadBalancer.countRegionServers = null; + TEST_UTIL.deleteTable(table); + if (startAServer) { + cluster.startRegionServer(); + } + } + } + /** * Test force unassign/assign a region of a disabled table */ @@ -1121,6 +1176,9 @@ public class TestAssignmentManagerOnCluster { // For this region, if specified, always assign to nowhere static volatile String controledRegion = null; + static volatile Integer countRegionServers = null; + static AtomicInteger counter = new AtomicInteger(0); + @Override public ServerName randomAssignment(HRegionInfo regionInfo, List servers) { @@ -1129,6 +1187,21 @@ public class TestAssignmentManagerOnCluster { } return super.randomAssignment(regionInfo, servers); } + + @Override + public Map> roundRobinAssignment( + List regions, List servers) { + if (countRegionServers != null && services != null) { + int regionServers = services.getServerManager().countOfRegionServers(); + if (regionServers < countRegionServers.intValue()) { + // Let's wait till more region servers join in. + // Before that, fail region assignments. + counter.incrementAndGet(); + return null; + } + } + return super.roundRobinAssignment(regions, servers); + } } public static class MyMaster extends HMaster {