From 6f6f425a31ca4464f4e252d615efc0e982c01c0d Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Fri, 2 Nov 2012 19:20:52 +0000 Subject: [PATCH] HBASE-6389 Modify the conditions to ensure that Master waits for sufficient number of Region Servers before starting region assignments git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1405111 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop/hbase/master/ServerManager.java | 44 ++++++++++++++----- .../hadoop/hbase/HBaseTestingUtility.java | 34 ++++++++++++-- .../apache/hadoop/hbase/TestZooKeeper.java | 2 +- .../hbase/master/TestMasterFailover.java | 8 ++-- .../hbase/master/TestMasterNoCluster.java | 4 +- .../TestRSKilledWhenMasterInitializing.java | 7 ++- 6 files changed, 77 insertions(+), 22 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index d5f982ba1d7..3a5845fce93 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -88,6 +88,18 @@ import com.google.protobuf.ServiceException; */ @InterfaceAudience.Private public class ServerManager { + public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART = + "hbase.master.wait.on.regionservers.maxtostart"; + + public static final String WAIT_ON_REGIONSERVERS_MINTOSTART = + "hbase.master.wait.on.regionservers.mintostart"; + + public static final String WAIT_ON_REGIONSERVERS_TIMEOUT = + "hbase.master.wait.on.regionservers.timeout"; + + public static final String WAIT_ON_REGIONSERVERS_INTERVAL = + "hbase.master.wait.on.regionservers.interval"; + private static final Log LOG = LogFactory.getLog(ServerManager.class); // Set if we are to shutdown the cluster. @@ -681,25 +693,38 @@ public class ServerManager { * Wait for the region servers to report in. 
* We will wait until one of this condition is met: * - the master is stopped - * - the 'hbase.master.wait.on.regionservers.timeout' is reached * - the 'hbase.master.wait.on.regionservers.maxtostart' number of * region servers is reached * - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND * there have been no new region server in for - * 'hbase.master.wait.on.regionservers.interval' time + * 'hbase.master.wait.on.regionservers.interval' time AND + * the 'hbase.master.wait.on.regionservers.timeout' is reached * * @throws InterruptedException */ public void waitForRegionServers(MonitoredTask status) throws InterruptedException { final long interval = this.master.getConfiguration(). - getLong("hbase.master.wait.on.regionservers.interval", 1500); + getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500); final long timeout = this.master.getConfiguration(). - getLong("hbase.master.wait.on.regionservers.timeout", 4500); - final int minToStart = this.master.getConfiguration(). - getInt("hbase.master.wait.on.regionservers.mintostart", 1); - final int maxToStart = this.master.getConfiguration(). - getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE); + getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500); + int minToStart = this.master.getConfiguration(). + getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + if (minToStart < 1) { + LOG.warn(String.format( + "The value of '%s' (%d) can not be less than 1, ignoring.", + WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart)); + minToStart = 1; + } + int maxToStart = this.master.getConfiguration(). 
+ getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE); + if (maxToStart < minToStart) { + LOG.warn(String.format( + "The value of '%s' (%d) is set less than '%s' (%d), ignoring.", + WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart, + WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart)); + maxToStart = Integer.MAX_VALUE; + } long now = System.currentTimeMillis(); final long startTime = now; @@ -710,9 +735,8 @@ public class ServerManager { int oldCount = 0; while ( !this.master.isStopped() && - slept < timeout && count < maxToStart && - (lastCountChange+interval > now || count < minToStart) + (lastCountChange+interval > now || timeout > slept || count < minToStart) ){ // Log some info at every interval time or if there is a change diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index ed2a453a09b..a7f4793f17b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -69,6 +69,7 @@ import org.apache.hadoop.hbase.io.hfile.Compression; import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm; import org.apache.hadoop.hbase.mapreduce.MapreduceTestingShim; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.HStore; @@ -79,6 +80,7 @@ import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.RegionSplitter; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.zookeeper.EmptyWatcher; @@ -730,9 
+732,13 @@ public class HBaseTestingUtility { createRootDir(); // These settings will make the server waits until this exact number of - // regions servers are connected. - conf.setInt("hbase.master.wait.on.regionservers.mintostart", numSlaves); - conf.setInt("hbase.master.wait.on.regionservers.maxtostart", numSlaves); + // regions servers are connected. + if (conf.getInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, -1) == -1) { + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, numSlaves); + } + if (conf.getInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, -1) == -1) { + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, numSlaves); + } Configuration c = new Configuration(this.conf); this.hbaseCluster = @@ -816,6 +822,9 @@ public class HBaseTestingUtility { zooKeeperWatcher = null; } + // unset the configuration for MIN and MAX RS to start + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, -1); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, -1); if (this.hbaseCluster != null) { this.hbaseCluster.shutdown(); // Wait till hbase is down before going on to shutdown zk. 
@@ -1542,9 +1551,28 @@ public class HBaseTestingUtility { public void expireRegionServerSession(int index) throws Exception { HRegionServer rs = getMiniHBaseCluster().getRegionServer(index); expireSession(rs.getZooKeeper(), false); + decrementMinRegionServerCount(); } + private void decrementMinRegionServerCount() { + // decrement the count for this.conf, for newly spawned master + // this.hbaseCluster shares this configuration too + decrementMinRegionServerCount(getConfiguration()); + // each master thread keeps a copy of configuration + for (MasterThread master : getHBaseCluster().getMasterThreads()) { + decrementMinRegionServerCount(master.getMaster().getConfiguration()); + } + } + + private void decrementMinRegionServerCount(Configuration conf) { + int currentCount = conf.getInt( + ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, -1); + if (currentCount != -1) { + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, + Math.max(currentCount - 1, 1)); + } + } public void expireSession(ZooKeeperWatcher nodeZK) throws Exception { expireSession(nodeZK, false); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java index 651d2b0f8e1..dbe3c02296f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java @@ -444,7 +444,7 @@ public class TestZooKeeper { * Tests whether the logs are split when master recovers from a expired zookeeper session and an * RS goes down. 
*/ - @Test(timeout = 180000) + @Test(timeout = 240000) public void testLogSplittingAfterMasterRecoveryDueToZKExpiry() throws IOException, KeeperException, InterruptedException { MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index 75900c9623f..f7faeab5219 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -157,8 +157,8 @@ public class TestMasterFailover { // Need to drop the timeout much lower conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000); conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000); - conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3); - conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 3); // Start the cluster HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); @@ -460,8 +460,8 @@ public class TestMasterFailover { // Need to drop the timeout much lower conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000); conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000); - conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1); - conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 2); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2); TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS); MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); log("Cluster started"); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java index ae628f0e20d..a458242894b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java @@ -247,8 +247,8 @@ public class TestMasterNoCluster { public void testCatalogDeploys() throws IOException, KeeperException, InterruptedException, DeserializationException, ServiceException { final Configuration conf = TESTUTIL.getConfiguration(); - conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1); - conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 1); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1); final long now = System.currentTimeMillis(); // Name for our single mocked up regionserver. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java index 34fb3fe8148..3eb278d2e88 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java @@ -69,8 +69,11 @@ public class TestRSKilledWhenMasterInitializing { @BeforeClass public static void setUpBeforeClass() throws Exception { // Set it so that this test runs with my custom master - TESTUTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, - TestingMaster.class, HMaster.class); + Configuration conf = TESTUTIL.getConfiguration(); + conf.setClass(HConstants.MASTER_IMPL, TestingMaster.class, HMaster.class); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3); + conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 4); + // Start up the cluster. 
TESTUTIL.startMiniCluster(NUM_MASTERS, NUM_RS); }