HBASE-6389 Modify the conditions to ensure that Master waits for sufficient number of Region Servers before starting region assignments

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1405111 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2012-11-02 19:20:52 +00:00
parent 624021fb4d
commit 6f6f425a31
6 changed files with 77 additions and 22 deletions

View File

@ -88,6 +88,18 @@ import com.google.protobuf.ServiceException;
*/
@InterfaceAudience.Private
public class ServerManager {
public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
"hbase.master.wait.on.regionservers.maxtostart";
public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
"hbase.master.wait.on.regionservers.mintostart";
public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
"hbase.master.wait.on.regionservers.timeout";
public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
"hbase.master.wait.on.regionservers.interval";
private static final Log LOG = LogFactory.getLog(ServerManager.class);
// Set if we are to shutdown the cluster.
@ -681,25 +693,38 @@ public class ServerManager {
* Wait for the region servers to report in.
* We will wait until one of these conditions is met:
* - the master is stopped
* - the 'hbase.master.wait.on.regionservers.timeout' is reached
* - the 'hbase.master.wait.on.regionservers.maxtostart' number of
* region servers is reached
* - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
* there have been no new region server in for
* 'hbase.master.wait.on.regionservers.interval' time
* 'hbase.master.wait.on.regionservers.interval' time AND
* the 'hbase.master.wait.on.regionservers.timeout' is reached
*
* @throws InterruptedException
*/
public void waitForRegionServers(MonitoredTask status)
throws InterruptedException {
final long interval = this.master.getConfiguration().
getLong("hbase.master.wait.on.regionservers.interval", 1500);
getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
final long timeout = this.master.getConfiguration().
getLong("hbase.master.wait.on.regionservers.timeout", 4500);
final int minToStart = this.master.getConfiguration().
getInt("hbase.master.wait.on.regionservers.mintostart", 1);
final int maxToStart = this.master.getConfiguration().
getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);
getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
int minToStart = this.master.getConfiguration().
getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
if (minToStart < 1) {
LOG.warn(String.format(
"The value of '%s' (%d) can not be less than 1, ignoring.",
WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
minToStart = 1;
}
int maxToStart = this.master.getConfiguration().
getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
if (maxToStart < minToStart) {
LOG.warn(String.format(
"The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
maxToStart = Integer.MAX_VALUE;
}
long now = System.currentTimeMillis();
final long startTime = now;
@ -710,9 +735,8 @@ public class ServerManager {
int oldCount = 0;
while (
!this.master.isStopped() &&
slept < timeout &&
count < maxToStart &&
(lastCountChange+interval > now || count < minToStart)
(lastCountChange+interval > now || timeout > slept || count < minToStart)
){
// Log some info at every interval time or if there is a change

View File

@ -69,6 +69,7 @@ import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.mapreduce.MapreduceTestingShim;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.HStore;
@ -79,6 +80,7 @@ import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.RegionSplitter;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.EmptyWatcher;
@ -730,9 +732,13 @@ public class HBaseTestingUtility {
createRootDir();
// These settings will make the server wait until this exact number of
// region servers are connected.
conf.setInt("hbase.master.wait.on.regionservers.mintostart", numSlaves);
conf.setInt("hbase.master.wait.on.regionservers.maxtostart", numSlaves);
// region servers are connected.
if (conf.getInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, -1) == -1) {
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, numSlaves);
}
if (conf.getInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, -1) == -1) {
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, numSlaves);
}
Configuration c = new Configuration(this.conf);
this.hbaseCluster =
@ -816,6 +822,9 @@ public class HBaseTestingUtility {
zooKeeperWatcher = null;
}
// unset the configuration for MIN and MAX RS to start
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, -1);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, -1);
if (this.hbaseCluster != null) {
this.hbaseCluster.shutdown();
// Wait till hbase is down before going on to shutdown zk.
@ -1542,9 +1551,28 @@ public class HBaseTestingUtility {
public void expireRegionServerSession(int index) throws Exception {
  // Pick the region server at the requested position in the mini cluster
  // and hand its ZooKeeper handle to expireSession.
  HRegionServer targetServer = getMiniHBaseCluster().getRegionServer(index);
  expireSession(targetServer.getZooKeeper(), false);
  // Lower the configured "mintostart" region server count by one (never
  // below 1) in this utility's configuration and in each master's.
  decrementMinRegionServerCount();
}
/**
 * Lowers the minimum-region-servers-to-start setting by one in every
 * configuration this utility can reach: its own configuration first, then
 * the configuration held by each master thread of the running cluster.
 */
private void decrementMinRegionServerCount() {
  // Adjust the utility's own configuration so that a newly spawned master
  // picks up the lower count (per the original note, this.hbaseCluster
  // shares this configuration as well).
  decrementMinRegionServerCount(getConfiguration());

  // Every master thread holds its own configuration, so each one has to be
  // adjusted individually.
  for (MasterThread masterThread : getHBaseCluster().getMasterThreads()) {
    Configuration masterConf = masterThread.getMaster().getConfiguration();
    decrementMinRegionServerCount(masterConf);
  }
}
/**
 * Lowers {@link ServerManager#WAIT_ON_REGIONSERVERS_MINTOSTART} by one in
 * the given configuration, never going below 1. A configuration whose value
 * is absent (or is the -1 sentinel) is left untouched.
 *
 * @param conf the configuration to adjust in place
 */
private void decrementMinRegionServerCount(Configuration conf) {
  final String minToStartKey = ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART;
  final int configuredMin = conf.getInt(minToStartKey, -1);
  if (configuredMin == -1) {
    // Nothing configured (or explicitly unset with -1): nothing to lower.
    return;
  }
  final int loweredMin = Math.max(configuredMin - 1, 1);
  conf.setInt(minToStartKey, loweredMin);
}
public void expireSession(ZooKeeperWatcher nodeZK) throws Exception {
expireSession(nodeZK, false);

View File

@ -444,7 +444,7 @@ public class TestZooKeeper {
* Tests whether the logs are split when master recovers from an expired zookeeper session and an
* RS goes down.
*/
@Test(timeout = 180000)
@Test(timeout = 240000)
public void testLogSplittingAfterMasterRecoveryDueToZKExpiry() throws IOException,
KeeperException, InterruptedException {
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

View File

@ -157,8 +157,8 @@ public class TestMasterFailover {
// Need to drop the timeout much lower
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3);
conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 3);
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
@ -460,8 +460,8 @@ public class TestMasterFailover {
// Need to drop the timeout much lower
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1);
conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 2);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
log("Cluster started");

View File

@ -247,8 +247,8 @@ public class TestMasterNoCluster {
public void testCatalogDeploys()
throws IOException, KeeperException, InterruptedException, DeserializationException, ServiceException {
final Configuration conf = TESTUTIL.getConfiguration();
conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1);
conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 1);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1);
final long now = System.currentTimeMillis();
// Name for our single mocked up regionserver.

View File

@ -69,8 +69,11 @@ public class TestRSKilledWhenMasterInitializing {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
// Set it so that this test runs with my custom master
// NOTE(review): the master implementation is registered twice below, once
// through TESTUTIL.getConfiguration() directly and once through the local
// 'conf'. Presumably both refer to the same Configuration, making the first
// call redundant -- confirm and drop one.
TESTUTIL.getConfiguration().setClass(HConstants.MASTER_IMPL,
TestingMaster.class, HMaster.class);
Configuration conf = TESTUTIL.getConfiguration();
conf.setClass(HConstants.MASTER_IMPL, TestingMaster.class, HMaster.class);
// Set the minimum (3) and maximum (4) number of region servers the master
// waits for before it proceeds with startup.
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 4);
// Start up the cluster.
TESTUTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
}