diff --git a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index c5691ed0ba2..9de1784ccd8 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -516,48 +516,73 @@ public class ServerManager { } /** - * Waits for the regionservers to report in. + * Wait for the region servers to report in. + * We will wait until one of these conditions is met: + * - the master is stopped + * - the 'hbase.master.wait.on.regionservers.timeout' is reached + * - the 'hbase.master.wait.on.regionservers.maxtostart' number of + * region servers is reached + * - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND + * no new region server has checked in for + * 'hbase.master.wait.on.regionservers.interval' time + * * @throws InterruptedException */ public void waitForRegionServers(MonitoredTask status) throws InterruptedException { - long interval = this.master.getConfiguration(). + final long interval = this.master.getConfiguration(). getLong("hbase.master.wait.on.regionservers.interval", 1500); - long timeout = this.master.getConfiguration(). + final long timeout = this.master.getConfiguration(). getLong("hbase.master.wait.on.regionservers.timeout", 4500); - int minToStart = this.master.getConfiguration(). + final int minToStart = this.master.getConfiguration(). getInt("hbase.master.wait.on.regionservers.mintostart", 1); - int maxToStart = this.master.getConfiguration(). - getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE); - // So, number of regionservers > 0 and its been n since last check in, break, - // else just stall here - int count = 0; - long slept = 0; - for (int oldcount = countOfRegionServers(); !this.master.isStopped();) { - Thread.sleep(interval); - slept += interval; - count = countOfRegionServers(); + final int maxToStart = this.master.getConfiguration(). 
+ getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE); - String msg; - if (count == oldcount && count >= minToStart && slept >= timeout) { - LOG.info("Finished waiting for regionserver count to settle; " + - "count=" + count + ", sleptFor=" + slept); - break; + long now = System.currentTimeMillis(); + final long startTime = now; + long slept = 0; + long lastLogTime = 0; + long lastCountChange = startTime; + int count = countOfRegionServers(); + int oldCount = 0; + while ( + !this.master.isStopped() && + slept < timeout && + count < maxToStart && + !(lastCountChange+interval > now && count >= minToStart) + ){ + + // Log some info at every interval time or if there is a change + if (oldCount != count || lastLogTime+interval < now){ + lastLogTime = now; + String msg = + "Waiting for region servers count to settle; currently"+ + " checked in " + count + ", slept for " + slept + " ms," + + " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+ + ", timeout of "+timeout+" ms, interval of "+interval+" ms."; + LOG.info(msg); + status.setStatus(msg); } - if (count >= maxToStart) { - LOG.info("At least the max configured number of regionserver(s) have " + - "checked in: " + count); - break; + + // We sleep for some time + final long sleepTime = 50; + Thread.sleep(sleepTime); + now = System.currentTimeMillis(); + slept = now - startTime; + + oldCount = count; + count = countOfRegionServers(); + if (count != oldCount) { + lastCountChange = now; } - if (count == 0) { - msg = "Waiting on regionserver(s) to checkin"; - } else { - msg = "Waiting on regionserver(s) count to settle; currently=" + count; - } - LOG.info(msg); - status.setStatus(msg); - oldcount = count; } + + LOG.info("Finished waiting for region servers count to settle;" + + " checked in " + count + ", slept for " + slept + " ms," + + " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+ + " master is "+ (this.master.isStopped() ? 
"stopped.": "running.") + ); } /** diff --git a/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index 6fca020631a..06b20c627f7 100644 --- a/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -561,6 +561,12 @@ public class HBaseTestingUtility { throws IOException, InterruptedException { // Now do the mini hbase cluster. Set the hbase.rootdir in config. createRootDir(); + + // These settings will make the master wait until this exact number of + // region servers are connected. + conf.setInt("hbase.master.wait.on.regionservers.mintostart", numSlaves); + conf.setInt("hbase.master.wait.on.regionservers.maxtostart", numSlaves); + Configuration c = new Configuration(this.conf); this.hbaseCluster = new MiniHBaseCluster(c, numMasters, numSlaves); // Don't leave here till we've done a successful scan of the .META. diff --git a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index c86dc7f433a..2eb68cf41ca 100644 --- a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -71,8 +71,6 @@ public class TestMasterFailover { // Create config to use for this cluster Configuration conf = HBaseConfiguration.create(); - conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3); - conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3); // Start the cluster HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); diff --git a/src/test/resources/hbase-site.xml b/src/test/resources/hbase-site.xml index b5e7168031a..2a9ab29f359 100644 --- a/src/test/resources/hbase-site.xml +++ b/src/test/resources/hbase-site.xml @@ -22,12 +22,6 @@ */ --> - - hbase.master.wait.on.regionservers.interval - 100 - How long we wait on 
regionservers to check in - - hbase.regionserver.msginterval 1000