From 7a36cab817ccfd01d2413bb3aca682848fed5e65 Mon Sep 17 00:00:00 2001
From: Zhihong Yu <tedyu@apache.org>
Date: Wed, 7 Dec 2011 22:59:06 +0000
Subject: [PATCH] HBASE-4610  Port HBASE-3380 (Master failover can split logs
 of live servers) to 92/trunk

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1211695 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  1 +
 .../hadoop/hbase/master/ServerManager.java    | 20 ++++++++++++++++++-
 .../hbase/master/TestMasterFailover.java      | 11 +++++++++-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index d7bacac82f0..ac5ce595152 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -452,6 +452,7 @@ Release 0.92.0 - Unreleased
    HBASE-4878  Master crash when splitting hlog may cause data loss (Chunhui Shen)
    HBASE-4945  NPE in HRegion.bulkLoadHFiles (Andrew P and Lars H)
    HBASE-4942  HMaster is unable to start of HFile V1 is used (Honghua Zhu)
+   HBASE-4610  Port HBASE-3380 (Master failover can split logs of live servers) to 92/trunk
 
   TESTS
    HBASE-4450  test for number of blocks read: to serve as baseline for expected
diff --git a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index f76a56a57f0..974e04a90ba 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -522,16 +522,34 @@ public class ServerManager {
   public void waitForRegionServers(MonitoredTask status)
   throws InterruptedException {
     long interval = this.master.getConfiguration().
-      getLong("hbase.master.wait.on.regionservers.interval", 3000);
+      getLong("hbase.master.wait.on.regionservers.interval", 1500);
+    long timeout = this.master.getConfiguration().
+    getLong("hbase.master.wait.on.regionservers.timeout", 4500);
+    int minToStart = this.master.getConfiguration().
+    getInt("hbase.master.wait.on.regionservers.mintostart", 1);
+    int maxToStart = this.master.getConfiguration().
+    getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);    
     // So, number of regionservers > 0 and its been n since last check in, break,
     // else just stall here
     int count = 0;
+    long slept = 0;
     for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
       Thread.sleep(interval);
+      slept += interval;
       count = countOfRegionServers();
       if (count == oldcount && count > 0) break;
 
       String msg;
+      if (count == oldcount && count >= minToStart && slept >= timeout) {
+        LOG.info("Finished waiting for regionserver count to settle; " +
+            "count=" + count + ", sleptFor=" + slept);
+        break;
+      }
+      if (count >= maxToStart) {
+        LOG.info("At least the max configured number of regionserver(s) have " +
+            "checked in: " + count);
+        break;
+      }
       if (count == 0) {
         msg = "Waiting on regionserver(s) to checkin";
       } else {
diff --git a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
index 649e927c9b0..61c0469a1bb 100644
--- a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
+++ b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
@@ -69,8 +69,13 @@ public class TestMasterFailover {
     final int NUM_MASTERS = 3;
     final int NUM_RS = 3;
 
+    // Create config to use for this cluster
+    Configuration conf = HBaseConfiguration.create();
+    conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3);
+    conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3);
+
     // Start the cluster
-    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
 
@@ -298,6 +303,8 @@ public class TestMasterFailover {
     // Need to drop the timeout much lower
     conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
     conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
+    conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3);
+    conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3);
 
     // Start the cluster
     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
@@ -596,6 +603,8 @@ public class TestMasterFailover {
     // Need to drop the timeout much lower
     conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
     conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
+    conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1);
+    conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 2);
 
     // Create and start the cluster
     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);