From 7a36cab817ccfd01d2413bb3aca682848fed5e65 Mon Sep 17 00:00:00 2001 From: Zhihong Yu <tedyu@apache.org> Date: Wed, 7 Dec 2011 22:59:06 +0000 Subject: [PATCH] HBASE-4610 Port HBASE-3380 (Master failover can split logs of live servers) to 92/trunk git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1211695 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../hadoop/hbase/master/ServerManager.java | 20 ++++++++++++++++++- .../hbase/master/TestMasterFailover.java | 11 +++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index d7bacac82f0..ac5ce595152 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -452,6 +452,7 @@ Release 0.92.0 - Unreleased HBASE-4878 Master crash when splitting hlog may cause data loss (Chunhui Shen) HBASE-4945 NPE in HRegion.bulkLoadHFiles (Andrew P and Lars H) HBASE-4942 HMaster is unable to start of HFile V1 is used (Honghua Zhu) + HBASE-4610 Port HBASE-3380 (Master failover can split logs of live servers) to 92/trunk TESTS HBASE-4450 test for number of blocks read: to serve as baseline for expected diff --git a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index f76a56a57f0..974e04a90ba 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -522,16 +522,34 @@ public class ServerManager { public void waitForRegionServers(MonitoredTask status) throws InterruptedException { long interval = this.master.getConfiguration(). - getLong("hbase.master.wait.on.regionservers.interval", 3000); + getLong("hbase.master.wait.on.regionservers.interval", 1500); + long timeout = this.master.getConfiguration(). + getLong("hbase.master.wait.on.regionservers.timeout", 4500); + int minToStart = this.master.getConfiguration(). + getInt("hbase.master.wait.on.regionservers.mintostart", 1); + int maxToStart = this.master.getConfiguration(). + getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE); // So, number of regionservers > 0 and its been n since last check in, break, // else just stall here int count = 0; + long slept = 0; for (int oldcount = countOfRegionServers(); !this.master.isStopped();) { Thread.sleep(interval); + slept += interval; count = countOfRegionServers(); if (count == oldcount && count > 0) break; String msg; + if (count == oldcount && count >= minToStart && slept >= timeout) { + LOG.info("Finished waiting for regionserver count to settle; " + + "count=" + count + ", sleptFor=" + slept); + break; + } + if (count >= maxToStart) { + LOG.info("At least the max configured number of regionserver(s) have " + + "checked in: " + count); + break; + } if (count == 0) { msg = "Waiting on regionserver(s) to checkin"; } else { diff --git a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index 649e927c9b0..61c0469a1bb 100644 --- a/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -69,8 +69,13 @@ public class TestMasterFailover { final int NUM_MASTERS = 3; final int NUM_RS = 3; + // Create config to use for this cluster + Configuration conf = HBaseConfiguration.create(); + conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3); + conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3); + // Start the cluster - HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS); MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); @@ -298,6 +303,8 @@ public class TestMasterFailover { // Need to drop the timeout much lower conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000); conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000); + conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3); + conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 3); // Start the cluster HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); @@ -596,6 +603,8 @@ public class TestMasterFailover { // Need to drop the timeout much lower conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000); conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000); + conf.setInt("hbase.master.wait.on.regionservers.mintostart", 1); + conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 2); // Create and start the cluster HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);