From 453de3a66cc8eb91b0c6ed666f7aef0b54358677 Mon Sep 17 00:00:00 2001
From: Zhihong Yu
Date: Sat, 31 Dec 2011 15:43:19 +0000
Subject: [PATCH] HBASE-4397 -ROOT-, .META. tables stay offline for too long
 in recovery phase after all RSs are shutdown at the same time (Ming Ma)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1226110 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                             |  2 ++
 .../hbase/master/AssignmentManager.java | 20 ++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b1b48b20613..5fe100a7d9d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -466,6 +466,8 @@ Release 0.92.0 - Unreleased
    HBASE-5099  ZK event thread waiting for root region assignment may block server
                shutdown handler for the region server the root region was on (Jimmy)
    HBASE-5100  Rollback of split could cause closed region to be opened again (Chunhui)
+   HBASE-4397  -ROOT-, .META. tables stay offline for too long in recovery phase after all RSs
+               are shutdown at the same time (Ming Ma)
 
  TESTS
    HBASE-4450  test for number of blocks read: to serve as baseline for expected

diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index cdfefb59ceb..35c4e9f8006 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -191,7 +191,7 @@ public class AssignmentManager extends ZooKeeperListener {
     Configuration conf = master.getConfiguration();
     this.timeoutMonitor = new TimeoutMonitor(
       conf.getInt("hbase.master.assignment.timeoutmonitor.period", 10000),
-      master,
+      master, serverManager,
       conf.getInt("hbase.master.assignment.timeoutmonitor.timeout", 1800000));
     Threads.setDaemonThreadRunning(timeoutMonitor.getThread(),
       master.getServerName() + ".timeoutMonitor");
@@ -1498,6 +1498,7 @@ public class AssignmentManager extends ZooKeeperListener {
       state.update(RegionState.State.OFFLINE);
       // Force a new plan and reassign. Will return null if no servers.
       if (getRegionPlan(state, plan.getDestination(), true) == null) {
+        this.timeoutMonitor.setAllRegionServersOffline(true);
         LOG.warn("Unable to find a viable location to assign region " +
           state.getRegion().getRegionNameAsString());
         return;
@@ -2512,6 +2513,8 @@ public class AssignmentManager extends ZooKeeperListener {
   public class TimeoutMonitor extends Chore {
     private final int timeout;
     private boolean bulkAssign = false;
+    private boolean allRegionServersOffline = false;
+    private ServerManager serverManager;
 
     /**
      * Creates a periodic monitor to check for time outs on region transition
@@ -2523,9 +2526,11 @@ public class AssignmentManager extends ZooKeeperListener {
      * @param timeout
      */
     public TimeoutMonitor(final int period, final Stoppable stopper,
+        ServerManager serverManager,
         final int timeout) {
       super("AssignmentTimeoutMonitor", period, stopper);
       this.timeout = timeout;
+      this.serverManager = serverManager;
     }
 
     /**
@@ -2539,10 +2544,18 @@ public class AssignmentManager extends ZooKeeperListener {
       return result;
     }
 
+    private synchronized void setAllRegionServersOffline(
+        boolean allRegionServersOffline) {
+      this.allRegionServersOffline = allRegionServersOffline;
+    }
+
     @Override
     protected void chore() {
       // If bulkAssign in progress, suspend checks
       if (this.bulkAssign) return;
+      boolean allRSsOffline = this.serverManager.getOnlineServersList().
+        isEmpty();
+
       synchronized (regionsInTransition) {
         // Iterate all regions in transition checking for time outs
         long now = System.currentTimeMillis();
@@ -2550,9 +2563,14 @@ public class AssignmentManager extends ZooKeeperListener {
           if (regionState.getStamp() + timeout <= now) {
             //decide on action upon timeout
             actOnTimeOut(regionState);
+          } else if (this.allRegionServersOffline && !allRSsOffline) {
+            // if some RSs just came back online, we can start
+            // the assignment right away
+            actOnTimeOut(regionState);
           }
         }
       }
+      setAllRegionServersOffline(allRSsOffline);
     }
 
     private void actOnTimeOut(RegionState regionState) {
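
In short, the patch makes the assignment TimeoutMonitor edge-triggered on recovery: each chore() run records whether the online-server list was empty, and when the previous run saw an empty list but the current one does not, every region still in transition is acted on immediately instead of waiting out the full hbase.master.assignment.timeoutmonitor.timeout (1800000 ms, i.e. 30 minutes, by default). The setAllRegionServersOffline(true) call in the assignment path arms the flag whenever no region plan can be found because no servers are live. The standalone sketch below restates the same remember-then-compare pattern; it is not HBase code, and the class and member names in it (RecoveryAwareMonitor, onlineServers, inTransition) are illustrative assumptions.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Supplier;

/**
 * Minimal sketch (not HBase code) of the pattern this patch introduces:
 * a periodic monitor that normally acts on an entry only after a fixed
 * timeout, but acts immediately when the cluster transitions from
 * "no servers online" back to "some server online".
 */
public class RecoveryAwareMonitor {
  private final long timeoutMillis;
  private final Supplier<List<String>> onlineServers; // stands in for ServerManager
  private final Map<String, Long> inTransition = new ConcurrentHashMap<>();
  private boolean allServersOffline = false; // remembered from the previous run

  public RecoveryAwareMonitor(long timeoutMillis,
      Supplier<List<String>> onlineServers) {
    this.timeoutMillis = timeoutMillis;
    this.onlineServers = onlineServers;
  }

  /** Called periodically, like Chore.chore() in the patch. */
  public synchronized void chore() {
    boolean allOffline = onlineServers.get().isEmpty();
    long now = System.currentTimeMillis();
    for (Map.Entry<String, Long> e : inTransition.entrySet()) {
      if (e.getValue() + timeoutMillis <= now) {
        actOnTimeout(e.getKey());   // normal path: entry waited long enough
      } else if (allServersOffline && !allOffline) {
        actOnTimeout(e.getKey());   // recovery path: servers just came back
      }
    }
    allServersOffline = allOffline; // remember cluster state for the next run
  }

  private void actOnTimeout(String region) {
    System.out.println("reassigning " + region);
  }

  // Example run: the second chore() fires right away because the server
  // list went from empty to non-empty between invocations.
  public static void main(String[] args) {
    List<String> servers = new ArrayList<>();
    RecoveryAwareMonitor monitor =
        new RecoveryAwareMonitor(30 * 60 * 1000L, () -> servers);
    monitor.inTransition.put("-ROOT-", System.currentTimeMillis());
    monitor.chore();    // all servers offline: flag armed, nothing reassigned
    servers.add("rs1"); // a region server rejoins
    monitor.chore();    // prints "reassigning -ROOT-" without the 30 min wait
  }
}

Note the deliberate one-cycle memory: the flag is only updated at the end of chore(), so an offline-to-online transition is always observed by the very next run, which is what lets -ROOT- and .META. be reassigned as soon as any region server reports in.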