From 75bce302e28e7349e7d4842476320513c065e429 Mon Sep 17 00:00:00 2001
From: Zhihong Yu
Date: Sat, 10 Sep 2011 04:34:54 +0000
Subject: [PATCH] HBASE-4340 Hbase can't balance if ServerShutdownHandler encountered exception (Jinchao Gao)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1167452 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                        |   3 +-
 .../master/handler/ServerShutdownHandler.java      | 141 +++++++++---------
 2 files changed, 75 insertions(+), 69 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6badd0566c4..7861156613c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -258,7 +258,8 @@ Release 0.91.0 - Unreleased
    HBASE-4301  META migration from 0.90 to trunk fails (Subbu Iyer)
    HBASE-4331  Bypassing default actions in prePut fails sometimes with
                HTable client (Lars Hofhansl via garyh)
-
+   HBASE-4340  Hbase can't balance if ServerShutdownHandler encountered
+               exception (Jinchao Gao)
 
  IMPROVEMENTS
    HBASE-3290  Max Compaction Size (Nicolas Spiegelberg via Stack)

diff --git a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
index 34067afeec4..742aea4af9f 100644
--- a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
+++ b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
@@ -166,83 +166,88 @@ public class ServerShutdownHandler extends EventHandler {
     final ServerName serverName = this.serverName;
 
     LOG.info("Splitting logs for " + serverName);
-    this.services.getMasterFileSystem().splitLog(serverName);
+    try {
+      this.services.getMasterFileSystem().splitLog(serverName);
 
-    // Clean out anything in regions in transition. Being conservative and
-    // doing after log splitting. Could do some states before -- OPENING?
-    // OFFLINE? -- and then others after like CLOSING that depend on log
-    // splitting.
-    List<RegionState> regionsInTransition =
-      this.services.getAssignmentManager().processServerShutdown(this.serverName);
+      // Clean out anything in regions in transition. Being conservative and
+      // doing after log splitting. Could do some states before -- OPENING?
+      // OFFLINE? -- and then others after like CLOSING that depend on log
+      // splitting.
+      List<RegionState> regionsInTransition =
+        this.services.getAssignmentManager()
+          .processServerShutdown(this.serverName);
 
-    // Assign root and meta if we were carrying them.
-    if (isCarryingRoot()) { // -ROOT-
-      LOG.info("Server " + serverName + " was carrying ROOT. Trying to assign.");
-      verifyAndAssignRootWithRetries();
-    }
-
-    // Carrying meta?
-    if (isCarryingMeta()) {
-      LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
-      this.services.getAssignmentManager().assignMeta();
-    }
-
-    // Wait on meta to come online; we need it to progress.
-    // TODO: Best way to hold strictly here? We should build this retry logic
-    // into the MetaReader operations themselves.
-    // TODO: Is the reading of .META. necessary when the Master has state of
-    // cluster in its head? It should be possible to do without reading .META.
-    // in all but one case. On split, the RS updates the .META.
-    // table and THEN informs the master of the split via zk nodes in
-    // 'unassigned' dir. Currently the RS puts ephemeral nodes into zk so if
-    // the regionserver dies, these nodes do not stick around and this server
-    // shutdown processing does fixup (see the fixupDaughters method below).
-    // If we wanted to skip the .META. scan, we'd have to change at least the
-    // final SPLIT message to be permanent in zk so in here we'd know a SPLIT
-    // completed (zk is updated after edits to .META. have gone in). See
-    // {@link SplitTransaction}. We'd also have to be figure another way for
-    // doing the below .META. daughters fixup.
-    NavigableMap<HRegionInfo, Result> hris = null;
-    while (!this.server.isStopped()) {
-      try {
-        this.server.getCatalogTracker().waitForMeta();
-        hris = MetaReader.getServerUserRegions(this.server.getCatalogTracker(),
-          this.serverName);
-        break;
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
-        throw new IOException("Interrupted", e);
-      } catch (IOException ioe) {
-        LOG.info("Received exception accessing META during server shutdown of " +
-          serverName + ", retrying META read", ioe);
+      // Assign root and meta if we were carrying them.
+      if (isCarryingRoot()) { // -ROOT-
+        LOG.info("Server " + serverName +
+            " was carrying ROOT. Trying to assign.");
+        verifyAndAssignRootWithRetries();
       }
-    }
 
-    // Skip regions that were in transition unless CLOSING or PENDING_CLOSE
-    for (RegionState rit : regionsInTransition) {
-      if (!rit.isClosing() && !rit.isPendingClose()) {
-        LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
-          " from list of regions to assign because in RIT");
-        hris.remove(rit.getRegion());
+      // Carrying meta?
+      if (isCarryingMeta()) {
+        LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
+        this.services.getAssignmentManager().assignMeta();
       }
-    }
 
-    LOG.info("Reassigning " + (hris == null? 0: hris.size()) +
-      " region(s) that " + serverName +
-      " was carrying (skipping " + regionsInTransition.size() +
-      " regions(s) that are already in transition)");
-
-    // Iterate regions that were on this server and assign them
-    if (hris != null) {
-      for (Map.Entry<HRegionInfo, Result> e: hris.entrySet()) {
-        if (processDeadRegion(e.getKey(), e.getValue(),
-            this.services.getAssignmentManager(),
-            this.server.getCatalogTracker())) {
-          this.services.getAssignmentManager().assign(e.getKey(), true);
+      // Wait on meta to come online; we need it to progress.
+      // TODO: Best way to hold strictly here? We should build this retry logic
+      // into the MetaReader operations themselves.
+      // TODO: Is the reading of .META. necessary when the Master has state of
+      // cluster in its head? It should be possible to do without reading .META.
+      // in all but one case. On split, the RS updates the .META.
+      // table and THEN informs the master of the split via zk nodes in
+      // 'unassigned' dir. Currently the RS puts ephemeral nodes into zk so if
+      // the regionserver dies, these nodes do not stick around and this server
+      // shutdown processing does fixup (see the fixupDaughters method below).
+      // If we wanted to skip the .META. scan, we'd have to change at least the
+      // final SPLIT message to be permanent in zk so in here we'd know a SPLIT
+      // completed (zk is updated after edits to .META. have gone in). See
+      // {@link SplitTransaction}. We'd also have to be figure another way for
+      // doing the below .META. daughters fixup.
+      NavigableMap<HRegionInfo, Result> hris = null;
+      while (!this.server.isStopped()) {
+        try {
+          this.server.getCatalogTracker().waitForMeta();
+          hris = MetaReader.getServerUserRegions(this.server.getCatalogTracker(),
+            this.serverName);
+          break;
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          throw new IOException("Interrupted", e);
+        } catch (IOException ioe) {
+          LOG.info("Received exception accessing META during server shutdown of " +
+            serverName + ", retrying META read", ioe);
         }
       }
+
+      // Skip regions that were in transition unless CLOSING or PENDING_CLOSE
+      for (RegionState rit : regionsInTransition) {
+        if (!rit.isClosing() && !rit.isPendingClose()) {
+          LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
+            " from list of regions to assign because in RIT");
+          hris.remove(rit.getRegion());
+        }
+      }
+
+      LOG.info("Reassigning " + (hris == null? 0: hris.size()) +
+        " region(s) that " + serverName +
+        " was carrying (skipping " + regionsInTransition.size() +
+        " regions(s) that are already in transition)");
+
+      // Iterate regions that were on this server and assign them
+      if (hris != null) {
+        for (Map.Entry<HRegionInfo, Result> e: hris.entrySet()) {
+          if (processDeadRegion(e.getKey(), e.getValue(),
+              this.services.getAssignmentManager(),
+              this.server.getCatalogTracker())) {
+            this.services.getAssignmentManager().assign(e.getKey(), true);
+          }
+        }
+      }
+    } finally {
+      this.deadServers.finish(serverName);
     }
-    this.deadServers.finish(serverName);
     LOG.info("Finished processing of shutdown of " + serverName);
   }
 
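
Review note: the substantive change in this patch is the try/finally wrapped around the body of process(), so that this.deadServers.finish(serverName) runs even when log splitting or region reassignment throws; per the JIRA title, a dead server that is never marked finished otherwise keeps the balancer from running. The sketch below shows the same pattern in isolation. It is a minimal illustration under those assumptions, not HBase code: DeadServerBookkeeping, splitLogsAndReassign and balancerCanRun are hypothetical stand-ins for the real DeadServer bookkeeping used by the master.

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

// Hypothetical sketch of the try/finally pattern applied in this patch.
public class DeadServerBookkeeping {

  // Servers whose shutdown is still being processed; while this set is
  // non-empty, the (sketched) balancer refuses to run.
  private final Set<String> inProgress =
      Collections.synchronizedSet(new HashSet<String>());

  public void processShutdown(String serverName) throws IOException {
    inProgress.add(serverName);
    try {
      splitLogsAndReassign(serverName); // may throw, as in HBASE-4340
    } finally {
      // Before the fix this cleanup ran only on the success path, so a single
      // failed shutdown left serverName marked in-progress forever and the
      // balancer could never run again. finally guarantees it always runs.
      inProgress.remove(serverName);
    }
  }

  public boolean balancerCanRun() {
    return inProgress.isEmpty();
  }

  // Placeholder for log splitting plus region reassignment; always fails here
  // to demonstrate the cleanup path.
  private void splitLogsAndReassign(String serverName) throws IOException {
    throw new IOException("simulated failure while processing " + serverName);
  }

  public static void main(String[] args) {
    DeadServerBookkeeping tracker = new DeadServerBookkeeping();
    try {
      tracker.processShutdown("regionserver-1,60020,1315600000000");
    } catch (IOException e) {
      System.out.println("shutdown processing failed: " + e.getMessage());
    }
    // Prints true: the finally block cleared the in-progress entry.
    System.out.println("balancer can run: " + tracker.balancerCanRun());
  }
}

Using finally rather than swallowing the exception keeps the failure propagating to the handler's caller for logging, while still guaranteeing the dead-server entry is cleared so the balancer is not blocked.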