HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master

On Master@shutdown, close the shared Master connection to kill any ongoing RPCs by hosted clients. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Call close ont the Master shared clusterconnection to kill any ongoing rpcs. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Remove guts of close; we were closing the Masters connection....not our responsibility. Added unit test written by Duo Zhang which demonstrates the case where Master will not go down. Signed-off-by: zhangduo <zhangduo@apache.org>
2018-01-22 14:44:16 -08:00 · 2018-01-22 14:44:16 -08:00 · 739b9b4a8e
parent f952779ba2
commit 739b9b4a8e
4 changed files with 132 additions and 16 deletions
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@ -2616,7 +2616,7 @@ public class HMaster extends HRegionServer implements MasterServices {
  }

  @Override
-  public void abort(final String msg, final Throwable t) {
+  public void abort(String reason, Throwable cause) {
    if (isAborted() || isStopped()) {
      return;
    }
@ -2625,8 +2625,9 @@ public class HMaster extends HRegionServer implements MasterServices {
      LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
          getLoadedCoprocessors());
    }
-    if (t != null) {
-      LOG.error(HBaseMarkers.FATAL, msg, t);
+    String msg = "***** ABORTING master " + this + ": " + reason + " *****";
+    if (cause != null) {
+      LOG.error(HBaseMarkers.FATAL, msg, cause);
    } else {
      LOG.error(HBaseMarkers.FATAL, msg);
    }
@ -2677,14 +2678,19 @@ public class HMaster extends HRegionServer implements MasterServices {
    return rsFatals;
  }

+  /**
+   * Shutdown the cluster.
+   * Master runs a coordinated stop of all RegionServers and then itself.
+   */
  public void shutdown() throws IOException {
    if (cpHost != null) {
      cpHost.preShutdown();
    }
-
+    // Tell the servermanager cluster is down.
    if (this.serverManager != null) {
      this.serverManager.shutdownCluster();
    }
+    // Set the cluster down flag; broadcast across the cluster.
    if (this.clusterStatusTracker != null){
      try {
        this.clusterStatusTracker.setClusterDown();
@ -2692,6 +2698,13 @@ public class HMaster extends HRegionServer implements MasterServices {
        LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
      }
    }
+    // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
+    // this is what we want especially if the Master is in startup phase doing call outs to
+    // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on
+    // the rpc to timeout.
+    if (this.clusterConnection != null) {
+      this.clusterConnection.close();
+    }
  }

  public void stopMaster() throws IOException {
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@ -201,9 +201,7 @@ public class ServerManager {
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
    this.connection = connect? master.getClusterConnection(): null;
-    this.rpcControllerFactory = this.connection == null
-        ? null
-        : connection.getRpcControllerFactory();
+    this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
  }

  /**
@ -968,16 +966,10 @@ public class ServerManager {
  }

  /**
-   * Stop the ServerManager.  Currently closes the connection to the master.
+   * Stop the ServerManager.
   */
  public void stop() {
-    if (connection != null) {
-      try {
-        connection.close();
-      } catch (IOException e) {
-        LOG.error("Attempt to close connection to master failed", e);
-      }
-    }
+    // Nothing to do.
  }

  /**
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@ -248,6 +248,9 @@ public class HRegionServer extends HasThread implements
   * Cluster connection to be shared by services.
   * Initialized at server startup and closed when server shuts down.
   * Clients must never close it explicitly.
+   * Clients hosted by this Server should make use of this clusterConnection rather than create
+   * their own; if they create their own, there is no way for the hosting server to shutdown
+   * ongoing client RPCs.
   */
  protected ClusterConnection clusterConnection;

--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.util.concurrent.CountDownLatch;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CategoryBasedTimeout;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
+import org.apache.zookeeper.KeeperException;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestRule;
+
+/**
+ * Test to confirm that we will not hang when stop a backup master which is trying to become the
+ * active master. See HBASE-19838
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestShutdownBackupMaster {
+  @Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
+          withLookingForStuckThread(true).build();
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static volatile CountDownLatch ARRIVE;
+
+  private static volatile CountDownLatch CONTINUE;
+
+  public static final class MockHMaster extends HMaster {
+
+    public MockHMaster(Configuration conf) throws IOException, KeeperException {
+      super(conf);
+    }
+
+    @Override
+    void initClusterSchemaService() throws IOException, InterruptedException {
+      if (ARRIVE != null) {
+        ARRIVE.countDown();
+        CONTINUE.await();
+      }
+      super.initClusterSchemaService();
+    }
+  }
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
+    UTIL.startMiniCluster(2, 2);
+    UTIL.waitUntilAllSystemRegionsAssigned();
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    // make sure that we can stop the cluster cleanly
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testShutdownWhileBecomingActive() throws InterruptedException {
+    MiniHBaseCluster cluster = UTIL.getHBaseCluster();
+    HMaster activeMaster = null;
+    HMaster backupMaster = null;
+    for (MasterThread t : cluster.getMasterThreads()) {
+      if (t.getMaster().isActiveMaster()) {
+        activeMaster = t.getMaster();
+      } else {
+        backupMaster = t.getMaster();
+      }
+    }
+    assertNotNull(activeMaster);
+    assertNotNull(backupMaster);
+    ARRIVE = new CountDownLatch(1);
+    CONTINUE = new CountDownLatch(1);
+    activeMaster.abort("Aborting active master for test");
+    // wait until we arrive the initClusterSchemaService
+    ARRIVE.await();
+    // killall RSes
+    cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
+        .forEachOrdered(rs -> rs.abort("Aborting RS for test"));
+    CONTINUE.countDown();
+  }
+}