HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master

On Master@shutdown, close the shared Master connection to kill any
ongoing RPCs by hosted clients.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Call close on the Master's shared cluster connection to kill any ongoing
RPCs.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Remove the guts of stop(); we were closing the Master's connection, which is
not our responsibility.

Added unit test written by Duo Zhang which demonstrates the case where
Master will not go down.
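
The sketch below illustrates the pattern this fix relies on; the HostingServerSketch class and its methods are hypothetical, only the HBase client classes (Connection, ConnectionFactory, Table, Get) are real. A server hands one shared Connection to the clients it hosts, and closing that connection at shutdown makes their in-flight RPCs fail promptly instead of waiting out the RPC timeout.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Table;

// Hypothetical sketch: one shared connection for every client hosted by the server.
public class HostingServerSketch {
  private final Connection sharedConnection;

  HostingServerSketch(Configuration conf) throws IOException {
    this.sharedConnection = ConnectionFactory.createConnection(conf);
  }

  // Hosted clients borrow this connection instead of creating (and owning) their own.
  Connection getSharedConnection() {
    return sharedConnection;
  }

  // A hosted client doing a blocking read; it can hang if hbase:meta is unreachable.
  void hostedClientCall() throws IOException {
    try (Table meta = sharedConnection.getTable(TableName.META_TABLE_NAME)) {
      meta.get(new Get(new byte[] { 0 }));
    }
  }

  // Shutdown closes the shared connection, so hostedClientCall() fails fast.
  void shutdown() throws IOException {
    sharedConnection.close();
  }
}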

Signed-off-by: zhangduo <zhangduo@apache.org>
Michael Stack 2018-01-22 14:44:16 -08:00 committed by zhangduo
parent f952779ba2
commit 739b9b4a8e
4 changed files with 132 additions and 16 deletions

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

@@ -2616,7 +2616,7 @@ public class HMaster extends HRegionServer implements MasterServices {
}
@Override
public void abort(final String msg, final Throwable t) {
public void abort(String reason, Throwable cause) {
if (isAborted() || isStopped()) {
return;
}
@@ -2625,8 +2625,9 @@ public class HMaster extends HRegionServer implements MasterServices {
LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
getLoadedCoprocessors());
}
if (t != null) {
LOG.error(HBaseMarkers.FATAL, msg, t);
String msg = "***** ABORTING master " + this + ": " + reason + " *****";
if (cause != null) {
LOG.error(HBaseMarkers.FATAL, msg, cause);
} else {
LOG.error(HBaseMarkers.FATAL, msg);
}
@@ -2677,14 +2678,19 @@ public class HMaster extends HRegionServer implements MasterServices {
return rsFatals;
}
/**
* Shutdown the cluster.
* Master runs a coordinated stop of all RegionServers and then itself.
*/
public void shutdown() throws IOException {
if (cpHost != null) {
cpHost.preShutdown();
}
// Tell the servermanager cluster is down.
if (this.serverManager != null) {
this.serverManager.shutdownCluster();
}
// Set the cluster down flag; broadcast across the cluster.
if (this.clusterStatusTracker != null){
try {
this.clusterStatusTracker.setClusterDown();
@@ -2692,6 +2698,13 @@ public class HMaster extends HRegionServer implements MasterServices {
LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
}
}
// Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
// this is what we want especially if the Master is in startup phase doing call outs to
// hbase:meta, etc. when cluster is down. Without this connection close, we'd have to wait on
// the rpc to timeout.
if (this.clusterConnection != null) {
this.clusterConnection.close();
}
}
public void stopMaster() throws IOException {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

@@ -201,9 +201,7 @@ public class ServerManager {
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
this.connection = connect? master.getClusterConnection(): null;
this.rpcControllerFactory = this.connection == null
? null
: connection.getRpcControllerFactory();
this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
}
/**
@@ -968,16 +966,10 @@ public class ServerManager {
}
/**
* Stop the ServerManager. Currently closes the connection to the master.
* Stop the ServerManager.
*/
public void stop() {
if (connection != null) {
try {
connection.close();
} catch (IOException e) {
LOG.error("Attempt to close connection to master failed", e);
}
}
// Nothing to do.
}
/**

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

@@ -248,6 +248,9 @@ public class HRegionServer extends HasThread implements
* Cluster connection to be shared by services.
* Initialized at server startup and closed when server shuts down.
* Clients must never close it explicitly.
* Clients hosted by this Server should make use of this clusterConnection rather than create
* their own; if they create their own, there is no way for the hosting server to shutdown
* ongoing client RPCs.
*/
protected ClusterConnection clusterConnection;
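
To make the added javadoc concrete, here is a hedged sketch of a hosted client borrowing the server's shared connection; HostedClientSketch and readMeta are made-up names, while getClusterConnection() is the same accessor ServerManager uses elsewhere in this commit.

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;

final class HostedClientSketch {
  private HostedClientSketch() {}

  // Reads one row from hbase:meta through the hosting server's shared connection.
  static void readMeta(HRegionServer server) throws IOException {
    ClusterConnection conn = server.getClusterConnection(); // shared; owned by the server
    try (Table meta = conn.getTable(TableName.META_TABLE_NAME)) {
      meta.get(new Get(Bytes.toBytes("r1")));
    }
    // Do not call conn.close() here; the server closes this connection at shutdown,
    // which is exactly what kills any RPC still in flight.
  }
}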

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java (new file)

@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CategoryBasedTimeout;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestRule;
/**
* Test to confirm that we will not hang when stopping a backup master which is trying to become the
* active master. See HBASE-19838
*/
@Category({ MasterTests.class, MediumTests.class })
public class TestShutdownBackupMaster {
@Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
withLookingForStuckThread(true).build();
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
private static volatile CountDownLatch ARRIVE;
private static volatile CountDownLatch CONTINUE;
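// HMaster subclass that blocks in initClusterSchemaService, a step on the way to becoming the
// active master, so the test can issue a stop while the backup master is mid-transition.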
public static final class MockHMaster extends HMaster {
public MockHMaster(Configuration conf) throws IOException, KeeperException {
super(conf);
}
@Override
void initClusterSchemaService() throws IOException, InterruptedException {
if (ARRIVE != null) {
ARRIVE.countDown();
CONTINUE.await();
}
super.initClusterSchemaService();
}
}
@BeforeClass
public static void setUpBeforeClass() throws Exception {
UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
UTIL.startMiniCluster(2, 2);
UTIL.waitUntilAllSystemRegionsAssigned();
}
@AfterClass
public static void tearDownAfterClass() throws Exception {
// make sure that we can stop the cluster cleanly
UTIL.shutdownMiniCluster();
}
@Test
public void testShutdownWhileBecomingActive() throws InterruptedException {
MiniHBaseCluster cluster = UTIL.getHBaseCluster();
HMaster activeMaster = null;
HMaster backupMaster = null;
for (MasterThread t : cluster.getMasterThreads()) {
if (t.getMaster().isActiveMaster()) {
activeMaster = t.getMaster();
} else {
backupMaster = t.getMaster();
}
}
assertNotNull(activeMaster);
assertNotNull(backupMaster);
ARRIVE = new CountDownLatch(1);
CONTINUE = new CountDownLatch(1);
activeMaster.abort("Aborting active master for test");
// wait until we arrive at initClusterSchemaService
ARRIVE.await();
// kill all RSes
cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
.forEachOrdered(rs -> rs.abort("Aborting RS for test"));
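// Release the backup master so it resumes becoming the active master; the cluster shutdown in
// tearDownAfterClass must then complete without hanging on RPCs to the dead servers.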
CONTINUE.countDown();
}
}