HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master
On Master#shutdown, close the shared Master connection to kill any ongoing RPCs by hosted clients.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
  Call close on the Master's shared cluster connection to kill any ongoing RPCs.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
  Remove the guts of close; we were closing the Master's connection, which is not our responsibility.

Added a unit test written by Duo Zhang which demonstrates the case where the Master will not go down.

Signed-off-by: zhangduo <zhangduo@apache.org>
parent 541f8ad8a8
commit 970636c5af
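In short: a backup master that has started to become the active master can be blocked inside an RPC issued through the shared cluster connection (for example a retrying call against hbase:meta during startup). If shutdown only sets flags, that thread keeps waiting until the RPC times out and the process cannot exit cleanly; closing the shared connection makes any such in-flight call fail immediately. The following is a minimal, self-contained sketch of that pattern only; SharedConnection and ShutdownSketch are made-up names and none of this is HBase API.

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

// Hypothetical stand-in for the shared cluster connection: call() blocks until either the
// (never-arriving) response times out or the connection is closed; close() releases all
// in-flight callers immediately instead of leaving them to wait out the timeout.
final class SharedConnection implements AutoCloseable {
  private final CountDownLatch closed = new CountDownLatch(1);

  void call(long timeoutMillis) throws IOException, InterruptedException {
    if (closed.await(timeoutMillis, TimeUnit.MILLISECONDS)) {
      throw new IOException("connection closed");        // fail fast, like a killed RPC
    }
    throw new IOException("rpc timed out after " + timeoutMillis + "ms");
  }

  @Override
  public void close() {
    closed.countDown();
  }
}

public class ShutdownSketch {
  public static void main(String[] args) throws Exception {
    SharedConnection conn = new SharedConnection();
    Thread hostedClient = new Thread(() -> {
      try {
        conn.call(TimeUnit.MINUTES.toMillis(10));        // would otherwise block ~10 minutes
      } catch (Exception e) {
        System.out.println("hosted client released: " + e.getMessage());
      }
    });
    hostedClient.start();
    Thread.sleep(100);
    conn.close();            // analogue of HMaster#shutdown closing this.clusterConnection
    hostedClient.join(1000); // returns almost immediately rather than after the timeout
  }
}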
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -2605,7 +2605,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   }
 
   @Override
-  public void abort(final String msg, final Throwable t) {
+  public void abort(String reason, Throwable cause) {
     if (isAborted() || isStopped()) {
       return;
     }
@@ -2614,8 +2614,9 @@ public class HMaster extends HRegionServer implements MasterServices {
       LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
           getLoadedCoprocessors());
     }
-    if (t != null) {
-      LOG.error(HBaseMarkers.FATAL, msg, t);
+    String msg = "***** ABORTING master " + this + ": " + reason + " *****";
+    if (cause != null) {
+      LOG.error(HBaseMarkers.FATAL, msg, cause);
     } else {
       LOG.error(HBaseMarkers.FATAL, msg);
     }
@@ -2666,14 +2667,19 @@ public class HMaster extends HRegionServer implements MasterServices {
     return rsFatals;
   }
 
   /**
    * Shutdown the cluster.
    * Master runs a coordinated stop of all RegionServers and then itself.
    */
   public void shutdown() throws IOException {
     if (cpHost != null) {
       cpHost.preShutdown();
     }
 
     // Tell the servermanager cluster is down.
     if (this.serverManager != null) {
       this.serverManager.shutdownCluster();
     }
     // Set the cluster down flag; broadcast across the cluster.
     if (this.clusterStatusTracker != null){
       try {
         this.clusterStatusTracker.setClusterDown();
@@ -2681,6 +2687,13 @@ public class HMaster extends HRegionServer implements MasterServices {
         LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
       }
     }
+    // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
+    // this is what we want especially if the Master is in startup phase doing call outs to
+    // hbase:meta, etc. when cluster is down. Without this connection close, we'd have to wait on
+    // the rpc to timeout.
+    if (this.clusterConnection != null) {
+      this.clusterConnection.close();
+    }
   }
 
   public void stopMaster() throws IOException {
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -200,10 +200,8 @@ public class ServerManager {
     Configuration c = master.getConfiguration();
     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
     warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
-    this.connection = connect ? master.getClusterConnection() : null;
-    this.rpcControllerFactory = this.connection == null
-        ? null
-        : connection.getRpcControllerFactory();
+    this.connection = connect? master.getClusterConnection(): null;
+    this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
   }
 
   /**
@@ -968,16 +966,10 @@ public class ServerManager {
   }
 
   /**
-   * Stop the ServerManager. Currently closes the connection to the master.
+   * Stop the ServerManager.
    */
   public void stop() {
-    if (connection != null) {
-      try {
-        connection.close();
-      } catch (IOException e) {
-        LOG.error("Attempt to close connection to master failed", e);
-      }
-    }
+    // Nothing to do.
   }
 
   /**
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -251,6 +251,9 @@ public class HRegionServer extends HasThread implements
    * Cluster connection to be shared by services.
    * Initialized at server startup and closed when server shuts down.
    * Clients must never close it explicitly.
+   * Clients hosted by this Server should make use of this clusterConnection rather than create
+   * their own; if they create their own, there is no way for the hosting server to shutdown
+   * ongoing client RPCs.
    */
   protected ClusterConnection clusterConnection;
 
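To make the new javadoc concrete: a hosted service should go through the server's shared connection rather than build its own, so that closing that connection at shutdown also aborts the service's in-flight calls. A rough sketch under that reading follows; HostedMetaReader is a hypothetical helper, not part of this patch, and it only assumes the existing HRegionServer#getClusterConnection and Connection#getTable APIs.

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.regionserver.HRegionServer;

// Hypothetical hosted-service helper: reads the first row of hbase:meta through the server's
// shared cluster connection. It does not own that connection, so it never closes it; when the
// hosting server shuts down and closes clusterConnection, a call blocked in here fails fast
// instead of waiting out the RPC timeout.
final class HostedMetaReader {
  static Result readFirstMetaRow(HRegionServer server) throws IOException {
    ClusterConnection conn = server.getClusterConnection(); // shared; do NOT close here
    try (Table meta = conn.getTable(TableName.META_TABLE_NAME);
        ResultScanner scanner = meta.getScanner(new Scan())) {
      return scanner.next(); // first hbase:meta row, or null if meta is empty
    }
  }
}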
New file: hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java (@@ -0,0 +1,108 @@)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertNotNull;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CategoryBasedTimeout;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestRule;

/**
 * Test to confirm that we will not hang when stopping a backup master which is trying to become
 * the active master. See HBASE-19838
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestShutdownBackupMaster {

  @Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
      withLookingForStuckThread(true).build();

  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();

  private static volatile CountDownLatch ARRIVE;

  private static volatile CountDownLatch CONTINUE;

  public static final class MockHMaster extends HMaster {

    public MockHMaster(Configuration conf) throws IOException, KeeperException {
      super(conf);
    }

    @Override
    void initClusterSchemaService() throws IOException, InterruptedException {
      if (ARRIVE != null) {
        ARRIVE.countDown();
        CONTINUE.await();
      }
      super.initClusterSchemaService();
    }
  }

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
    UTIL.startMiniCluster(2, 2);
    UTIL.waitUntilAllSystemRegionsAssigned();
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    // make sure that we can stop the cluster cleanly
    UTIL.shutdownMiniCluster();
  }

  @Test
  public void testShutdownWhileBecomingActive() throws InterruptedException {
    MiniHBaseCluster cluster = UTIL.getHBaseCluster();
    HMaster activeMaster = null;
    HMaster backupMaster = null;
    for (MasterThread t : cluster.getMasterThreads()) {
      if (t.getMaster().isActiveMaster()) {
        activeMaster = t.getMaster();
      } else {
        backupMaster = t.getMaster();
      }
    }
    assertNotNull(activeMaster);
    assertNotNull(backupMaster);
    ARRIVE = new CountDownLatch(1);
    CONTINUE = new CountDownLatch(1);
    activeMaster.abort("Aborting active master for test");
    // wait until we arrive at initClusterSchemaService
    ARRIVE.await();
    // kill all RSes
    cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
        .forEachOrdered(rs -> rs.abort("Aborting RS for test"));
    CONTINUE.countDown();
  }
}
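As read from the test: MockHMaster overrides initClusterSchemaService(), a step on the active-master startup path, and parks there on the CONTINUE latch. Aborting the original active master therefore leaves the backup master stuck part-way through becoming active, which is exactly the HBASE-19838 scenario. The test then aborts every RegionServer and drops the latch, and tearDownAfterClass() shuts the mini cluster down; without the connection close in HMaster#shutdown that shutdown would hang on the blocked master's in-flight RPCs and the CategoryBasedTimeout rule would fail the test, whereas with this patch it completes cleanly.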