HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master
On Master@shutdown, close the shared Master connection to kill any ongoing RPCs by hosted clients. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Call close ont the Master shared clusterconnection to kill any ongoing rpcs. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Remove guts of close; we were closing the Masters connection....not our responsibility. Added unit test written by Duo Zhang which demonstrates the case where Master will not go down. Signed-off-by: zhangduo <zhangduo@apache.org>
This commit is contained in:
parent
f952779ba2
commit
739b9b4a8e
|
@ -2616,7 +2616,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void abort(final String msg, final Throwable t) {
|
public void abort(String reason, Throwable cause) {
|
||||||
if (isAborted() || isStopped()) {
|
if (isAborted() || isStopped()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -2625,8 +2625,9 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
|
LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
|
||||||
getLoadedCoprocessors());
|
getLoadedCoprocessors());
|
||||||
}
|
}
|
||||||
if (t != null) {
|
String msg = "***** ABORTING master " + this + ": " + reason + " *****";
|
||||||
LOG.error(HBaseMarkers.FATAL, msg, t);
|
if (cause != null) {
|
||||||
|
LOG.error(HBaseMarkers.FATAL, msg, cause);
|
||||||
} else {
|
} else {
|
||||||
LOG.error(HBaseMarkers.FATAL, msg);
|
LOG.error(HBaseMarkers.FATAL, msg);
|
||||||
}
|
}
|
||||||
|
@ -2677,14 +2678,19 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
return rsFatals;
|
return rsFatals;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shutdown the cluster.
|
||||||
|
* Master runs a coordinated stop of all RegionServers and then itself.
|
||||||
|
*/
|
||||||
public void shutdown() throws IOException {
|
public void shutdown() throws IOException {
|
||||||
if (cpHost != null) {
|
if (cpHost != null) {
|
||||||
cpHost.preShutdown();
|
cpHost.preShutdown();
|
||||||
}
|
}
|
||||||
|
// Tell the servermanager cluster is down.
|
||||||
if (this.serverManager != null) {
|
if (this.serverManager != null) {
|
||||||
this.serverManager.shutdownCluster();
|
this.serverManager.shutdownCluster();
|
||||||
}
|
}
|
||||||
|
// Set the cluster down flag; broadcast across the cluster.
|
||||||
if (this.clusterStatusTracker != null){
|
if (this.clusterStatusTracker != null){
|
||||||
try {
|
try {
|
||||||
this.clusterStatusTracker.setClusterDown();
|
this.clusterStatusTracker.setClusterDown();
|
||||||
|
@ -2692,6 +2698,13 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
|
LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
|
||||||
|
// this is what we want especially if the Master is in startup phase doing call outs to
|
||||||
|
// hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on
|
||||||
|
// the rpc to timeout.
|
||||||
|
if (this.clusterConnection != null) {
|
||||||
|
this.clusterConnection.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void stopMaster() throws IOException {
|
public void stopMaster() throws IOException {
|
||||||
|
|
|
@ -200,10 +200,8 @@ public class ServerManager {
|
||||||
Configuration c = master.getConfiguration();
|
Configuration c = master.getConfiguration();
|
||||||
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
|
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
|
||||||
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
|
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
|
||||||
this.connection = connect ? master.getClusterConnection() : null;
|
this.connection = connect? master.getClusterConnection(): null;
|
||||||
this.rpcControllerFactory = this.connection == null
|
this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
|
||||||
? null
|
|
||||||
: connection.getRpcControllerFactory();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -968,16 +966,10 @@ public class ServerManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stop the ServerManager. Currently closes the connection to the master.
|
* Stop the ServerManager.
|
||||||
*/
|
*/
|
||||||
public void stop() {
|
public void stop() {
|
||||||
if (connection != null) {
|
// Nothing to do.
|
||||||
try {
|
|
||||||
connection.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.error("Attempt to close connection to master failed", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -248,6 +248,9 @@ public class HRegionServer extends HasThread implements
|
||||||
* Cluster connection to be shared by services.
|
* Cluster connection to be shared by services.
|
||||||
* Initialized at server startup and closed when server shuts down.
|
* Initialized at server startup and closed when server shuts down.
|
||||||
* Clients must never close it explicitly.
|
* Clients must never close it explicitly.
|
||||||
|
* Clients hosted by this Server should make use of this clusterConnection rather than create
|
||||||
|
* their own; if they create their own, there is no way for the hosting server to shutdown
|
||||||
|
* ongoing client RPCs.
|
||||||
*/
|
*/
|
||||||
protected ClusterConnection clusterConnection;
|
protected ClusterConnection clusterConnection;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hbase.master;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.CountDownLatch;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hbase.CategoryBasedTimeout;
|
||||||
|
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||||
|
import org.apache.hadoop.hbase.HConstants;
|
||||||
|
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||||
|
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||||
|
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||||
|
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
|
import org.junit.AfterClass;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Rule;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.experimental.categories.Category;
|
||||||
|
import org.junit.rules.TestRule;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test to confirm that we will not hang when stop a backup master which is trying to become the
|
||||||
|
* active master. See HBASE-19838
|
||||||
|
*/
|
||||||
|
@Category({ MasterTests.class, MediumTests.class })
|
||||||
|
public class TestShutdownBackupMaster {
|
||||||
|
@Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
|
||||||
|
withLookingForStuckThread(true).build();
|
||||||
|
|
||||||
|
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||||
|
|
||||||
|
private static volatile CountDownLatch ARRIVE;
|
||||||
|
|
||||||
|
private static volatile CountDownLatch CONTINUE;
|
||||||
|
|
||||||
|
public static final class MockHMaster extends HMaster {
|
||||||
|
|
||||||
|
public MockHMaster(Configuration conf) throws IOException, KeeperException {
|
||||||
|
super(conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void initClusterSchemaService() throws IOException, InterruptedException {
|
||||||
|
if (ARRIVE != null) {
|
||||||
|
ARRIVE.countDown();
|
||||||
|
CONTINUE.await();
|
||||||
|
}
|
||||||
|
super.initClusterSchemaService();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void setUpBeforeClass() throws Exception {
|
||||||
|
UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
|
||||||
|
UTIL.startMiniCluster(2, 2);
|
||||||
|
UTIL.waitUntilAllSystemRegionsAssigned();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public static void tearDownAfterClass() throws Exception {
|
||||||
|
// make sure that we can stop the cluster cleanly
|
||||||
|
UTIL.shutdownMiniCluster();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testShutdownWhileBecomingActive() throws InterruptedException {
|
||||||
|
MiniHBaseCluster cluster = UTIL.getHBaseCluster();
|
||||||
|
HMaster activeMaster = null;
|
||||||
|
HMaster backupMaster = null;
|
||||||
|
for (MasterThread t : cluster.getMasterThreads()) {
|
||||||
|
if (t.getMaster().isActiveMaster()) {
|
||||||
|
activeMaster = t.getMaster();
|
||||||
|
} else {
|
||||||
|
backupMaster = t.getMaster();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertNotNull(activeMaster);
|
||||||
|
assertNotNull(backupMaster);
|
||||||
|
ARRIVE = new CountDownLatch(1);
|
||||||
|
CONTINUE = new CountDownLatch(1);
|
||||||
|
activeMaster.abort("Aborting active master for test");
|
||||||
|
// wait until we arrive the initClusterSchemaService
|
||||||
|
ARRIVE.await();
|
||||||
|
// killall RSes
|
||||||
|
cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
|
||||||
|
.forEachOrdered(rs -> rs.abort("Aborting RS for test"));
|
||||||
|
CONTINUE.countDown();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue