HBASE-21325 Force to terminate regionserver when abort hang in somewhere

Conflicts:
	hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
	hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java

Amending-Author: Andrew Purtell <apurtell@apache.org>
This commit is contained in:
Guanghao Zhang 2018-10-19 19:34:04 +08:00 committed by Andrew Purtell
parent f3f3798575
commit e7ff91f35e
No known key found for this signature in database
GPG Key ID: 8597754DD5365CCD
2 changed files with 175 additions and 1 deletions

View File

@ -51,6 +51,8 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
@ -312,6 +314,11 @@ public class HRegionServer extends HasThread implements
// Go down hard. Used if file system becomes unavailable and also in
// debugging and unit tests.
private volatile boolean abortRequested;
public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout";
// Default abort timeout is 1200 seconds for safe
private static final long DEFAULT_ABORT_TIMEOUT = 1200000;
// Will run this task when abort timeout
public static final String ABORT_TIMEOUT_TASK = "hbase.regionserver.abort.timeout.task";
ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<String, Integer>();
@ -1041,12 +1048,31 @@ public class HRegionServer extends HasThread implements
abort(prefix + t.getMessage(), t);
}
}
// Run shutdown.
if (mxBean != null) {
MBeanUtil.unregisterMBean(mxBean);
mxBean = null;
}
if (this.leases != null) this.leases.closeAfterLeasesExpire();
if (abortRequested) {
Timer abortMonitor = new Timer("Abort regionserver monitor", true);
TimerTask abortTimeoutTask = null;
try {
abortTimeoutTask =
Class.forName(conf.get(ABORT_TIMEOUT_TASK, SystemExitWhenAbortTimeout.class.getName()))
.asSubclass(TimerTask.class).getDeclaredConstructor().newInstance();
} catch (Exception e) {
LOG.warn("Initialize abort timeout task failed", e);
}
if (abortTimeoutTask != null) {
abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT));
}
}
if (this.leases != null) {
this.leases.closeAfterLeasesExpire();
}
if (this.splitLogWorker != null) {
splitLogWorker.stop();
}
@ -3552,4 +3578,15 @@ public class HRegionServer extends HasThread implements
public void unassign(byte[] regionName) throws IOException {
clusterConnection.getAdmin().unassign(regionName, false);
}
/**
* Force to terminate region server when abort timeout.
*/
private static class SystemExitWhenAbortTimeout extends TimerTask {
@Override
public void run() {
LOG.warn("Aborting region server timed out, terminate forcibly...");
System.exit(1);
}
}
}

View File

@ -0,0 +1,137 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Category({ RegionServerTests.class, MediumTests.class })
public class TestRegionServerAbortTimeout {
private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerAbortTimeout.class);
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
private static TableName TABLE_NAME = TableName.valueOf("RSAbort");
private static byte[] CF = Bytes.toBytes("cf");
private static byte[] CQ = Bytes.toBytes("cq");
private static final int REGIONS_NUM = 5;
private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000;
private static volatile boolean abortTimeoutTaskScheduled = false;
@BeforeClass
public static void setUp() throws Exception {
Configuration conf = UTIL.getConfiguration();
// Will schedule a abort timeout task after SLEEP_TIME_WHEN_CLOSE_REGION ms
conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION);
conf.set(HRegionServer.ABORT_TIMEOUT_TASK, TestAbortTimeoutTask.class.getName());
UTIL.startMiniCluster(2);
HTableDescriptor td = new HTableDescriptor(TABLE_NAME);
td.addCoprocessor(SleepWhenCloseCoprocessor.class.getName());
td.addFamily(new HColumnDescriptor(CF));
UTIL.getHBaseAdmin().createTable(td, Bytes.toBytes("0"), Bytes.toBytes("9"), REGIONS_NUM);
}
@AfterClass
public static void tearDown() throws Exception {
UTIL.getHBaseAdmin().disableTable(TABLE_NAME);
UTIL.getHBaseAdmin().deleteTable(TABLE_NAME);
UTIL.shutdownMiniCluster();
}
@Test
public void testAbortTimeout() throws Exception {
Thread writer = new Thread() {
public void run() {
try {
try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
for (int i = 0; i < 10000; i++) {
table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
}
}
} catch (IOException e) {
LOG.warn("Failed to load data");
}
}
};
writer.setDaemon(true);
writer.start();
// Abort one region server
UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test");
long startTime = System.currentTimeMillis();
long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10;
while (System.currentTimeMillis() - startTime < timeout) {
if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) {
assertTrue("Abort timer task should be scheduled", abortTimeoutTaskScheduled);
return;
}
Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
}
fail("Failed to abort a region server in " + timeout + " ms");
}
static class TestAbortTimeoutTask extends TimerTask {
public TestAbortTimeoutTask() {
}
@Override
public void run() {
LOG.info("TestAbortTimeoutTask was scheduled");
abortTimeoutTaskScheduled = true;
}
}
public static class SleepWhenCloseCoprocessor extends BaseRegionObserver {
@Override
public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested)
throws IOException {
Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
}
}
}