diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 6e5ce80f32a..5f250c0c032 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -51,6 +51,8 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; +import java.util.Timer; +import java.util.TimerTask; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -312,6 +314,11 @@ public class HRegionServer extends HasThread implements // Go down hard. Used if file system becomes unavailable and also in // debugging and unit tests. private volatile boolean abortRequested; + public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout"; + // Default abort timeout is 1200 seconds for safe + private static final long DEFAULT_ABORT_TIMEOUT = 1200000; + // Will run this task when abort timeout + public static final String ABORT_TIMEOUT_TASK = "hbase.regionserver.abort.timeout.task"; ConcurrentMap rowlocks = new ConcurrentHashMap(); @@ -1041,12 +1048,31 @@ public class HRegionServer extends HasThread implements abort(prefix + t.getMessage(), t); } } + // Run shutdown. if (mxBean != null) { MBeanUtil.unregisterMBean(mxBean); mxBean = null; } - if (this.leases != null) this.leases.closeAfterLeasesExpire(); + + if (abortRequested) { + Timer abortMonitor = new Timer("Abort regionserver monitor", true); + TimerTask abortTimeoutTask = null; + try { + abortTimeoutTask = + Class.forName(conf.get(ABORT_TIMEOUT_TASK, SystemExitWhenAbortTimeout.class.getName())) + .asSubclass(TimerTask.class).getDeclaredConstructor().newInstance(); + } catch (Exception e) { + LOG.warn("Initialize abort timeout task failed", e); + } + if (abortTimeoutTask != null) { + abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT)); + } + } + + if (this.leases != null) { + this.leases.closeAfterLeasesExpire(); + } if (this.splitLogWorker != null) { splitLogWorker.stop(); } @@ -3552,4 +3578,15 @@ public class HRegionServer extends HasThread implements public void unassign(byte[] regionName) throws IOException { clusterConnection.getAdmin().unassign(regionName, false); } + + /** + * Force to terminate region server when abort timeout. + */ + private static class SystemExitWhenAbortTimeout extends TimerTask { + @Override + public void run() { + LOG.warn("Aborting region server timed out, terminate forcibly..."); + System.exit(1); + } + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java new file mode 100644 index 00000000000..ed129c5a4f5 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.IOException; +import java.util.TimerTask; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Threads; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Category({ RegionServerTests.class, MediumTests.class }) +public class TestRegionServerAbortTimeout { + private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerAbortTimeout.class); + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static TableName TABLE_NAME = TableName.valueOf("RSAbort"); + + private static byte[] CF = Bytes.toBytes("cf"); + + private static byte[] CQ = Bytes.toBytes("cq"); + + private static final int REGIONS_NUM = 5; + + private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000; + + private static volatile boolean abortTimeoutTaskScheduled = false; + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = UTIL.getConfiguration(); + // Will schedule a abort timeout task after SLEEP_TIME_WHEN_CLOSE_REGION ms + conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION); + conf.set(HRegionServer.ABORT_TIMEOUT_TASK, TestAbortTimeoutTask.class.getName()); + UTIL.startMiniCluster(2); + HTableDescriptor td = new HTableDescriptor(TABLE_NAME); + td.addCoprocessor(SleepWhenCloseCoprocessor.class.getName()); + td.addFamily(new HColumnDescriptor(CF)); + UTIL.getHBaseAdmin().createTable(td, Bytes.toBytes("0"), Bytes.toBytes("9"), REGIONS_NUM); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.getHBaseAdmin().disableTable(TABLE_NAME); + UTIL.getHBaseAdmin().deleteTable(TABLE_NAME); + UTIL.shutdownMiniCluster(); + } + + @Test + public void testAbortTimeout() throws Exception { + Thread writer = new Thread() { + public void run() { + try { + try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) { + for (int i = 0; i < 10000; i++) { + table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i))); + } + } + } catch (IOException e) { + LOG.warn("Failed to load data"); + } + } + }; + writer.setDaemon(true); + writer.start(); + + // Abort one region server + UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test"); + + long startTime = System.currentTimeMillis(); + long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10; + while (System.currentTimeMillis() - startTime < timeout) { + if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) { + assertTrue("Abort timer task should be scheduled", abortTimeoutTaskScheduled); + return; + } + Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION); + } + fail("Failed to abort a region server in " + timeout + " ms"); + } + + static class TestAbortTimeoutTask extends TimerTask { + + public TestAbortTimeoutTask() { + } + + @Override + public void run() { + LOG.info("TestAbortTimeoutTask was scheduled"); + abortTimeoutTaskScheduled = true; + } + } + + public static class SleepWhenCloseCoprocessor extends BaseRegionObserver { + @Override + public void preClose(ObserverContext c, boolean abortRequested) + throws IOException { + Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION); + } + } +}