diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index 8c0d2739829..e8a00416e57 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -50,6 +51,12 @@ public class DistributedHBaseCluster extends HBaseCluster { private final Connection connection; private ClusterManager clusterManager; + /** + * List of RegionServers killed so far. ServerName also comprises startCode of a server, + * so any restarted instances of the same server will have different ServerName and will not + * coincide with past dead ones. So there's no need to cleanup this list. + */ + private Set killedRegionServers = new HashSet<>(); public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager) throws IOException { @@ -113,10 +120,16 @@ public class DistributedHBaseCluster extends HBaseCluster { @Override public void killRegionServer(ServerName serverName) throws IOException { LOG.info("Aborting RS: " + serverName.getServerName()); + killedRegionServers.add(serverName); clusterManager.kill(ServiceType.HBASE_REGIONSERVER, serverName.getHostname(), serverName.getPort()); } + @Override + public boolean isKilledRS(ServerName serverName) { + return killedRegionServers.contains(serverName); + } + @Override public void stopRegionServer(ServerName serverName) throws IOException { LOG.info("Stopping RS: " + serverName.getServerName()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java index 723a7761d0a..ba70ef486a6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java @@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; +import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.com.google.common.base.Preconditions; /** @@ -317,12 +318,14 @@ public class RegionStateStore { * @param r Result to pull the region state from * @return the region state, or null if unknown. */ - protected State getRegionState(final Result r, int replicaId) { + @VisibleForTesting + public static State getRegionState(final Result r, int replicaId) { Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId)); if (cell == null || cell.getValueLength() == 0) { return null; } - return State.valueOf(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength())); + return State.valueOf(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), + cell.getValueLength())); } private static byte[] getStateColumn(int replicaId) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index dfff9f44759..cac957bc88a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -128,6 +128,15 @@ public abstract class HBaseCluster implements Closeable, Configurable { */ public abstract void killRegionServer(ServerName serverName) throws IOException; + /** + * Keeping track of killed servers and being able to check if a particular server was killed makes + * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete + * example of such case is - killing servers and waiting for all regions of a particular table + * to be assigned. We can check for server column in META table and that its value is not one + * of the killed servers. + */ + public abstract boolean isKilledRS(ServerName serverName); + /** * Stops the given region server, by attempting a gradual stop. * @return whether the operation finished with success diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index eec78920eb4..4e65651b566 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -67,6 +67,8 @@ import org.apache.hadoop.hbase.Waiter.Predicate; import org.apache.hadoop.hbase.client.ImmutableHRegionInfo; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStateStore; import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; @@ -3418,8 +3420,27 @@ public class HBaseTestingUtility extends HBaseCommonTestingUtility { byte[] b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); HRegionInfo info = HRegionInfo.parseFromOrNull(b); if (info != null && info.getTable().equals(tableName)) { - b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); - allRegionsAssigned &= (b != null); + // Get server hosting this region from catalog family. Return false if no server + // hosting this region, or if the server hosting this region was recently killed + // (for fault tolerance testing). + byte[] server = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + if (server == null) { + return false; + } else { + byte[] startCode = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER); + ServerName serverName = + ServerName.valueOf(Bytes.toString(server).replaceFirst(":", ",") + "," + + Bytes.toLong(startCode)); + if (getHBaseCluster().isKilledRS(serverName)) { + return false; + } + } + if (RegionStateStore.getRegionState(r, info.getReplicaId()) + != RegionState.State.OPEN) { + return false; + } } } } finally { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index a46fa9d347b..e02347d3c3d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -21,8 +21,10 @@ package org.apache.hadoop.hbase; import java.io.IOException; import java.security.PrivilegedAction; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.yetus.audience.InterfaceAudience; @@ -108,6 +110,12 @@ public class MiniHBaseCluster extends HBaseCluster { public static class MiniHBaseClusterRegionServer extends HRegionServer { private Thread shutdownThread = null; private User user = null; + /** + * List of RegionServers killed so far. ServerName also comprises startCode of a server, + * so any restarted instances of the same server will have different ServerName and will not + * coincide with past dead ones. So there's no need to cleanup this list. + */ + static Set killedServers = new HashSet<>(); public MiniHBaseClusterRegionServer(Configuration conf) throws IOException, InterruptedException { @@ -156,7 +164,8 @@ public class MiniHBaseCluster extends HBaseCluster { } @Override - public void kill() { + protected void kill() { + killedServers.add(getServerName()); super.kill(); } @@ -249,6 +258,11 @@ public class MiniHBaseCluster extends HBaseCluster { } } + @Override + public boolean isKilledRS(ServerName serverName) { + return MiniHBaseClusterRegionServer.killedServers.contains(serverName); + } + @Override public void stopRegionServer(ServerName serverName) throws IOException { stopRegionServer(getRegionServerIndex(serverName)); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/TestRegionObserverInterface.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/TestRegionObserverInterface.java index 49fc3fd5470..fe67a08774d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/TestRegionObserverInterface.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/TestRegionObserverInterface.java @@ -35,6 +35,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CategoryBasedTimeout; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.Coprocessor; @@ -88,12 +89,16 @@ import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.rules.TestName; +import org.junit.rules.TestRule; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; @Category({ CoprocessorTests.class, MediumTests.class }) public class TestRegionObserverInterface { private static final Log LOG = LogFactory.getLog(TestRegionObserverInterface.class); + @Rule + public final TestRule timeout = CategoryBasedTimeout.builder(). + withTimeout(this.getClass()).withLookingForStuckThread(true).build(); public static final TableName TEST_TABLE = TableName.valueOf("TestTable"); public final static byte[] A = Bytes.toBytes("a"); @@ -124,7 +129,7 @@ public class TestRegionObserverInterface { util.shutdownMiniCluster(); } - @Test(timeout = 300000) + @Test public void testRegionObserver() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); // recreate table every time in order to reset the status of the @@ -184,7 +189,7 @@ public class TestRegionObserverInterface { tableName, new Integer[] { 1, 1, 1, 1 }); } - @Test(timeout = 300000) + @Test public void testRowMutation() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); Table table = util.createTable(tableName, new byte[][] { A, B, C }); @@ -216,7 +221,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testIncrementHook() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); Table table = util.createTable(tableName, new byte[][] { A, B, C }); @@ -239,7 +244,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testCheckAndPutHooks() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); try (Table table = util.createTable(tableName, new byte[][] { A, B, C })) { @@ -260,7 +265,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testCheckAndDeleteHooks() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); Table table = util.createTable(tableName, new byte[][] { A, B, C }); @@ -285,7 +290,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testAppendHook() throws IOException { final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); Table table = util.createTable(tableName, new byte[][] { A, B, C }); @@ -308,7 +313,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test // HBase-3583 public void testHBase3583() throws IOException { final TableName tableName = TableName.valueOf(name.getMethodName()); @@ -351,7 +356,7 @@ public class TestRegionObserverInterface { table.close(); } - @Test(timeout = 300000) + @Test public void testHBASE14489() throws IOException { final TableName tableName = TableName.valueOf(name.getMethodName()); Table table = util.createTable(tableName, new byte[][] { A }); @@ -476,7 +481,7 @@ public class TestRegionObserverInterface { * Tests overriding compaction handling via coprocessor hooks * @throws Exception */ - @Test(timeout = 300000) + @Test public void testCompactionOverride() throws Exception { final TableName compactTable = TableName.valueOf(name.getMethodName()); Admin admin = util.getAdmin(); @@ -546,7 +551,7 @@ public class TestRegionObserverInterface { table.close(); } - @Test(timeout = 300000) + @Test public void bulkLoadHFileTest() throws Exception { final String testName = TestRegionObserverInterface.class.getName() + "." + name.getMethodName(); final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); @@ -575,7 +580,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testRecovery() throws Exception { LOG.info(TestRegionObserverInterface.class.getName() + "." + name.getMethodName()); final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName()); @@ -625,7 +630,7 @@ public class TestRegionObserverInterface { } } - @Test(timeout = 300000) + @Test public void testPreWALRestoreSkip() throws Exception { LOG.info(TestRegionObserverInterface.class.getName() + "." + name.getMethodName()); TableName tableName = TableName.valueOf(SimpleRegionObserver.TABLE_SKIPPED);