HBASE-19335 Fix waitUntilAllRegionsAssigned(). Ignore assignments to killed servers and when region state != OPEN.

Update timeouts for TestRegionObserverInterface.
Reason: There are ~10 tests there, each with 5 min individual timeout. Too much. The test class is labelled MediumTests,
let's used that with our standard CategoryBasedTimeout. 3 min per test function should be enough even on slower Apache machines.
This commit is contained in:
Apekshit Sharma 2017-11-22 16:33:52 -08:00
parent 7c1c370f2f
commit e70b628544
6 changed files with 82 additions and 17 deletions

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.hbase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
@ -50,6 +51,12 @@ public class DistributedHBaseCluster extends HBaseCluster {
private final Connection connection;
private ClusterManager clusterManager;
/**
* List of RegionServers killed so far. ServerName also comprises startCode of a server,
* so any restarted instances of the same server will have different ServerName and will not
* coincide with past dead ones. So there's no need to cleanup this list.
*/
private Set<ServerName> killedRegionServers = new HashSet<>();
public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
throws IOException {
@ -113,10 +120,16 @@ public class DistributedHBaseCluster extends HBaseCluster {
@Override
public void killRegionServer(ServerName serverName) throws IOException {
LOG.info("Aborting RS: " + serverName.getServerName());
killedRegionServers.add(serverName);
clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
serverName.getHostname(), serverName.getPort());
}
@Override
public boolean isKilledRS(ServerName serverName) {
return killedRegionServers.contains(serverName);
}
@Override
public void stopRegionServer(ServerName serverName) throws IOException {
LOG.info("Stopping RS: " + serverName.getServerName());

View File

@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.com.google.common.base.Preconditions;
/**
@ -317,12 +318,14 @@ public class RegionStateStore {
* @param r Result to pull the region state from
* @return the region state, or null if unknown.
*/
protected State getRegionState(final Result r, int replicaId) {
@VisibleForTesting
public static State getRegionState(final Result r, int replicaId) {
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
if (cell == null || cell.getValueLength() == 0) {
return null;
}
return State.valueOf(Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()));
return State.valueOf(Bytes.toString(cell.getValueArray(), cell.getValueOffset(),
cell.getValueLength()));
}
private static byte[] getStateColumn(int replicaId) {

View File

@ -128,6 +128,15 @@ public abstract class HBaseCluster implements Closeable, Configurable {
*/
public abstract void killRegionServer(ServerName serverName) throws IOException;
/**
* Keeping track of killed servers and being able to check if a particular server was killed makes
* it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
* example of such case is - killing servers and waiting for all regions of a particular table
* to be assigned. We can check for server column in META table and that its value is not one
* of the killed servers.
*/
public abstract boolean isKilledRS(ServerName serverName);
/**
* Stops the given region server, by attempting a gradual stop.
* @return whether the operation finished with success

View File

@ -67,6 +67,8 @@ import org.apache.hadoop.hbase.Waiter.Predicate;
import org.apache.hadoop.hbase.client.ImmutableHRegionInfo;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.yetus.audience.InterfaceAudience;
@ -3418,8 +3420,27 @@ public class HBaseTestingUtility extends HBaseCommonTestingUtility {
byte[] b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
HRegionInfo info = HRegionInfo.parseFromOrNull(b);
if (info != null && info.getTable().equals(tableName)) {
b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
allRegionsAssigned &= (b != null);
// Get server hosting this region from catalog family. Return false if no server
// hosting this region, or if the server hosting this region was recently killed
// (for fault tolerance testing).
byte[] server =
r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
if (server == null) {
return false;
} else {
byte[] startCode =
r.getValue(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
ServerName serverName =
ServerName.valueOf(Bytes.toString(server).replaceFirst(":", ",") + "," +
Bytes.toLong(startCode));
if (getHBaseCluster().isKilledRS(serverName)) {
return false;
}
}
if (RegionStateStore.getRegionState(r, info.getReplicaId())
!= RegionState.State.OPEN) {
return false;
}
}
}
} finally {

View File

@ -21,8 +21,10 @@ package org.apache.hadoop.hbase;
import java.io.IOException;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.yetus.audience.InterfaceAudience;
@ -108,6 +110,12 @@ public class MiniHBaseCluster extends HBaseCluster {
public static class MiniHBaseClusterRegionServer extends HRegionServer {
private Thread shutdownThread = null;
private User user = null;
/**
* List of RegionServers killed so far. ServerName also comprises startCode of a server,
* so any restarted instances of the same server will have different ServerName and will not
* coincide with past dead ones. So there's no need to cleanup this list.
*/
static Set<ServerName> killedServers = new HashSet<>();
public MiniHBaseClusterRegionServer(Configuration conf)
throws IOException, InterruptedException {
@ -156,7 +164,8 @@ public class MiniHBaseCluster extends HBaseCluster {
}
@Override
public void kill() {
protected void kill() {
killedServers.add(getServerName());
super.kill();
}
@ -249,6 +258,11 @@ public class MiniHBaseCluster extends HBaseCluster {
}
}
@Override
public boolean isKilledRS(ServerName serverName) {
return MiniHBaseClusterRegionServer.killedServers.contains(serverName);
}
@Override
public void stopRegionServer(ServerName serverName) throws IOException {
stopRegionServer(getRegionServerIndex(serverName));

View File

@ -35,6 +35,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CategoryBasedTimeout;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.Coprocessor;
@ -88,12 +89,16 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.junit.rules.TestRule;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
@Category({ CoprocessorTests.class, MediumTests.class })
public class TestRegionObserverInterface {
private static final Log LOG = LogFactory.getLog(TestRegionObserverInterface.class);
@Rule
public final TestRule timeout = CategoryBasedTimeout.builder().
withTimeout(this.getClass()).withLookingForStuckThread(true).build();
public static final TableName TEST_TABLE = TableName.valueOf("TestTable");
public final static byte[] A = Bytes.toBytes("a");
@ -124,7 +129,7 @@ public class TestRegionObserverInterface {
util.shutdownMiniCluster();
}
@Test(timeout = 300000)
@Test
public void testRegionObserver() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
// recreate table every time in order to reset the status of the
@ -184,7 +189,7 @@ public class TestRegionObserverInterface {
tableName, new Integer[] { 1, 1, 1, 1 });
}
@Test(timeout = 300000)
@Test
public void testRowMutation() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
Table table = util.createTable(tableName, new byte[][] { A, B, C });
@ -216,7 +221,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testIncrementHook() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
Table table = util.createTable(tableName, new byte[][] { A, B, C });
@ -239,7 +244,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testCheckAndPutHooks() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
try (Table table = util.createTable(tableName, new byte[][] { A, B, C })) {
@ -260,7 +265,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testCheckAndDeleteHooks() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
Table table = util.createTable(tableName, new byte[][] { A, B, C });
@ -285,7 +290,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testAppendHook() throws IOException {
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
Table table = util.createTable(tableName, new byte[][] { A, B, C });
@ -308,7 +313,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
// HBase-3583
public void testHBase3583() throws IOException {
final TableName tableName = TableName.valueOf(name.getMethodName());
@ -351,7 +356,7 @@ public class TestRegionObserverInterface {
table.close();
}
@Test(timeout = 300000)
@Test
public void testHBASE14489() throws IOException {
final TableName tableName = TableName.valueOf(name.getMethodName());
Table table = util.createTable(tableName, new byte[][] { A });
@ -476,7 +481,7 @@ public class TestRegionObserverInterface {
* Tests overriding compaction handling via coprocessor hooks
* @throws Exception
*/
@Test(timeout = 300000)
@Test
public void testCompactionOverride() throws Exception {
final TableName compactTable = TableName.valueOf(name.getMethodName());
Admin admin = util.getAdmin();
@ -546,7 +551,7 @@ public class TestRegionObserverInterface {
table.close();
}
@Test(timeout = 300000)
@Test
public void bulkLoadHFileTest() throws Exception {
final String testName = TestRegionObserverInterface.class.getName() + "." + name.getMethodName();
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
@ -575,7 +580,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testRecovery() throws Exception {
LOG.info(TestRegionObserverInterface.class.getName() + "." + name.getMethodName());
final TableName tableName = TableName.valueOf(TEST_TABLE.getNameAsString() + "." + name.getMethodName());
@ -625,7 +630,7 @@ public class TestRegionObserverInterface {
}
}
@Test(timeout = 300000)
@Test
public void testPreWALRestoreSkip() throws Exception {
LOG.info(TestRegionObserverInterface.class.getName() + "." + name.getMethodName());
TableName tableName = TableName.valueOf(SimpleRegionObserver.TABLE_SKIPPED);