HBASE-8144 Limit number of attempts to assign a region
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1462213 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a0185e5462
commit
b0e1395fc6
|
@ -186,6 +186,16 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
/** Is the TimeOutManagement activated **/
|
||||
private final boolean tomActivated;
|
||||
|
||||
/**
|
||||
* A map to track the count a region fails to open in a row.
|
||||
* So that we don't try to open a region forever if the failure is
|
||||
* unrecoverable. We don't put this information in region states
|
||||
* because we don't expect this to happen frequently; we don't
|
||||
* want to copy this information over during each state transition either.
|
||||
*/
|
||||
private final ConcurrentHashMap<String, AtomicInteger>
|
||||
failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
|
||||
|
||||
/**
|
||||
* Constructs a new assignment manager.
|
||||
*
|
||||
|
@ -541,7 +551,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
public void process() throws IOException {
|
||||
ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
|
||||
try {
|
||||
unassign(regionInfo, rs, expectedVersion, sn, true);
|
||||
unassign(regionInfo, rs, expectedVersion, sn, true, null);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
|
@ -880,9 +890,25 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
// When there are more than one region server a new RS is selected as the
|
||||
// destination and the same is updated in the regionplan. (HBASE-5546)
|
||||
if (regionState != null) {
|
||||
getRegionPlan(regionState.getRegion(), sn, true);
|
||||
this.executorService.submit(new ClosedRegionHandler(server,
|
||||
this, regionState.getRegion()));
|
||||
AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
|
||||
if (failedOpenCount == null) {
|
||||
failedOpenCount = new AtomicInteger();
|
||||
// No need to use putIfAbsent, or extra synchronization since
|
||||
// this whole handleRegion block is locked on the encoded region
|
||||
// name, and failedOpenTracker is updated only in this block
|
||||
failedOpenTracker.put(encodedName, failedOpenCount);
|
||||
}
|
||||
if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
|
||||
regionStates.updateRegionState(
|
||||
regionState.getRegion(), RegionState.State.FAILED_OPEN);
|
||||
// remove the tracking info to save memory, also reset
|
||||
// the count for next open initiative
|
||||
failedOpenTracker.remove(encodedName);
|
||||
} else {
|
||||
getRegionPlan(regionState.getRegion(), sn, true);
|
||||
this.executorService.submit(new ClosedRegionHandler(server,
|
||||
this, regionState.getRegion()));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -909,11 +935,16 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
+ " from server " + sn + " but region was in the state " + regionState
|
||||
+ " and not in expected PENDING_OPEN or OPENING states,"
|
||||
+ " or not on the expected server");
|
||||
// Close it without updating the internal region states,
|
||||
// so as not to create double assignments in unlucky scenarios
|
||||
// mentioned in OpenRegionHandler#process
|
||||
unassign(regionState.getRegion(), null, -1, null, false, sn);
|
||||
return;
|
||||
}
|
||||
// Handle OPENED by removing from transition and deleted zk node
|
||||
regionState = regionStates.updateRegionState(rt, RegionState.State.OPEN);
|
||||
if (regionState != null) {
|
||||
failedOpenTracker.remove(encodedName); // reset the count, if any
|
||||
this.executorService.submit(new OpenedRegionHandler(
|
||||
server, this, regionState.getRegion(), sn, expectedVersion));
|
||||
}
|
||||
|
@ -1571,12 +1602,23 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
}
|
||||
|
||||
/**
|
||||
* Send CLOSE RPC if the server is online, otherwise, offline the region
|
||||
* Send CLOSE RPC if the server is online, otherwise, offline the region.
|
||||
*
|
||||
* The RPC will be sent only to the region sever found in the region state
|
||||
* if it is passed in, otherwise, to the src server specified. If region
|
||||
* state is not specified, we don't update region state at all, instead
|
||||
* we just send the RPC call. This is useful for some cleanup without
|
||||
* messing around the region states (see handleRegion, on region opened
|
||||
* on an unexpected server scenario, for an example)
|
||||
*/
|
||||
private void unassign(final HRegionInfo region,
|
||||
final RegionState state, final int versionOfClosingNode,
|
||||
final ServerName dest, final boolean transitionInZK) {
|
||||
ServerName server = state.getServerName();
|
||||
final ServerName dest, final boolean transitionInZK,
|
||||
final ServerName src) {
|
||||
ServerName server = src;
|
||||
if (state != null) {
|
||||
server = state.getServerName();
|
||||
}
|
||||
for (int i = 1; i <= this.maximumAttempts; i++) {
|
||||
// ClosedRegionhandler can remove the server from this.regions
|
||||
if (!serverManager.isServerOnline(server)) {
|
||||
|
@ -1584,7 +1626,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
// delete the node. if no node exists need not bother.
|
||||
deleteClosingOrClosedNode(region);
|
||||
}
|
||||
regionOffline(region);
|
||||
if (state != null) {
|
||||
regionOffline(region);
|
||||
}
|
||||
return;
|
||||
}
|
||||
try {
|
||||
|
@ -1608,9 +1652,12 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
if (transitionInZK) {
|
||||
deleteClosingOrClosedNode(region);
|
||||
}
|
||||
regionOffline(region);
|
||||
if (state != null) {
|
||||
regionOffline(region);
|
||||
}
|
||||
return;
|
||||
} else if (t instanceof RegionAlreadyInTransitionException) {
|
||||
} else if (state != null
|
||||
&& t instanceof RegionAlreadyInTransitionException) {
|
||||
// RS is already processing this region, only need to update the timestamp
|
||||
LOG.debug("update " + state + " the timestamp.");
|
||||
state.updateTimestampToNow();
|
||||
|
@ -1622,7 +1669,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
}
|
||||
}
|
||||
// Run out of attempts
|
||||
if (!tomActivated) {
|
||||
if (!tomActivated && state != null) {
|
||||
regionStates.updateRegionState(region, RegionState.State.FAILED_CLOSE);
|
||||
}
|
||||
}
|
||||
|
@ -1649,7 +1696,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
case CLOSING:
|
||||
case PENDING_CLOSE:
|
||||
case FAILED_CLOSE:
|
||||
unassign(region, state, -1, null, false);
|
||||
unassign(region, state, -1, null, false, null);
|
||||
state = regionStates.getRegionState(region);
|
||||
if (state.isOffline()) break;
|
||||
case FAILED_OPEN:
|
||||
|
@ -1717,14 +1764,17 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
}
|
||||
}
|
||||
if (setOfflineInZK && versionOfOfflineNode == -1) {
|
||||
LOG.warn("Unable to set offline in ZooKeeper to assign " + region);
|
||||
if (!tomActivated) {
|
||||
regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
|
||||
LOG.info("Unable to set offline in ZooKeeper to assign " + region);
|
||||
// Setting offline in ZK must have been failed due to ZK racing or some
|
||||
// exception which may make the server to abort. If it is ZK racing,
|
||||
// we should retry since we already reset the region state,
|
||||
// existing (re)assignment will fail anyway.
|
||||
if (!server.isAborted()) {
|
||||
continue;
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (this.server.isStopped()) {
|
||||
LOG.debug("Server stopped; skipping assign of " + region);
|
||||
if (this.server.isStopped() || this.server.isAborted()) {
|
||||
LOG.debug("Server stopped/aborted; skipping assign of " + region);
|
||||
return;
|
||||
}
|
||||
LOG.info("Assigning region " + region.getRegionNameAsString() +
|
||||
|
@ -2144,7 +2194,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
return;
|
||||
}
|
||||
|
||||
unassign(region, state, versionOfClosingNode, dest, true);
|
||||
unassign(region, state, versionOfClosingNode, dest, true, null);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
|
|
|
@ -253,6 +253,10 @@ public class RegionStates {
|
|||
newServerName = null;
|
||||
}
|
||||
|
||||
if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
|
||||
LOG.warn("Failed to transition " + hri + " on " + serverName + ": " + state);
|
||||
}
|
||||
|
||||
String regionName = hri.getEncodedName();
|
||||
RegionState regionState = new RegionState(
|
||||
hri, state, System.currentTimeMillis(), newServerName);
|
||||
|
|
|
@ -26,6 +26,8 @@ import java.io.IOException;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
|
@ -44,6 +46,7 @@ import org.apache.hadoop.hbase.coprocessor.RegionObserver;
|
|||
import org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
@ -61,11 +64,13 @@ public class TestAssignmentManagerOnCluster {
|
|||
|
||||
@BeforeClass
|
||||
public static void setUpBeforeClass() throws Exception {
|
||||
// Using the test load balancer to control region plans
|
||||
// Using the mock load balancer to control region plans
|
||||
conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS,
|
||||
TestLoadBalancer.class, LoadBalancer.class);
|
||||
MockLoadBalancer.class, LoadBalancer.class);
|
||||
conf.setClass(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
|
||||
TestRegionObserver.class, RegionObserver.class);
|
||||
MockRegionObserver.class, RegionObserver.class);
|
||||
// Reduce the maximum attempts to speed up the test
|
||||
conf.setInt("hbase.assignment.maximum.attempts", 3);
|
||||
|
||||
TEST_UTIL.startMiniCluster(3);
|
||||
admin = TEST_UTIL.getHBaseAdmin();
|
||||
|
@ -141,7 +146,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
/**
|
||||
* This tests moving a region
|
||||
*/
|
||||
@Test
|
||||
@Test (timeout=50000)
|
||||
public void testMoveRegion() throws Exception {
|
||||
String table = "testMoveRegion";
|
||||
try {
|
||||
|
@ -163,7 +168,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
TEST_UTIL.getHBaseAdmin().move(hri.getEncodedNameAsBytes(),
|
||||
Bytes.toBytes(destServerName.getServerName()));
|
||||
|
||||
long timeoutTime = System.currentTimeMillis() + 800;
|
||||
long timeoutTime = System.currentTimeMillis() + 30000;
|
||||
while (true) {
|
||||
ServerName sn = regionStates.getRegionServerOfRegion(hri);
|
||||
if (sn != null && sn.equals(destServerName)) {
|
||||
|
@ -172,7 +177,8 @@ public class TestAssignmentManagerOnCluster {
|
|||
}
|
||||
long now = System.currentTimeMillis();
|
||||
if (now > timeoutTime) {
|
||||
fail("Failed to move the region in time");
|
||||
fail("Failed to move the region in time: "
|
||||
+ regionStates.getRegionState(hri));
|
||||
}
|
||||
regionStates.waitForUpdate(50);
|
||||
}
|
||||
|
@ -205,10 +211,60 @@ public class TestAssignmentManagerOnCluster {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This tests forcefully assign a region
|
||||
* while it's closing and re-assigned.
|
||||
*
|
||||
* This test should not be flaky. If it is flaky, it means something
|
||||
* wrong with AssignmentManager which should be reported and fixed
|
||||
*/
|
||||
@Test (timeout=60000)
|
||||
public void testForceAssignWhileClosing() throws Exception {
|
||||
String table = "testForceAssignWhileClosing";
|
||||
try {
|
||||
HTableDescriptor desc = new HTableDescriptor(table);
|
||||
desc.addFamily(new HColumnDescriptor(FAMILY));
|
||||
admin.createTable(desc);
|
||||
|
||||
HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
|
||||
HRegionInfo hri = new HRegionInfo(
|
||||
desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
|
||||
MetaEditor.addRegionToMeta(meta, hri);
|
||||
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
master.assignRegion(hri);
|
||||
AssignmentManager am = master.getAssignmentManager();
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
MockRegionObserver.enabled = true;
|
||||
am.unassign(hri);
|
||||
RegionState state = am.getRegionStates().getRegionState(hri);
|
||||
assertEquals(RegionState.State.FAILED_CLOSE, state.getState());
|
||||
|
||||
MockRegionObserver.enabled = false;
|
||||
am.unassign(hri, true);
|
||||
|
||||
// region is closing now, will be re-assigned automatically.
|
||||
// now, let's forcefully assign it again. it should be
|
||||
// assigned properly and no double-assignment
|
||||
am.assign(hri, true, true);
|
||||
|
||||
// region should be closed and re-assigned
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
} finally {
|
||||
MockRegionObserver.enabled = false;
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This tests region close failed
|
||||
*/
|
||||
@Test
|
||||
@Test (timeout=30000)
|
||||
public void testCloseFailed() throws Exception {
|
||||
String table = "testCloseFailed";
|
||||
try {
|
||||
|
@ -226,24 +282,25 @@ public class TestAssignmentManagerOnCluster {
|
|||
AssignmentManager am = master.getAssignmentManager();
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
TestRegionObserver.enabled = true;
|
||||
MockRegionObserver.enabled = true;
|
||||
am.unassign(hri);
|
||||
RegionState state = am.getRegionStates().getRegionState(hri);
|
||||
assertEquals(RegionState.State.FAILED_CLOSE, state.getState());
|
||||
|
||||
TestRegionObserver.enabled = false;
|
||||
MockRegionObserver.enabled = false;
|
||||
am.unassign(hri, true);
|
||||
state = am.getRegionStates().getRegionState(hri);
|
||||
assertTrue(RegionState.State.FAILED_CLOSE != state.getState());
|
||||
|
||||
am.assign(hri, true, true);
|
||||
// region may still be assigned now since it's closing,
|
||||
// let's check if it's assigned after it's out of transition
|
||||
am.waitOnRegionToClearRegionsInTransition(hri);
|
||||
|
||||
// region should be closed and re-assigned
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
} finally {
|
||||
TestRegionObserver.enabled = false;
|
||||
MockRegionObserver.enabled = false;
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
}
|
||||
}
|
||||
|
@ -251,7 +308,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
/**
|
||||
* This tests region open failed
|
||||
*/
|
||||
@Test
|
||||
@Test (timeout=30000)
|
||||
public void testOpenFailed() throws Exception {
|
||||
String table = "testOpenFailed";
|
||||
try {
|
||||
|
@ -264,7 +321,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
|
||||
MetaEditor.addRegionToMeta(meta, hri);
|
||||
|
||||
TestLoadBalancer.controledRegion = hri.getEncodedName();
|
||||
MockLoadBalancer.controledRegion = hri.getEncodedName();
|
||||
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
master.assignRegion(hri);
|
||||
|
@ -274,7 +331,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
RegionState state = am.getRegionStates().getRegionState(hri);
|
||||
assertEquals(RegionState.State.FAILED_OPEN, state.getState());
|
||||
|
||||
TestLoadBalancer.controledRegion = null;
|
||||
MockLoadBalancer.controledRegion = null;
|
||||
master.assignRegion(hri);
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
|
@ -282,12 +339,56 @@ public class TestAssignmentManagerOnCluster {
|
|||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
} finally {
|
||||
TestLoadBalancer.controledRegion = null;
|
||||
MockLoadBalancer.controledRegion = null;
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
}
|
||||
}
|
||||
|
||||
static class TestLoadBalancer extends StochasticLoadBalancer {
|
||||
/**
|
||||
* This tests region open failure which is not recoverable
|
||||
*/
|
||||
@Test (timeout=30000)
|
||||
public void testOpenFailedUnrecoverable() throws Exception {
|
||||
String table = "testOpenFailedUnrecoverable";
|
||||
try {
|
||||
HTableDescriptor desc = new HTableDescriptor(table);
|
||||
desc.addFamily(new HColumnDescriptor(FAMILY));
|
||||
admin.createTable(desc);
|
||||
|
||||
HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
|
||||
HRegionInfo hri = new HRegionInfo(
|
||||
desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
|
||||
MetaEditor.addRegionToMeta(meta, hri);
|
||||
|
||||
FileSystem fs = FileSystem.get(conf);
|
||||
Path tableDir= FSUtils.getTablePath(FSUtils.getRootDir(conf), table);
|
||||
Path regionDir = new Path(tableDir, hri.getEncodedName());
|
||||
// create a file named the same as the region dir to
|
||||
// mess up with region opening
|
||||
fs.create(regionDir, true);
|
||||
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
master.assignRegion(hri);
|
||||
AssignmentManager am = master.getAssignmentManager();
|
||||
assertFalse(am.waitForAssignment(hri));
|
||||
|
||||
RegionState state = am.getRegionStates().getRegionState(hri);
|
||||
assertEquals(RegionState.State.FAILED_OPEN, state.getState());
|
||||
|
||||
// remove the blocking file, so that region can be opened
|
||||
fs.delete(regionDir, true);
|
||||
master.assignRegion(hri);
|
||||
assertTrue(am.waitForAssignment(hri));
|
||||
|
||||
ServerName serverName = master.getAssignmentManager().
|
||||
getRegionStates().getRegionServerOfRegion(hri);
|
||||
TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
|
||||
} finally {
|
||||
TEST_UTIL.deleteTable(Bytes.toBytes(table));
|
||||
}
|
||||
}
|
||||
|
||||
static class MockLoadBalancer extends StochasticLoadBalancer {
|
||||
// For this region, if specified, always assign to nowhere
|
||||
static volatile String controledRegion = null;
|
||||
|
||||
|
@ -301,7 +402,7 @@ public class TestAssignmentManagerOnCluster {
|
|||
}
|
||||
}
|
||||
|
||||
public static class TestRegionObserver extends BaseRegionObserver {
|
||||
public static class MockRegionObserver extends BaseRegionObserver {
|
||||
// If enabled, fail all preClose calls
|
||||
static volatile boolean enabled = false;
|
||||
|
||||
|
|
Loading…
Reference in New Issue