HBASE-11760 Tighten up region state transition

This commit is contained in:
Jimmy Xiang 2014-08-14 16:39:12 -07:00
parent 4cb76aa929
commit 3a82cf238b
12 changed files with 644 additions and 584 deletions

View File

@ -36,10 +36,10 @@ public class RegionState {
@InterfaceStability.Evolving
public enum State {
OFFLINE, // region is in an offline state
PENDING_OPEN, // sent rpc to server to open but has not begun
PENDING_OPEN, // same as OPENING, to be removed
OPENING, // server has begun to open but not yet done
OPEN, // server opened region and updated meta
PENDING_CLOSE, // sent rpc to server to close but has not begun
PENDING_CLOSE, // same as CLOSING, to be removed
CLOSING, // server has begun to close but not yet done
CLOSED, // server closed region and updated meta
SPLITTING, // server started split of a region
@ -210,30 +210,28 @@ public class RegionState {
return serverName;
}
/**
* PENDING_CLOSE (to be removed) is the same as CLOSING
*/
public boolean isClosing() {
return state == State.CLOSING;
return state == State.PENDING_CLOSE || state == State.CLOSING;
}
/**
 * @return true if this region's state is {@link State#CLOSED}
 */
public boolean isClosed() {
return state == State.CLOSED;
}
/**
 * @return true if this region's state is {@link State#PENDING_CLOSE}
 */
public boolean isPendingClose() {
return state == State.PENDING_CLOSE;
}
/**
* PENDING_OPEN (to be removed) is the same as OPENING
*/
public boolean isOpening() {
return state == State.OPENING;
return state == State.PENDING_OPEN || state == State.OPENING;
}
/**
 * @return true if this region's state is {@link State#OPEN}
 */
public boolean isOpened() {
return state == State.OPEN;
}
/**
 * @return true if this region's state is {@link State#PENDING_OPEN}
 */
public boolean isPendingOpen() {
return state == State.PENDING_OPEN;
}
/**
 * @return true if this region's state is {@link State#OFFLINE}
 */
public boolean isOffline() {
return state == State.OFFLINE;
}
@ -270,40 +268,6 @@ public class RegionState {
return state == State.MERGING_NEW;
}
/**
 * Checks whether this region is on the given server and is either opened
 * or merging.
 *
 * @param sn the server to check against
 * @return true only if {@link #isOnServer(ServerName)} holds for {@code sn}
 *   and the region is opened or merging
 */
public boolean isOpenOrMergingOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isOpened() || isMerging();
}
/**
 * Checks whether this region is on the given server and is either opened
 * or in the merging-new state.
 *
 * @param sn the server to check against
 * @return true only if {@link #isOnServer(ServerName)} holds for {@code sn}
 *   and the region is opened or merging-new
 */
public boolean isOpenOrMergingNewOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isOpened() || isMergingNew();
}
/**
 * Checks whether this region is on the given server and is either opened
 * or splitting.
 *
 * @param sn the server to check against
 * @return true only if {@link #isOnServer(ServerName)} holds for {@code sn}
 *   and the region is opened or splitting
 */
public boolean isOpenOrSplittingOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isOpened() || isSplitting();
}
/**
 * Checks whether this region is on the given server and is either opened
 * or in the splitting-new state.
 *
 * @param sn the server to check against
 * @return true only if {@link #isOnServer(ServerName)} holds for {@code sn}
 *   and the region is opened or splitting-new
 */
public boolean isOpenOrSplittingNewOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isOpened() || isSplittingNew();
}
/**
 * Checks whether this region is on the given server and
 * {@link #isPendingOpenOrOpening()} holds.
 *
 * @param sn the server to check against
 * @return true only if both conditions hold
 */
public boolean isPendingOpenOrOpeningOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isPendingOpenOrOpening();
}
/**
 * Failed open is also kind of pending open, so it is included here.
 *
 * @return true if the region is pending open, opening, or failed to open
 */
public boolean isPendingOpenOrOpening() {
  if (isPendingOpen()) {
    return true;
  }
  return isOpening() || isFailedOpen();
}
/**
 * Checks whether this region is on the given server and
 * {@link #isPendingCloseOrClosing()} holds.
 *
 * @param sn the server to check against
 * @return true only if both conditions hold
 */
public boolean isPendingCloseOrClosingOnServer(final ServerName sn) {
  if (!isOnServer(sn)) {
    return false;
  }
  return isPendingCloseOrClosing();
}
/**
 * Failed close is also kind of pending close, so it is included here.
 *
 * @return true if the region is pending close, closing, or failed to close
 */
public boolean isPendingCloseOrClosing() {
  if (isPendingClose()) {
    return true;
  }
  return isClosing() || isFailedClose();
}
/**
 * Compares this region state's server with the given one.
 *
 * @param sn the server to compare with
 * @return false if this state has no server name; otherwise the result of
 *   comparing the server name with {@code sn} via {@code equals}
 */
public boolean isOnServer(final ServerName sn) {
  if (serverName == null) {
    return false;
  }
  return serverName.equals(sn);
}

View File

@ -1,39 +0,0 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import java.io.IOException;
/**
* This exception is thrown when a region server is asked to open or close
* a region but it is already processing that region.
*/
// No serialVersionUID is declared in this class, hence the "serial" suppression.
@SuppressWarnings("serial")
@InterfaceAudience.Public
@InterfaceStability.Stable
public class RegionAlreadyInTransitionException extends IOException {
/**
* Creates the exception with the given detail message.
*
* @param s the detail message, passed through to {@link IOException}
*/
public RegionAlreadyInTransitionException(String s) {
super(s);
}
}

View File

@ -34,18 +34,16 @@ public class AssignCallable implements Callable<Object> {
private AssignmentManager assignmentManager;
private HRegionInfo hri;
private boolean newPlan;
public AssignCallable(
AssignmentManager assignmentManager, HRegionInfo hri, boolean newPlan) {
AssignmentManager assignmentManager, HRegionInfo hri) {
this.assignmentManager = assignmentManager;
this.newPlan = newPlan;
this.hri = hri;
}
@Override
public Object call() throws Exception {
assignmentManager.assign(hri, newPlan);
assignmentManager.assign(hri);
return null;
}
}

View File

@ -561,7 +561,8 @@ public class RegionStates {
// Offline state is also kind of pending open if the region is in
// transition. The region could be in failed_close state too if we have
// tried several times to open it while this region server is not reachable)
if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
rits.add(hri);
} else {
@ -724,6 +725,12 @@ public class RegionStates {
lastAssignments.put(encodedName, serverName);
}
/**
 * Checks whether the given region is among the regions recorded in
 * {@code serverHoldings} for the given server.
 *
 * @param hri the region to look for
 * @param serverName the server whose holdings are consulted
 * @return false if there is no holdings entry for {@code serverName};
 *   otherwise whether that entry contains {@code hri}
 */
synchronized boolean isRegionOnServer(
    final HRegionInfo hri, final ServerName serverName) {
  final Set<HRegionInfo> held = serverHoldings.get(serverName);
  if (held == null) {
    return false;
  }
  return held.contains(hri);
}
void splitRegion(HRegionInfo p,
HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
regionStateStore.splitRegion(p, a, b, sn);

View File

@ -724,9 +724,8 @@ public class ServerManager {
throws IOException {
AdminService.BlockingInterface admin = getRsAdmin(server);
if (admin == null) {
LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
throw new IOException("Attempting to send OPEN RPC to server " + server.toString() +
" failed because no RPC connection found to this server");
return RegionOpeningState.FAILED_OPENING;
}
OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
region, favoredNodes,
@ -753,9 +752,8 @@ public class ServerManager {
throws IOException {
AdminService.BlockingInterface admin = getRsAdmin(server);
if (admin == null) {
LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
throw new IOException("Attempting to send OPEN RPC to server " + server.toString() +
" failed because no RPC connection found to this server");
return null;
}
OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(regionOpenInfos,

View File

@ -238,7 +238,7 @@ public class ServerShutdownHandler extends EventHandler {
}
toAssignRegions.add(hri);
} else if (rit != null) {
if ((rit.isPendingCloseOrClosing() || rit.isOffline())
if ((rit.isClosing() || rit.isFailedClose() || rit.isOffline())
&& am.getTableStateManager().isTableState(hri.getTable(),
ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
am.getReplicasToClose().contains(hri)) {

View File

@ -2480,10 +2480,9 @@ public class HRegionServer extends HasThread implements
* @param abort True if we are aborting
* @return True if closed a region.
* @throws NotServingRegionException if the region is not online
* @throws RegionAlreadyInTransitionException if the region is already closing
*/
protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn)
throws NotServingRegionException, RegionAlreadyInTransitionException {
throws NotServingRegionException {
//Check for permissions to close.
HRegion actualRegion = this.getFromOnlineRegions(encodedName);
if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) {
@ -2518,15 +2517,8 @@ public class HRegionServer extends HasThread implements
}
} else if (Boolean.FALSE.equals(previous)) {
LOG.info("Received CLOSE for the region: " + encodedName +
" ,which we are already trying to CLOSE, but not completed yet");
// The master will retry till the region is closed. We need to do this since
// the region could fail to close somehow. If we mark the region closed in master
// while it is not, there could be data loss.
// If the region stuck in closing for a while, and master runs out of retries,
// master will move the region to failed_to_close. Later on, if the region
// is indeed closed, master can properly re-assign it.
throw new RegionAlreadyInTransitionException("The region " + encodedName +
" was already closing. New CLOSE request is ignored.");
", which we are already trying to CLOSE, but not completed yet");
return true;
}
if (actualRegion == null) {

View File

@ -51,7 +51,6 @@ import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
@ -1180,7 +1179,6 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
* @throws ServiceException
*/
@Override
@SuppressWarnings("deprecation")
@QosPriority(priority=HConstants.HIGH_QOS)
public OpenRegionResponse openRegion(final RpcController controller,
final OpenRegionRequest request) throws ServiceException {
@ -1236,35 +1234,15 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
final HRegionInfo region = HRegionInfo.convert(regionOpenInfo.getRegion());
HTableDescriptor htd;
try {
final HRegion onlineRegion = regionServer.getFromOnlineRegions(region.getEncodedName());
String encodedName = region.getEncodedName();
byte[] encodedNameBytes = region.getEncodedNameAsBytes();
final HRegion onlineRegion = regionServer.getFromOnlineRegions(encodedName);
if (onlineRegion != null) {
//Check if the region can actually be opened.
if (onlineRegion.getCoprocessorHost() != null) {
onlineRegion.getCoprocessorHost().preOpen();
}
// See HBASE-5094. Cross check with hbase:meta if still this RS is owning
// the region.
Pair<HRegionInfo, ServerName> p = MetaTableAccessor.getRegion(
regionServer.getShortCircuitConnection(), region.getRegionName());
if (regionServer.serverName.equals(p.getSecond())) {
Boolean closing = regionServer.regionsInTransitionInRS.get(region.getEncodedNameAsBytes());
// Map regionsInTransitionInRSOnly has an entry for a region only if the region
// is in transition on this RS, so here closing can be null. If not null, it can
// be true or false. True means the region is opening on this RS; while false
// means the region is closing. Only return ALREADY_OPENED if not closing (i.e.
// not in transition any more, or still in transition to open).
if (!Boolean.FALSE.equals(closing)
&& regionServer.getFromOnlineRegions(region.getEncodedName()) != null) {
LOG.warn("Attempted open of " + region.getEncodedName()
+ " but already online on this server");
builder.addOpeningState(RegionOpeningState.ALREADY_OPENED);
continue;
}
} else {
LOG.warn("The region " + region.getEncodedName() + " is online on this server"
+ " but hbase:meta does not have this server - continue opening.");
regionServer.removeFromOnlineRegions(onlineRegion, null);
}
// The region is already online. This should not happen any more.
String error = "Received OPEN for the region:"
+ region.getRegionNameAsString() + ", which is already online";
regionServer.abort(error);
throw new IOException(error);
}
LOG.info("Open " + region.getRegionNameAsString());
htd = htds.get(region.getTable());
@ -1274,18 +1252,23 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
}
final Boolean previous = regionServer.regionsInTransitionInRS.putIfAbsent(
region.getEncodedNameAsBytes(), Boolean.TRUE);
encodedNameBytes, Boolean.TRUE);
if (Boolean.FALSE.equals(previous)) {
// There is a close in progress. This should not happen any more.
throw new RegionAlreadyInTransitionException("Received OPEN for the region:"
+ region.getRegionNameAsString() + " , which we are already trying to CLOSE ");
if (regionServer.getFromOnlineRegions(encodedName) != null) {
// There is a close in progress. This should not happen any more.
String error = "Received OPEN for the region:"
+ region.getRegionNameAsString() + ", which we are already trying to CLOSE";
regionServer.abort(error);
throw new IOException(error);
}
regionServer.regionsInTransitionInRS.put(encodedNameBytes, Boolean.TRUE);
}
if (Boolean.TRUE.equals(previous)) {
// An open is in progress. This is supported, but let's log this.
LOG.info("Receiving OPEN for the region:" +
region.getRegionNameAsString() + " , which we are already trying to OPEN"
region.getRegionNameAsString() + ", which we are already trying to OPEN"
+ " - ignoring this new request for this region.");
}
@ -1293,7 +1276,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
// want to keep returning the stale moved record while we are opening/if we close again.
regionServer.removeFromMovedRegions(region.getEncodedName());
if (previous == null) {
if (previous == null || !previous.booleanValue()) {
// check if the region to be opened is marked in recovering state in ZK
if (ZKSplitLog.isRegionMarkedRecoveringInZK(regionServer.getZooKeeper(),
region.getEncodedName())) {

View File

@ -122,7 +122,7 @@ public class CloseRegionHandler extends EventHandler {
LOG.debug("Closed " + region.getRegionNameAsString());
} finally {
this.rsServices.getRegionsInTransitionInRS().
remove(this.regionInfo.getEncodedNameAsBytes());
remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);
}
}
}

View File

@ -191,12 +191,10 @@ public class TestAssignmentManagerOnCluster {
// Region is assigned now. Let's assign it again.
// Master should not abort, and region should be assigned.
RegionState oldState = regionStates.getRegionState(hri);
TEST_UTIL.getHBaseAdmin().assign(hri.getRegionName());
master.getAssignmentManager().waitForAssignment(hri);
RegionState newState = regionStates.getRegionState(hri);
assertTrue(newState.isOpened()
&& newState.getStamp() != oldState.getStamp());
assertTrue(newState.isOpened());
} finally {
TEST_UTIL.deleteTable(Bytes.toBytes(table));
}
@ -231,7 +229,7 @@ public class TestAssignmentManagerOnCluster {
// Use the first server as the destination server
ServerName destServer = onlineServers.iterator().next();
// Created faked dead server
// Create a fake dead server that is still online in master
deadServer = ServerName.valueOf(destServer.getHostname(),
destServer.getPort(), destServer.getStartcode() - 100L);
master.serverManager.recordNewServerWithLock(deadServer, ServerLoad.EMPTY_SERVERLOAD);
@ -415,14 +413,11 @@ public class TestAssignmentManagerOnCluster {
}
/**
* This test should not be flaky. If it is flaky, it means something
* wrong with AssignmentManager which should be reported and fixed
*
* This tests forcefully assign a region while it's closing and re-assigned.
* This tests assigning a region while it is closing.
*/
@Test (timeout=60000)
public void testForceAssignWhileClosing() throws Exception {
String table = "testForceAssignWhileClosing";
public void testAssignWhileClosing() throws Exception {
String table = "testAssignWhileClosing";
try {
HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table));
desc.addFamily(new HColumnDescriptor(FAMILY));
@ -664,14 +659,6 @@ public class TestAssignmentManagerOnCluster {
MyRegionObserver.postCloseEnabled.set(true);
am.unassign(hri);
// Now the region should be in pending_close or closing
// Unassign it again so that we can trigger already
// in transition exception. This test is to make sure this scenario
// is handled properly.
am.server.getConfiguration().setLong(
AssignmentManager.ALREADY_IN_TRANSITION_WAITTIME, 1000);
am.getRegionStates().updateRegionState(hri, RegionState.State.FAILED_CLOSE);
am.unassign(hri);
// Let region closing move ahead. The region should be closed
// properly and re-assigned automatically
@ -727,7 +714,7 @@ public class TestAssignmentManagerOnCluster {
am.unassign(hri);
RegionState state = am.getRegionStates().getRegionState(hri);
ServerName oldServerName = state.getServerName();
assertTrue(state.isPendingOpenOrOpening() && oldServerName != null);
assertTrue(state.isOpening() && oldServerName != null);
// Now the region is stuck in opening
// Let's forcefully re-assign it to trigger closing/opening
@ -816,6 +803,7 @@ public class TestAssignmentManagerOnCluster {
// You can't unassign a dead region before SSH either
am.unassign(hri);
state = regionStates.getRegionState(hri);
assertTrue(state.isFailedClose());
// Enable SSH so that log can be split

View File

@ -26,7 +26,6 @@ import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.HTable;
@ -247,29 +246,6 @@ public class TestRegionServerNoMaster {
checkRegionIsOpened(HTU, getRS(), hri);
}
@Test
public void testOpenClosingRegion() throws Exception {
Assert.assertTrue(getRS().getRegion(regionName).isAvailable());
try {
// we re-opened meta so some of its data is lost
ServerName sn = getRS().getServerName();
MetaTableAccessor.updateRegionLocation(getRS().getShortCircuitConnection(),
hri, sn, getRS().getRegion(regionName).getOpenSeqNum());
// fake region to be closing now, need to clear state afterwards
getRS().regionsInTransitionInRS.put(hri.getEncodedNameAsBytes(), Boolean.FALSE);
AdminProtos.OpenRegionRequest orr =
RequestConverter.buildOpenRegionRequest(sn, hri, null, null);
getRS().rpcServices.openRegion(null, orr);
Assert.fail("The closing region should not be opened");
} catch (ServiceException se) {
Assert.assertTrue("The region should be already in transition",
se.getCause() instanceof RegionAlreadyInTransitionException);
} finally {
getRS().regionsInTransitionInRS.remove(hri.getEncodedNameAsBytes());
}
}
@Test(timeout = 60000)
public void testMultipleCloseFromMaster() throws Exception {
for (int i = 0; i < 10; i++) {
@ -277,11 +253,10 @@ public class TestRegionServerNoMaster {
RequestConverter.buildCloseRegionRequest(getRS().getServerName(), regionName, null);
try {
AdminProtos.CloseRegionResponse responseClose = getRS().rpcServices.closeRegion(null, crr);
Assert.assertEquals("The first request should succeeds", 0, i);
Assert.assertTrue("request " + i + " failed",
responseClose.getClosed() || responseClose.hasClosed());
} catch (ServiceException se) {
Assert.assertTrue("The next queries should throw an exception.", i > 0);
Assert.assertTrue("The next queries may throw an exception.", i > 0);
}
}