HBASE-4287 If region opening fails, change region in transition into a FAILED_OPEN state so that it can be retried quickly.

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1170449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2011-09-14 06:45:34 +00:00
parent a902f9ab1c
commit 1df9e04262
6 changed files with 131 additions and 37 deletions

View File

@ -477,6 +477,8 @@ Release 0.91.0 - Unreleased
(Lars Hofhansl)
HBASE-4359 Show dead RegionServer names in the HMaster info page
(Harsh J)
HBASE-4287 If region opening fails, change region in transition into
a FAILED_OPEN state so that it can be retried quickly. (todd)
TASKS
HBASE-3559 Move report of split to master OFF the heartbeat channel

View File

@ -109,6 +109,7 @@ public abstract class EventHandler implements Runnable, Comparable<Runnable> {
RS_ZK_REGION_OPENED (4), // RS has finished opening a region
RS_ZK_REGION_SPLITTING (5), // RS has started a region split
RS_ZK_REGION_SPLIT (6), // RS split has completed.
RS_ZK_REGION_FAILED_OPEN (7), // RS failed to open a region
// Messages originating from Master to RS
M_RS_OPEN_REGION (20), // Master asking RS to open a region

View File

@ -118,6 +118,7 @@ public class ExecutorService {
// Master executor services
case RS_ZK_REGION_CLOSED:
case RS_ZK_REGION_FAILED_OPEN:
return ExecutorType.MASTER_CLOSE_REGION;
case RS_ZK_REGION_OPENED:

View File

@ -465,6 +465,7 @@ public class AssignmentManager extends ZooKeeperListener {
break;
case RS_ZK_REGION_CLOSED:
case RS_ZK_REGION_FAILED_OPEN:
// Region is closed, insert into RIT and handle it
addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data);
break;
@ -706,6 +707,21 @@ public class AssignmentManager extends ZooKeeperListener {
this, regionState.getRegion()));
break;
case RS_ZK_REGION_FAILED_OPEN:
if (regionState == null ||
(!regionState.isPendingOpen() && !regionState.isOpening())) {
LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName +
" from server " + data.getOrigin() + " but region was in " +
" the state " + regionState + " and not in PENDING_OPEN or OPENING");
return;
}
// Handle this the same as if it were opened and then closed.
regionState.update(RegionState.State.CLOSED,
data.getStamp(), data.getOrigin());
this.executorService.submit(new ClosedRegionHandler(master,
this, regionState.getRegion()));
break;
case RS_ZK_REGION_OPENING:
// Should see OPENING after we have asked it to OPEN or additional
// times after already being in state of OPENING

View File

@ -106,7 +106,11 @@ public class OpenRegionHandler extends EventHandler {
// Open region. After a successful open, failures in subsequent
// processing needs to do a close as part of cleanup.
region = openRegion();
if (region == null) return;
if (region == null) {
tryTransitionToFailedOpen(regionInfo);
return;
}
boolean failed = true;
if (tickleOpening("post_region_open")) {
if (updateMeta(region)) failed = false;
@ -114,10 +118,17 @@ public class OpenRegionHandler extends EventHandler {
if (failed || this.server.isStopped() ||
this.rsServices.isStopping()) {
cleanupFailedOpen(region);
tryTransitionToFailedOpen(regionInfo);
return;
}
if (!transitionToOpened(region)) {
// If we fail to transition to opened, it's because of one of two cases:
// (a) we lost our ZK lease
// OR (b) someone else opened the region before us
// In either case, we don't need to transition to FAILED_OPEN state.
// In case (a), the Master will process us as a dead server. In case
// (b) the region is already being handled elsewhere anyway.
cleanupFailedOpen(region);
return;
}
@ -137,7 +148,7 @@ public class OpenRegionHandler extends EventHandler {
* state meantime so master doesn't timeout our region-in-transition.
* Caller must cleanup region if this fails.
*/
private boolean updateMeta(final HRegion r) {
boolean updateMeta(final HRegion r) {
if (this.server.isStopped() || this.rsServices.isStopping()) {
return false;
}
@ -272,31 +283,32 @@ public class OpenRegionHandler extends EventHandler {
}
/**
* @return Instance of HRegion if successful open else null.
* @param Region we're working on.
* This is not guaranteed to succeed, we just do our best.
* @return Transition znode to CLOSED state.
*/
HRegion openRegion(Path tableDir) {
HRegion region = null;
private boolean tryTransitionToFailedOpen(final HRegionInfo hri) {
boolean result = false;
final String name = hri.getRegionNameAsString();
try {
// Instantiate the region. This also periodically tickles our zk OPENING
// state so master doesn't timeout this region in transition.
region = HRegion.openHRegion(tableDir, this.regionInfo, this.htd,
this.rsServices.getWAL(), this.server.getConfiguration(),
this.rsServices,
new CancelableProgressable() {
public boolean progress() {
// We may lose the znode ownership during the open. Currently its
// too hard interrupting ongoing region open. Just let it complete
// and check we still have the znode after region open.
return tickleOpening("open_region_progress");
}
});
} catch (IOException e) {
// We failed open. Let our znode expire in regions-in-transition and
// Master will assign elsewhere. Presumes nothing to close.
LOG.error("Failed open of region=" +
this.regionInfo.getRegionNameAsString(), e);
LOG.info("Opening of region " + hri + " failed, marking as FAILED_OPEN in ZK");
if (ZKAssign.transitionNode(
this.server.getZooKeeper(), hri,
this.server.getServerName(),
EventType.RS_ZK_REGION_OPENING,
EventType.RS_ZK_REGION_FAILED_OPEN,
this.version) == -1) {
LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
"It's likely that the master already timed out this open " +
"attempt, and thus another RS already has the region.");
} else {
result = true;
}
} catch (KeeperException e) {
LOG.error("Failed transitioning node " + name +
" from OPENING to FAILED_OPEN", e);
}
return region;
return result;
}
/**
@ -318,11 +330,12 @@ public class OpenRegionHandler extends EventHandler {
return tickleOpening("open_region_progress");
}
});
} catch (IOException e) {
// We failed open. Let our znode expire in regions-in-transition and
// Master will assign elsewhere. Presumes nothing to close.
} catch (Throwable t) {
// We failed open. Our caller will see the 'null' return value
// and transition the node back to FAILED_OPEN. If that fails,
// we rely on the Timeout Monitor in the master to reassign.
LOG.error("Failed open of region=" +
this.regionInfo.getRegionNameAsString(), e);
this.regionInfo.getRegionNameAsString(), t);
}
return region;
}

View File

@ -25,11 +25,16 @@ import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
@ -40,6 +45,9 @@ import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
/**
* Test of the {@link OpenRegionHandler}.
@ -47,6 +55,11 @@ import org.junit.Test;
public class TestOpenRegionHandler {
static final Log LOG = LogFactory.getLog(TestOpenRegionHandler.class);
private final static HBaseTestingUtility HTU = new HBaseTestingUtility();
private static final HTableDescriptor TEST_HTD =
new HTableDescriptor("TestOpenRegionHandler.java");
private static final HRegionInfo TEST_HRI =
new HRegionInfo(TEST_HTD.getName(), HConstants.EMPTY_END_ROW,
HConstants.EMPTY_END_ROW);
@BeforeClass public static void before() throws Exception {
HTU.startMiniZKCluster();
@ -69,10 +82,8 @@ public class TestOpenRegionHandler {
final Server server = new MockServer(HTU);
final RegionServerServices rss = new MockRegionServerServices();
HTableDescriptor htd = new HTableDescriptor("TestOpenRegionHandler.java");
final HRegionInfo hri =
new HRegionInfo(htd.getName(), HConstants.EMPTY_END_ROW,
HConstants.EMPTY_END_ROW);
HTableDescriptor htd = TEST_HTD;
final HRegionInfo hri = TEST_HRI;
HRegion region =
HRegion.createHRegion(hri, HBaseTestingUtility.getTestDir(), HTU
.getConfiguration(), htd);
@ -80,8 +91,7 @@ public class TestOpenRegionHandler {
OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) {
HRegion openRegion() {
// Open region first, then remove znode as though it'd been hijacked.
//HRegion region = super.openRegion();
HRegion region = super.openRegion(HBaseTestingUtility.getTestDir());
HRegion region = super.openRegion();
// Don't actually open region BUT remove the znode as though it'd
// been hijacked on us.
@ -103,4 +113,55 @@ public class TestOpenRegionHandler {
// post OPENING; again will expect it to come back w/o NPE or exception.
handler.process();
}
@Test
public void testFailedOpenRegion() throws Exception {
Server server = new MockServer(HTU);
RegionServerServices rsServices = Mockito.mock(RegionServerServices.class);
// Create it OFFLINE, which is what it expects
ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName());
// Create the handler
OpenRegionHandler handler =
new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) {
@Override
HRegion openRegion() {
// Fake failure of opening a region due to an IOE, which is caught
return null;
}
};
handler.process();
// Handler should have transitioned it to FAILED_OPEN
RegionTransitionData data =
ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
}
@Test
public void testFailedUpdateMeta() throws Exception {
Server server = new MockServer(HTU);
RegionServerServices rsServices = Mockito.mock(RegionServerServices.class);
// Create it OFFLINE, which is what it expects
ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName());
// Create the handler
OpenRegionHandler handler =
new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) {
@Override
boolean updateMeta(final HRegion r) {
// Fake failure of updating META
return false;
}
};
handler.process();
// Handler should have transitioned it to FAILED_OPEN
RegionTransitionData data =
ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
}
}