HBASE-4287 If region opening fails, change region in transition into a FAILED_OPEN state so that it can be retried quickly.

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1170449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2011-09-14 06:45:34 +00:00
parent a902f9ab1c
commit 1df9e04262
6 changed files with 131 additions and 37 deletions

View File

@ -477,6 +477,8 @@ Release 0.91.0 - Unreleased
(Lars Hofhansl) (Lars Hofhansl)
HBASE-4359 Show dead RegionServer names in the HMaster info page HBASE-4359 Show dead RegionServer names in the HMaster info page
(Harsh J) (Harsh J)
HBASE-4287 If region opening fails, change region in transition into
a FAILED_OPEN state so that it can be retried quickly. (todd)
TASKS TASKS
HBASE-3559 Move report of split to master OFF the heartbeat channel HBASE-3559 Move report of split to master OFF the heartbeat channel

View File

@ -109,6 +109,7 @@ public abstract class EventHandler implements Runnable, Comparable<Runnable> {
RS_ZK_REGION_OPENED (4), // RS has finished opening a region RS_ZK_REGION_OPENED (4), // RS has finished opening a region
RS_ZK_REGION_SPLITTING (5), // RS has started a region split RS_ZK_REGION_SPLITTING (5), // RS has started a region split
RS_ZK_REGION_SPLIT (6), // RS split has completed. RS_ZK_REGION_SPLIT (6), // RS split has completed.
RS_ZK_REGION_FAILED_OPEN (7), // RS failed to open a region
// Messages originating from Master to RS // Messages originating from Master to RS
M_RS_OPEN_REGION (20), // Master asking RS to open a region M_RS_OPEN_REGION (20), // Master asking RS to open a region

View File

@ -118,6 +118,7 @@ public class ExecutorService {
// Master executor services // Master executor services
case RS_ZK_REGION_CLOSED: case RS_ZK_REGION_CLOSED:
case RS_ZK_REGION_FAILED_OPEN:
return ExecutorType.MASTER_CLOSE_REGION; return ExecutorType.MASTER_CLOSE_REGION;
case RS_ZK_REGION_OPENED: case RS_ZK_REGION_OPENED:

View File

@ -465,6 +465,7 @@ public class AssignmentManager extends ZooKeeperListener {
break; break;
case RS_ZK_REGION_CLOSED: case RS_ZK_REGION_CLOSED:
case RS_ZK_REGION_FAILED_OPEN:
// Region is closed, insert into RIT and handle it // Region is closed, insert into RIT and handle it
addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data); addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data);
break; break;
@ -706,6 +707,21 @@ public class AssignmentManager extends ZooKeeperListener {
this, regionState.getRegion())); this, regionState.getRegion()));
break; break;
case RS_ZK_REGION_FAILED_OPEN:
if (regionState == null ||
(!regionState.isPendingOpen() && !regionState.isOpening())) {
LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName +
" from server " + data.getOrigin() + " but region was in " +
" the state " + regionState + " and not in PENDING_OPEN or OPENING");
return;
}
// Handle this the same as if it were opened and then closed.
regionState.update(RegionState.State.CLOSED,
data.getStamp(), data.getOrigin());
this.executorService.submit(new ClosedRegionHandler(master,
this, regionState.getRegion()));
break;
case RS_ZK_REGION_OPENING: case RS_ZK_REGION_OPENING:
// Should see OPENING after we have asked it to OPEN or additional // Should see OPENING after we have asked it to OPEN or additional
// times after already being in state of OPENING // times after already being in state of OPENING

View File

@ -106,7 +106,11 @@ public class OpenRegionHandler extends EventHandler {
// Open region. After a successful open, failures in subsequent // Open region. After a successful open, failures in subsequent
// processing needs to do a close as part of cleanup. // processing needs to do a close as part of cleanup.
region = openRegion(); region = openRegion();
if (region == null) return; if (region == null) {
tryTransitionToFailedOpen(regionInfo);
return;
}
boolean failed = true; boolean failed = true;
if (tickleOpening("post_region_open")) { if (tickleOpening("post_region_open")) {
if (updateMeta(region)) failed = false; if (updateMeta(region)) failed = false;
@ -114,10 +118,17 @@ public class OpenRegionHandler extends EventHandler {
if (failed || this.server.isStopped() || if (failed || this.server.isStopped() ||
this.rsServices.isStopping()) { this.rsServices.isStopping()) {
cleanupFailedOpen(region); cleanupFailedOpen(region);
tryTransitionToFailedOpen(regionInfo);
return; return;
} }
if (!transitionToOpened(region)) { if (!transitionToOpened(region)) {
// If we fail to transition to opened, it's because of one of two cases:
// (a) we lost our ZK lease
// OR (b) someone else opened the region before us
// In either case, we don't need to transition to FAILED_OPEN state.
// In case (a), the Master will process us as a dead server. In case
// (b) the region is already being handled elsewhere anyway.
cleanupFailedOpen(region); cleanupFailedOpen(region);
return; return;
} }
@ -137,7 +148,7 @@ public class OpenRegionHandler extends EventHandler {
* state meantime so master doesn't timeout our region-in-transition. * state meantime so master doesn't timeout our region-in-transition.
* Caller must cleanup region if this fails. * Caller must cleanup region if this fails.
*/ */
private boolean updateMeta(final HRegion r) { boolean updateMeta(final HRegion r) {
if (this.server.isStopped() || this.rsServices.isStopping()) { if (this.server.isStopped() || this.rsServices.isStopping()) {
return false; return false;
} }
@ -272,31 +283,32 @@ public class OpenRegionHandler extends EventHandler {
} }
/** /**
* @return Instance of HRegion if successful open else null. * @param hri Region we're working on.
* This is not guaranteed to succeed, we just do our best.
* @return true if the znode was transitioned from OPENING to FAILED_OPEN, false otherwise.
*/ */
HRegion openRegion(Path tableDir) { private boolean tryTransitionToFailedOpen(final HRegionInfo hri) {
HRegion region = null; boolean result = false;
final String name = hri.getRegionNameAsString();
try { try {
// Instantiate the region. This also periodically tickles our zk OPENING LOG.info("Opening of region " + hri + " failed, marking as FAILED_OPEN in ZK");
// state so master doesn't timeout this region in transition. if (ZKAssign.transitionNode(
region = HRegion.openHRegion(tableDir, this.regionInfo, this.htd, this.server.getZooKeeper(), hri,
this.rsServices.getWAL(), this.server.getConfiguration(), this.server.getServerName(),
this.rsServices, EventType.RS_ZK_REGION_OPENING,
new CancelableProgressable() { EventType.RS_ZK_REGION_FAILED_OPEN,
public boolean progress() { this.version) == -1) {
// We may lose the znode ownership during the open. Currently its LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
// too hard interrupting ongoing region open. Just let it complete "It's likely that the master already timed out this open " +
// and check we still have the znode after region open. "attempt, and thus another RS already has the region.");
return tickleOpening("open_region_progress"); } else {
result = true;
} }
}); } catch (KeeperException e) {
} catch (IOException e) { LOG.error("Failed transitioning node " + name +
// We failed open. Let our znode expire in regions-in-transition and " from OPENING to FAILED_OPEN", e);
// Master will assign elsewhere. Presumes nothing to close.
LOG.error("Failed open of region=" +
this.regionInfo.getRegionNameAsString(), e);
} }
return region; return result;
} }
/** /**
@ -318,11 +330,12 @@ public class OpenRegionHandler extends EventHandler {
return tickleOpening("open_region_progress"); return tickleOpening("open_region_progress");
} }
}); });
} catch (IOException e) { } catch (Throwable t) {
// We failed open. Let our znode expire in regions-in-transition and // We failed open. Our caller will see the 'null' return value
// Master will assign elsewhere. Presumes nothing to close. // and transition the node from OPENING to FAILED_OPEN. If that fails,
// we rely on the Timeout Monitor in the master to reassign.
LOG.error("Failed open of region=" + LOG.error("Failed open of region=" +
this.regionInfo.getRegionNameAsString(), e); this.regionInfo.getRegionNameAsString(), t);
} }
return region; return region;
} }

View File

@ -25,11 +25,16 @@ import java.io.IOException;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.hadoop.hbase.zookeeper.ZKAssign;
@ -40,6 +45,9 @@ import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
/** /**
* Test of the {@link OpenRegionHandler}. * Test of the {@link OpenRegionHandler}.
@ -47,6 +55,11 @@ import org.junit.Test;
public class TestOpenRegionHandler { public class TestOpenRegionHandler {
static final Log LOG = LogFactory.getLog(TestOpenRegionHandler.class); static final Log LOG = LogFactory.getLog(TestOpenRegionHandler.class);
private final static HBaseTestingUtility HTU = new HBaseTestingUtility(); private final static HBaseTestingUtility HTU = new HBaseTestingUtility();
private static final HTableDescriptor TEST_HTD =
new HTableDescriptor("TestOpenRegionHandler.java");
private static final HRegionInfo TEST_HRI =
new HRegionInfo(TEST_HTD.getName(), HConstants.EMPTY_END_ROW,
HConstants.EMPTY_END_ROW);
@BeforeClass public static void before() throws Exception { @BeforeClass public static void before() throws Exception {
HTU.startMiniZKCluster(); HTU.startMiniZKCluster();
@ -69,10 +82,8 @@ public class TestOpenRegionHandler {
final Server server = new MockServer(HTU); final Server server = new MockServer(HTU);
final RegionServerServices rss = new MockRegionServerServices(); final RegionServerServices rss = new MockRegionServerServices();
HTableDescriptor htd = new HTableDescriptor("TestOpenRegionHandler.java"); HTableDescriptor htd = TEST_HTD;
final HRegionInfo hri = final HRegionInfo hri = TEST_HRI;
new HRegionInfo(htd.getName(), HConstants.EMPTY_END_ROW,
HConstants.EMPTY_END_ROW);
HRegion region = HRegion region =
HRegion.createHRegion(hri, HBaseTestingUtility.getTestDir(), HTU HRegion.createHRegion(hri, HBaseTestingUtility.getTestDir(), HTU
.getConfiguration(), htd); .getConfiguration(), htd);
@ -80,8 +91,7 @@ public class TestOpenRegionHandler {
OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) { OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) {
HRegion openRegion() { HRegion openRegion() {
// Open region first, then remove znode as though it'd been hijacked. // Open region first, then remove znode as though it'd been hijacked.
//HRegion region = super.openRegion(); HRegion region = super.openRegion();
HRegion region = super.openRegion(HBaseTestingUtility.getTestDir());
// Don't actually open region BUT remove the znode as though it'd // Don't actually open region BUT remove the znode as though it'd
// been hijacked on us. // been hijacked on us.
@ -103,4 +113,55 @@ public class TestOpenRegionHandler {
// post OPENING; again will expect it to come back w/o NPE or exception. // post OPENING; again will expect it to come back w/o NPE or exception.
handler.process(); handler.process();
} }
/**
 * Checks that a handler whose {@code openRegion()} returns {@code null}
 * leaves the region's znode reporting RS_ZK_REGION_FAILED_OPEN.
 */
@Test
public void testFailedOpenRegion() throws Exception {
  final Server mockServer = new MockServer(HTU);
  final RegionServerServices mockServices =
      Mockito.mock(RegionServerServices.class);

  // The handler expects to find the znode in the OFFLINE state.
  ZKAssign.createNodeOffline(
      mockServer.getZooKeeper(), TEST_HRI, mockServer.getServerName());

  // Handler whose region open always reports failure.
  final OpenRegionHandler failingHandler =
      new OpenRegionHandler(mockServer, mockServices, TEST_HRI, TEST_HTD) {
        @Override
        HRegion openRegion() {
          // Simulates an open that hit an IOException, which the real
          // method catches before returning null.
          return null;
        }
      };
  failingHandler.process();

  // After processing, the znode must carry the FAILED_OPEN event type.
  final RegionTransitionData znodeData =
      ZKAssign.getData(mockServer.getZooKeeper(), TEST_HRI.getEncodedName());
  assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, znodeData.getEventType());
}
/**
 * Checks that a handler whose {@code updateMeta} reports failure leaves
 * the region's znode reporting RS_ZK_REGION_FAILED_OPEN.
 */
@Test
public void testFailedUpdateMeta() throws Exception {
  final Server mockServer = new MockServer(HTU);
  final RegionServerServices mockServices =
      Mockito.mock(RegionServerServices.class);

  // The handler expects to find the znode in the OFFLINE state.
  ZKAssign.createNodeOffline(
      mockServer.getZooKeeper(), TEST_HRI, mockServer.getServerName());

  // Handler whose META update always reports failure.
  final OpenRegionHandler metaFailingHandler =
      new OpenRegionHandler(mockServer, mockServices, TEST_HRI, TEST_HTD) {
        @Override
        boolean updateMeta(final HRegion ignored) {
          // Pretend the META update could not be completed.
          return false;
        }
      };
  metaFailingHandler.process();

  // After processing, the znode must carry the FAILED_OPEN event type.
  final RegionTransitionData znodeData =
      ZKAssign.getData(mockServer.getZooKeeper(), TEST_HRI.getEncodedName());
  assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, znodeData.getEventType());
}
} }