HBASE-4287 If region opening fails, change region in transition into a FAILED_OPEN state so that it can be retried quickly.
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1170449 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a902f9ab1c
commit
1df9e04262
|
@ -477,6 +477,8 @@ Release 0.91.0 - Unreleased
|
|||
(Lars Hofhansl)
|
||||
HBASE-4359 Show dead RegionServer names in the HMaster info page
|
||||
(Harsh J)
|
||||
HBASE-4287 If region opening fails, change region in transition into
|
||||
a FAILED_OPEN state so that it can be retried quickly. (todd)
|
||||
|
||||
TASKS
|
||||
HBASE-3559 Move report of split to master OFF the heartbeat channel
|
||||
|
|
|
@ -109,6 +109,7 @@ public abstract class EventHandler implements Runnable, Comparable<Runnable> {
|
|||
RS_ZK_REGION_OPENED (4), // RS has finished opening a region
|
||||
RS_ZK_REGION_SPLITTING (5), // RS has started a region split
|
||||
RS_ZK_REGION_SPLIT (6), // RS split has completed.
|
||||
RS_ZK_REGION_FAILED_OPEN (7), // RS failed to open a region
|
||||
|
||||
// Messages originating from Master to RS
|
||||
M_RS_OPEN_REGION (20), // Master asking RS to open a region
|
||||
|
|
|
@ -118,6 +118,7 @@ public class ExecutorService {
|
|||
// Master executor services
|
||||
|
||||
case RS_ZK_REGION_CLOSED:
|
||||
case RS_ZK_REGION_FAILED_OPEN:
|
||||
return ExecutorType.MASTER_CLOSE_REGION;
|
||||
|
||||
case RS_ZK_REGION_OPENED:
|
||||
|
|
|
@ -465,6 +465,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
break;
|
||||
|
||||
case RS_ZK_REGION_CLOSED:
|
||||
case RS_ZK_REGION_FAILED_OPEN:
|
||||
// Region is closed, insert into RIT and handle it
|
||||
addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data);
|
||||
break;
|
||||
|
@ -705,6 +706,21 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
this.executorService.submit(new ClosedRegionHandler(master,
|
||||
this, regionState.getRegion()));
|
||||
break;
|
||||
|
||||
case RS_ZK_REGION_FAILED_OPEN:
|
||||
if (regionState == null ||
|
||||
(!regionState.isPendingOpen() && !regionState.isOpening())) {
|
||||
LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName +
|
||||
" from server " + data.getOrigin() + " but region was in " +
|
||||
" the state " + regionState + " and not in PENDING_OPEN or OPENING");
|
||||
return;
|
||||
}
|
||||
// Handle this the same as if it were opened and then closed.
|
||||
regionState.update(RegionState.State.CLOSED,
|
||||
data.getStamp(), data.getOrigin());
|
||||
this.executorService.submit(new ClosedRegionHandler(master,
|
||||
this, regionState.getRegion()));
|
||||
break;
|
||||
|
||||
case RS_ZK_REGION_OPENING:
|
||||
// Should see OPENING after we have asked it to OPEN or additional
|
||||
|
|
|
@ -106,7 +106,11 @@ public class OpenRegionHandler extends EventHandler {
|
|||
// Open region. After a successful open, failures in subsequent
|
||||
// processing needs to do a close as part of cleanup.
|
||||
region = openRegion();
|
||||
if (region == null) return;
|
||||
if (region == null) {
|
||||
tryTransitionToFailedOpen(regionInfo);
|
||||
return;
|
||||
}
|
||||
|
||||
boolean failed = true;
|
||||
if (tickleOpening("post_region_open")) {
|
||||
if (updateMeta(region)) failed = false;
|
||||
|
@ -114,10 +118,17 @@ public class OpenRegionHandler extends EventHandler {
|
|||
if (failed || this.server.isStopped() ||
|
||||
this.rsServices.isStopping()) {
|
||||
cleanupFailedOpen(region);
|
||||
tryTransitionToFailedOpen(regionInfo);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!transitionToOpened(region)) {
|
||||
// If we fail to transition to opened, it's because of one of two cases:
|
||||
// (a) we lost our ZK lease
|
||||
// OR (b) someone else opened the region before us
|
||||
// In either case, we don't need to transition to FAILED_OPEN state.
|
||||
// In case (a), the Master will process us as a dead server. In case
|
||||
// (b) the region is already being handled elsewhere anyway.
|
||||
cleanupFailedOpen(region);
|
||||
return;
|
||||
}
|
||||
|
@ -137,7 +148,7 @@ public class OpenRegionHandler extends EventHandler {
|
|||
* state meantime so master doesn't timeout our region-in-transition.
|
||||
* Caller must cleanup region if this fails.
|
||||
*/
|
||||
private boolean updateMeta(final HRegion r) {
|
||||
boolean updateMeta(final HRegion r) {
|
||||
if (this.server.isStopped() || this.rsServices.isStopping()) {
|
||||
return false;
|
||||
}
|
||||
|
@ -270,33 +281,34 @@ public class OpenRegionHandler extends EventHandler {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return Instance of HRegion if successful open else null.
|
||||
* @param Region we're working on.
|
||||
* This is not guaranteed to succeed, we just do our best.
|
||||
* @return Transition znode to CLOSED state.
|
||||
*/
|
||||
HRegion openRegion(Path tableDir) {
|
||||
HRegion region = null;
|
||||
private boolean tryTransitionToFailedOpen(final HRegionInfo hri) {
|
||||
boolean result = false;
|
||||
final String name = hri.getRegionNameAsString();
|
||||
try {
|
||||
// Instantiate the region. This also periodically tickles our zk OPENING
|
||||
// state so master doesn't timeout this region in transition.
|
||||
region = HRegion.openHRegion(tableDir, this.regionInfo, this.htd,
|
||||
this.rsServices.getWAL(), this.server.getConfiguration(),
|
||||
this.rsServices,
|
||||
new CancelableProgressable() {
|
||||
public boolean progress() {
|
||||
// We may lose the znode ownership during the open. Currently its
|
||||
// too hard interrupting ongoing region open. Just let it complete
|
||||
// and check we still have the znode after region open.
|
||||
return tickleOpening("open_region_progress");
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
// We failed open. Let our znode expire in regions-in-transition and
|
||||
// Master will assign elsewhere. Presumes nothing to close.
|
||||
LOG.error("Failed open of region=" +
|
||||
this.regionInfo.getRegionNameAsString(), e);
|
||||
LOG.info("Opening of region " + hri + " failed, marking as FAILED_OPEN in ZK");
|
||||
if (ZKAssign.transitionNode(
|
||||
this.server.getZooKeeper(), hri,
|
||||
this.server.getServerName(),
|
||||
EventType.RS_ZK_REGION_OPENING,
|
||||
EventType.RS_ZK_REGION_FAILED_OPEN,
|
||||
this.version) == -1) {
|
||||
LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
|
||||
"It's likely that the master already timed out this open " +
|
||||
"attempt, and thus another RS already has the region.");
|
||||
} else {
|
||||
result = true;
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
LOG.error("Failed transitioning node " + name +
|
||||
" from OPENING to FAILED_OPEN", e);
|
||||
}
|
||||
return region;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -318,11 +330,12 @@ public class OpenRegionHandler extends EventHandler {
|
|||
return tickleOpening("open_region_progress");
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
// We failed open. Let our znode expire in regions-in-transition and
|
||||
// Master will assign elsewhere. Presumes nothing to close.
|
||||
} catch (Throwable t) {
|
||||
// We failed open. Our caller will see the 'null' return value
|
||||
// and transition the node back to FAILED_OPEN. If that fails,
|
||||
// we rely on the Timeout Monitor in the master to reassign.
|
||||
LOG.error("Failed open of region=" +
|
||||
this.regionInfo.getRegionNameAsString(), e);
|
||||
this.regionInfo.getRegionNameAsString(), t);
|
||||
}
|
||||
return region;
|
||||
}
|
||||
|
@ -390,4 +403,4 @@ public class OpenRegionHandler extends EventHandler {
|
|||
private boolean isGoodVersion() {
|
||||
return this.version != -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,11 +25,16 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
import org.apache.hadoop.hbase.executor.RegionTransitionData;
|
||||
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
|
@ -40,6 +45,9 @@ import org.apache.zookeeper.KeeperException.NodeExistsException;
|
|||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.invocation.InvocationOnMock;
|
||||
import org.mockito.stubbing.Answer;
|
||||
|
||||
/**
|
||||
* Test of the {@link OpenRegionHandler}.
|
||||
|
@ -47,6 +55,11 @@ import org.junit.Test;
|
|||
public class TestOpenRegionHandler {
|
||||
static final Log LOG = LogFactory.getLog(TestOpenRegionHandler.class);
|
||||
private final static HBaseTestingUtility HTU = new HBaseTestingUtility();
|
||||
private static final HTableDescriptor TEST_HTD =
|
||||
new HTableDescriptor("TestOpenRegionHandler.java");
|
||||
private static final HRegionInfo TEST_HRI =
|
||||
new HRegionInfo(TEST_HTD.getName(), HConstants.EMPTY_END_ROW,
|
||||
HConstants.EMPTY_END_ROW);
|
||||
|
||||
@BeforeClass public static void before() throws Exception {
|
||||
HTU.startMiniZKCluster();
|
||||
|
@ -69,10 +82,8 @@ public class TestOpenRegionHandler {
|
|||
final Server server = new MockServer(HTU);
|
||||
final RegionServerServices rss = new MockRegionServerServices();
|
||||
|
||||
HTableDescriptor htd = new HTableDescriptor("TestOpenRegionHandler.java");
|
||||
final HRegionInfo hri =
|
||||
new HRegionInfo(htd.getName(), HConstants.EMPTY_END_ROW,
|
||||
HConstants.EMPTY_END_ROW);
|
||||
HTableDescriptor htd = TEST_HTD;
|
||||
final HRegionInfo hri = TEST_HRI;
|
||||
HRegion region =
|
||||
HRegion.createHRegion(hri, HBaseTestingUtility.getTestDir(), HTU
|
||||
.getConfiguration(), htd);
|
||||
|
@ -80,9 +91,8 @@ public class TestOpenRegionHandler {
|
|||
OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) {
|
||||
HRegion openRegion() {
|
||||
// Open region first, then remove znode as though it'd been hijacked.
|
||||
//HRegion region = super.openRegion();
|
||||
HRegion region = super.openRegion(HBaseTestingUtility.getTestDir());
|
||||
|
||||
HRegion region = super.openRegion();
|
||||
|
||||
// Don't actually open region BUT remove the znode as though it'd
|
||||
// been hijacked on us.
|
||||
ZooKeeperWatcher zkw = this.server.getZooKeeper();
|
||||
|
@ -103,4 +113,55 @@ public class TestOpenRegionHandler {
|
|||
// post OPENING; again will expect it to come back w/o NPE or exception.
|
||||
handler.process();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailedOpenRegion() throws Exception {
|
||||
Server server = new MockServer(HTU);
|
||||
RegionServerServices rsServices = Mockito.mock(RegionServerServices.class);
|
||||
|
||||
// Create it OFFLINE, which is what it expects
|
||||
ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName());
|
||||
|
||||
// Create the handler
|
||||
OpenRegionHandler handler =
|
||||
new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) {
|
||||
@Override
|
||||
HRegion openRegion() {
|
||||
// Fake failure of opening a region due to an IOE, which is caught
|
||||
return null;
|
||||
}
|
||||
};
|
||||
handler.process();
|
||||
|
||||
// Handler should have transitioned it to FAILED_OPEN
|
||||
RegionTransitionData data =
|
||||
ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
|
||||
assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFailedUpdateMeta() throws Exception {
|
||||
Server server = new MockServer(HTU);
|
||||
RegionServerServices rsServices = Mockito.mock(RegionServerServices.class);
|
||||
|
||||
// Create it OFFLINE, which is what it expects
|
||||
ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName());
|
||||
|
||||
// Create the handler
|
||||
OpenRegionHandler handler =
|
||||
new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) {
|
||||
@Override
|
||||
boolean updateMeta(final HRegion r) {
|
||||
// Fake failure of updating META
|
||||
return false;
|
||||
}
|
||||
};
|
||||
handler.process();
|
||||
|
||||
// Handler should have transitioned it to FAILED_OPEN
|
||||
RegionTransitionData data =
|
||||
ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
|
||||
assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue