HBASE-2758 META region stuck in RS2ZK_REGION_OPENED state (Karthik Ranganathan via jgray)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@957099 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jonathan Gray 2010-06-23 03:31:19 +00:00
parent 97657cccd2
commit 77fcd6cb63
4 changed files with 153 additions and 17 deletions

View File

@ -410,6 +410,8 @@ Release 0.21.0 - Unreleased
HBASE-2769 Fix typo in warning message for HBaseConfiguration
HBASE-2768 Fix teardown order in TestFilter
HBASE-2763 Cross-port HADOOP-6833 IPC parameter leak bug
HBASE-2758 META region stuck in RS2ZK_REGION_OPENED state
(Karthik Ranganathan via jgray)
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable

View File

@ -162,6 +162,9 @@ public class HMaster extends Thread implements HMasterInterface,
private Map<String, Integer> fragmentation = null;
private final RegionServerOperationQueue regionServerOperationQueue;
// True if this is the master that started the cluster.
boolean isClusterStartup;
/**
* Constructor
* @param conf configuration
@ -169,6 +172,14 @@ public class HMaster extends Thread implements HMasterInterface,
*/
public HMaster(Configuration conf) throws IOException {
this.conf = conf;
// Figure out if this is a fresh cluster start. This is done by checking the
// number of RS ephemeral nodes. RS ephemeral nodes are created only after
// the primary master has written the address to ZK. So this has to be done
// before we race to write our address to zookeeper.
zooKeeperWrapper = ZooKeeperWrapper.createInstance(conf, HMaster.class.getName());
isClusterStartup = (zooKeeperWrapper.scanRSDirectory().size() == 0);
// Set filesystem to be that of this.rootdir else we get complaints about
// mismatched filesystems if hbase.rootdir is hdfs and fs.defaultFS is
// default localfs. Presumption is that rootdir is fully-qualified before
@ -206,8 +217,6 @@ public class HMaster extends Thread implements HMasterInterface,
// We'll succeed if we are only master or if we win the race when many
// masters. Otherwise we park here inside in writeAddressToZooKeeper.
// TODO: Bring up the UI to redirect to active Master.
zooKeeperWrapper =
ZooKeeperWrapper.createInstance(conf, HMaster.class.getName());
zooKeeperWrapper.registerListener(this);
this.zkMasterAddressWatcher =
new ZKMasterAddressWatcher(this.zooKeeperWrapper, this.shutdownRequested);
@ -219,10 +228,10 @@ public class HMaster extends Thread implements HMasterInterface,
serverManager = new ServerManager(this);
// Start the unassigned watcher - which will create the unassgined region
// Start the unassigned watcher - which will create the unassigned region
// in ZK. This is needed before RegionManager() constructor tries to assign
// the root region.
ZKUnassignedWatcher.start(this.conf, serverManager, address.toString());
ZKUnassignedWatcher.start(this.conf, this);
// start the "close region" executor service
HBaseEventType.RS2ZK_REGION_CLOSED.startMasterExecutorService(address.toString());
// start the "open region" executor service
@ -239,6 +248,22 @@ public class HMaster extends Thread implements HMasterInterface,
LOG.info("HMaster initialized on " + this.address.toString());
}
/**
 * Reports whether this master process regards itself as the one that
 * started the cluster.
 *
 * @return the current value of the {@code isClusterStartup} flag
 */
public boolean isClusterStartup() {
  return this.isClusterStartup;
}
/**
 * Clears the cluster-startup flag, so that {@link #isClusterStartup()}
 * returns false afterwards.
 */
public void resetClusterStartup() {
  this.isClusterStartup = false;
}
/**
 * @return the {@code address} field held by this master
 */
public HServerAddress getHServerAddress() {
  return this.address;
}
/*
* Get the rootdir. Make sure its wholesome and exists before returning.
* @param rd
@ -1156,6 +1181,9 @@ public class HMaster extends Thread implements HMasterInterface,
throw new Exception("Another Master is currently active");
}
// we are a failed over master, reset the fact that we started the
// cluster
resetClusterStartup();
// Verify the cluster to see if anything happened while we were away
joinCluster();
} catch (Exception e) {

View File

@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.master.handler.MasterCloseRegionHandler;
import org.apache.hadoop.hbase.master.handler.MasterOpenRegionHandler;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper.ZNodePathAndData;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.Watcher.Event.EventType;
@ -45,20 +46,37 @@ public class ZKUnassignedWatcher implements Watcher {
String serverName;
ServerManager serverManager;
public static void start(Configuration conf, ServerManager serverManager,
String serverName) throws IOException {
new ZKUnassignedWatcher(conf, serverManager, serverName);
/**
* Creates a {@link ZKUnassignedWatcher} for the given master and logs that
* the watcher has been started.
* @param conf configuration handed to the watcher's constructor
* @param master master handed to the watcher's constructor
* @throws IOException if the watcher's constructor throws it
*/
public static void start(Configuration conf, HMaster master)
throws IOException {
// The new instance is not kept by this method; the constructor performs
// the setup work.
new ZKUnassignedWatcher(conf, master);
LOG.debug("Started ZKUnassigned watcher");
}
public ZKUnassignedWatcher(Configuration conf, ServerManager serverManager,
String serverName) throws IOException {
this.serverName = serverName;
this.serverManager = serverManager;
zkWrapper =
ZooKeeperWrapper.getInstance(conf, HMaster.class.getName());
public ZKUnassignedWatcher(Configuration conf, HMaster master)
throws IOException {
this.serverName = master.getHServerAddress().toString();
this.serverManager = master.getServerManager();
zkWrapper = ZooKeeperWrapper.getInstance(conf, HMaster.class.getName());
String unassignedZNode = zkWrapper.getRegionInTransitionZNode();
// If the UNASSIGNED ZNode exists and this is a fresh cluster start, then
// delete it.
if(master.isClusterStartup() && zkWrapper.exists(unassignedZNode, false)) {
LOG.info("Cluster start, but found " + unassignedZNode + ", deleting it.");
try {
zkWrapper.deleteZNode(unassignedZNode, true);
} catch (KeeperException e) {
LOG.error("Could not delete znode " + unassignedZNode, e);
throw new IOException(e);
} catch (InterruptedException e) {
LOG.error("Could not delete znode " + unassignedZNode, e);
throw new IOException(e);
}
}
// If the UNASSIGNED ZNode does not exist, create it.
zkWrapper.createZNodeIfNotExists(zkWrapper.getRegionInTransitionZNode());
zkWrapper.createZNodeIfNotExists(unassignedZNode);
// TODO: get the outstanding changes in UNASSIGNED
// Set a watch on Zookeeper's UNASSIGNED node if it exists.
@ -66,7 +84,7 @@ public class ZKUnassignedWatcher implements Watcher {
}
/**
* This is the processing loop that gets triggerred from the ZooKeeperWrapper.
* This is the processing loop that gets triggered from the ZooKeeperWrapper.
* This zookeeper events process function does the following:
* - WATCHES the following events: NodeCreated, NodeDataChanged, NodeChildrenChanged
* - IGNORES the following events: None, NodeDeleted

View File

@ -0,0 +1,88 @@
/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.executor.RegionTransitionEventData;
import org.apache.hadoop.hbase.executor.HBaseEventHandler.HBaseEventType;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Starts a mini HBase cluster against a ZooKeeper ensemble that already holds
 * region-in-transition znodes for the ROOT and META regions, and checks that
 * the cluster still comes up far enough to create a table and have it become
 * available.
 */
public class TestRestartCluster {
  private static final Log LOG = LogFactory.getLog(TestRestartCluster.class);
  private static Configuration conf;
  private static HBaseTestingUtility utility;
  private static ZooKeeperWrapper zkWrapper;
  private static final byte[] TABLENAME = Bytes.toBytes("master_transitions");
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a")};

  /** Builds the shared configuration and testing utility once for the class. */
  @BeforeClass public static void beforeAllTests() throws Exception {
    conf = HBaseConfiguration.create();
    utility = new HBaseTestingUtility(conf);
  }

  /** Shuts down the mini cluster started by the test. */
  @AfterClass public static void afterAllTests() throws IOException {
    utility.shutdownMiniCluster();
  }

  /** Intentionally empty: there is no per-test setup. */
  @Before public void setup() throws IOException {
  }

  /**
   * Pre-populates the region-in-transition znode with RS2ZK_REGION_OPENED
   * entries for ROOT and META before any HBase process is running, then
   * starts a two-node mini cluster and waits for a newly created table to
   * become available.
   *
   * @throws Exception if ZooKeeper or cluster setup fails, if the event data
   *     cannot be serialized, or if the table does not become available
   */
  @Test (timeout=300000) public void testRestartClusterAfterKill() throws Exception {
    utility.startMiniZKCluster();
    zkWrapper = ZooKeeperWrapper.createInstance(conf, "cluster1");

    // Create the unassigned znode and put a "region opened" state under it
    // for both ROOT and META.
    String unassignedZNode = zkWrapper.getRegionInTransitionZNode();
    zkWrapper.createZNodeIfNotExists(unassignedZNode);

    HBaseEventType hbEventType = HBaseEventType.RS2ZK_REGION_OPENED;
    // Let a serialization failure fail the test immediately; continuing with
    // null data would exercise a different scenario than the one intended.
    byte[] data = Writables.getBytes(
        new RegionTransitionEventData(hbEventType, HMaster.MASTER));

    zkWrapper.createUnassignedRegion(HRegionInfo.ROOT_REGIONINFO.getEncodedName(), data);
    zkWrapper.createUnassignedRegion(HRegionInfo.FIRST_META_REGIONINFO.getEncodedName(), data);
    // Report the state that was actually written to the znodes.
    LOG.debug("Created UNASSIGNED zNode for ROOT and META regions in state " + hbEventType);

    // start the HB cluster
    LOG.info("Starting HBase cluster...");
    utility.startMiniCluster(2);

    utility.createTable(TABLENAME, FAMILIES);
    LOG.info("Created a table, waiting for table to be available...");
    utility.waitTableAvailable(TABLENAME, 60*1000);

    LOG.info("Master deleted unassgined region and started up successfully.");
  }
}