HBASE-1921 When the Master's session times out and there's only one, cluster is wedged

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@830820 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jean-Daniel Cryans 2009-10-29 01:24:03 +00:00
parent 67c1fc3436
commit c7df216c11
5 changed files with 84 additions and 49 deletions

View File

@ -143,6 +143,7 @@ Release 0.21.0 - Unreleased
HBASE-1756 Refactor HLog (changing package first)
HBASE-1926 Remove unused xmlenc jar from trunk
HBASE-1936 HLog group commit
HBASE-1921 When the Master's session times out and there's only one, cluster is wedged
OPTIMIZATIONS
HBASE-410 [testing] Speed up the test suite

View File

@ -126,7 +126,7 @@ public class HMaster extends Thread implements HConstants, HMasterInterface,
// Metrics is set when we call run.
private final MasterMetrics metrics;
// Our zk client.
private final ZooKeeperWrapper zooKeeperWrapper;
private ZooKeeperWrapper zooKeeperWrapper;
// Watcher for master address and for cluster shutdown.
private final ZKMasterAddressWatcher zkMasterAddressWatcher;
// A Sleeper that sleeps for threadWakeFrequency; sleep if nothing todo.
@ -187,7 +187,7 @@ public class HMaster extends Thread implements HConstants, HMasterInterface,
this.zooKeeperWrapper = new ZooKeeperWrapper(conf, this);
this.zkMasterAddressWatcher =
new ZKMasterAddressWatcher(this.zooKeeperWrapper, this.shutdownRequested);
this.zkMasterAddressWatcher.writeAddressToZooKeeper(this.address);
this.zkMasterAddressWatcher.writeAddressToZooKeeper(this.address, true);
serverManager = new ServerManager(this);
regionManager = new RegionManager(this);
@ -1131,10 +1131,28 @@ public class HMaster extends Thread implements HConstants, HMasterInterface,
(event.getType().equals(EventType.NodeDeleted) &&
event.getPath().equals(this.zooKeeperWrapper.getMasterElectionZNode())) &&
!shutdownRequested.get()) {
LOG.error("Master lost its znode, killing itself now");
LOG.info("Master lost its znode, trying to get a new one");
// Can we still be the master? If not, goodbye
zooKeeperWrapper.close();
try {
zooKeeperWrapper = new ZooKeeperWrapper(conf, this);
this.zkMasterAddressWatcher.setZookeeper(zooKeeperWrapper);
if(!this.zkMasterAddressWatcher.
writeAddressToZooKeeper(this.address,false)) {
throw new Exception("Another Master is currently active");
}
// Verify the cluster to see if anything happened while we were away
joinCluster();
} catch (Exception e) {
LOG.error("Killing master because of", e);
System.exit(1);
}
}
}
private static void printUsageAndExit() {
System.err.println("Usage: Master start|stop");

View File

@ -125,8 +125,6 @@ public class RegionManager implements HConstants {
regionsToFlush = Collections.synchronizedSortedMap(
new TreeMap<byte[],Pair<HRegionInfo,HServerAddress>>
(Bytes.BYTES_COMPARATOR));
private final ZooKeeperWrapper zooKeeperWrapper;
private final int zooKeeperNumRetries;
private final int zooKeeperPause;
@ -143,7 +141,6 @@ public class RegionManager implements HConstants {
// Scans the meta table
metaScannerThread = new MetaScanner(master);
zooKeeperWrapper = master.getZooKeeperWrapper();
zooKeeperNumRetries = conf.getInt(ZOOKEEPER_RETRIES, DEFAULT_ZOOKEEPER_RETRIES);
zooKeeperPause = conf.getInt(ZOOKEEPER_PAUSE, DEFAULT_ZOOKEEPER_PAUSE);
@ -602,8 +599,8 @@ public class RegionManager implements HConstants {
} catch(Exception iex) {
LOG.warn("meta scanner", iex);
}
zooKeeperWrapper.clearRSDirectory();
zooKeeperWrapper.close();
master.getZooKeeperWrapper().clearRSDirectory();
master.getZooKeeperWrapper().close();
}
/**
@ -1121,7 +1118,7 @@ public class RegionManager implements HConstants {
private void writeRootRegionLocationToZooKeeper(HServerAddress address) {
for (int attempt = 0; attempt < zooKeeperNumRetries; ++attempt) {
if (zooKeeperWrapper.writeRootRegionLocation(address)) {
if (master.getZooKeeperWrapper().writeRootRegionLocation(address)) {
return;
}

View File

@ -41,13 +41,14 @@ import org.apache.zookeeper.Watcher.Event.EventType;
*/
class ZKMasterAddressWatcher implements Watcher {
private static final Log LOG = LogFactory.getLog(ZKMasterAddressWatcher.class);
private final ZooKeeperWrapper zookeeper;
private ZooKeeperWrapper zookeeper;
private final AtomicBoolean requestShutdown;
/**
* Create this watcher using passed ZooKeeperWrapper instance.
* @param zk ZooKeeper
* @param requestShutdown Flag to set to request shutdown.
* @param flag Flag to set to request shutdown.
*/
ZKMasterAddressWatcher(final ZooKeeperWrapper zk, final AtomicBoolean flag) {
this.requestShutdown = flag;
@ -98,17 +99,30 @@ class ZKMasterAddressWatcher implements Watcher {
* address (or until cluster shutdown).
* @param address Address whose format is HServerAddress.toString
*/
void writeAddressToZooKeeper(final HServerAddress address) {
while (true) {
boolean writeAddressToZooKeeper(
final HServerAddress address, boolean retry) {
do {
waitForMasterAddressAvailability();
// Check if we need to shutdown instead of taking control
if (this.requestShutdown.get()) return;
if (this.requestShutdown.get()) {
LOG.debug("Won't start Master because cluster is shuting down");
return false;
}
if(this.zookeeper.writeMasterAddress(address)) {
this.zookeeper.setClusterState(true);
// Watch our own node
this.zookeeper.readMasterAddress(this);
return;
}
}
return true;
}
} while(retry);
return false;
}
/**
* Replace the ZooKeeperWrapper held by this watcher, for use when a new
* ZooKeeper connection is required.
* This method only swaps the stored reference; it does not close the
* previously held wrapper.
* NOTE(review): the backing field is a plain (non-volatile) reference --
* confirm that every thread reading it is guaranteed to see the new value.
* @param zookeeper new instance to store in place of the current one
*/
public void setZookeeper(ZooKeeperWrapper zookeeper) {
this.zookeeper = zookeeper;
}
}

View File

@ -95,21 +95,29 @@ public class TestZooKeeper extends HBaseClusterTestCase {
connection.relocateRegion(HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_BYTE_ARRAY);
}
public void testRegionServerSessionExpired() {
try {
public void testRegionServerSessionExpired() throws Exception{
this.conf.setBoolean("hbase.regionserver.restart.on.zk.expire", true);
new HTable(conf, HConstants.META_TABLE_NAME);
HRegionServer rs = cluster.getRegionServer(0);
sessionExpirationHelper(rs.getZooKeeperWrapper());
}
/**
* Runs the shared session-expiration scenario against the ZooKeeperWrapper
* obtained from the cluster's master.
* @throws Exception propagated from the HTable constructor or from
* sessionExpirationHelper; nothing is caught here
*/
public void testMasterSessionExpired() throws Exception {
// The HTable instance is discarded; presumably it is constructed only to
// make sure the meta table is reachable before the session is expired
// -- TODO confirm.
new HTable(conf, HConstants.META_TABLE_NAME);
HMaster master = cluster.getMaster();
sessionExpirationHelper(master.getZooKeeperWrapper());
}
public void sessionExpirationHelper(ZooKeeperWrapper nodeZK) throws Exception{
ZooKeeperWrapper zkw = new ZooKeeperWrapper(conf, EmptyWatcher.instance);
String quorumServers = zkw.getQuorumServers();
int sessionTimeout = 5 * 1000; // 5 seconds
HRegionServer rs = cluster.getRegionServer(0);
ZooKeeperWrapper rsZK = rs.getZooKeeperWrapper();
long sessionID = rsZK.getSessionID();
byte[] password = rsZK.getSessionPassword();
byte[] password = nodeZK.getSessionPassword();
long sessionID = nodeZK.getSessionID();
ZooKeeper zk = new ZooKeeper(quorumServers, sessionTimeout, EmptyWatcher.instance, sessionID, password);
ZooKeeper zk = new ZooKeeper(quorumServers,
sessionTimeout, EmptyWatcher.instance, sessionID, password);
zk.close();
Thread.sleep(sessionTimeout * 3L);
@ -126,10 +134,7 @@ public class TestZooKeeper extends HBaseClusterTestCase {
Put put = new Put(Bytes.toBytes("testrow"));
put.add(Bytes.toBytes("fam"), Bytes.toBytes("col"), Bytes.toBytes("testdata"));
table.put(put);
} catch (Exception e) {
e.printStackTrace();
fail();
}
}
public void testMultipleZK() {