HBASE-5926 Delete the master znode after a master crash

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1340185 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2012-05-18 17:44:04 +00:00
parent 60d28c645d
commit 4c61d4a965
6 changed files with 94 additions and 51 deletions

View File

@ -73,9 +73,13 @@ hbase_rotate_log ()
cleanZNode() { cleanZNode() {
if [ -f $HBASE_ZNODE_FILE ]; then if [ -f $HBASE_ZNODE_FILE ]; then
#call ZK to delete the node if [ "$command" = "master" ]; then
ZNODE=`cat $HBASE_ZNODE_FILE` $bin/hbase master clear > /dev/null 2>&1
$bin/hbase zkcli delete $ZNODE > /dev/null 2>&1 else
#call ZK to delete the node
ZNODE=`cat $HBASE_ZNODE_FILE`
$bin/hbase zkcli delete $ZNODE > /dev/null 2>&1
fi
rm $HBASE_ZNODE_FILE rm $HBASE_ZNODE_FILE
fi fi
} }

View File

@ -25,6 +25,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.ZNodeClearer;
import org.apache.hadoop.hbase.DeserializationException; import org.apache.hadoop.hbase.DeserializationException;
import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.ServerName;
@ -141,12 +142,17 @@ class ActiveMasterManager extends ZooKeeperListener {
try { try {
String backupZNode = String backupZNode =
ZKUtil.joinZNode(this.watcher.backupMasterAddressesZNode, this.sn.toString()); ZKUtil.joinZNode(this.watcher.backupMasterAddressesZNode, this.sn.toString());
if (MasterAddressTracker.setMasterAddress(this.watcher, this.watcher.getMasterAddressZNode(), this.sn)) { if (MasterAddressTracker.setMasterAddress(this.watcher,
this.watcher.getMasterAddressZNode(), this.sn)) {
// If we were a backup master before, delete our ZNode from the backup // If we were a backup master before, delete our ZNode from the backup
// master directory since we are the active now // master directory since we are the active now
LOG.info("Deleting ZNode for " + backupZNode + " from backup master directory"); LOG.info("Deleting ZNode for " + backupZNode + " from backup master directory");
ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode); ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
// Save the znode in a file so that the launch scripts can check whether we crashed
ZNodeClearer.writeMyEphemeralNodeOnDisk(this.sn.toString());
// We are the master, return // We are the master, return
startupStatus.setStatus("Successfully registered as active master."); startupStatus.setStatus("Successfully registered as active master.");
this.clusterHasActiveMaster.set(true); this.clusterHasActiveMaster.set(true);
@ -189,6 +195,10 @@ class ActiveMasterManager extends ZooKeeperListener {
currentMaster + "; master was restarted? Deleting node."); currentMaster + "; master was restarted? Deleting node.");
// Hurry along the expiration of the znode. // Hurry along the expiration of the znode.
ZKUtil.deleteNode(this.watcher, this.watcher.getMasterAddressZNode()); ZKUtil.deleteNode(this.watcher, this.watcher.getMasterAddressZNode());
// We may have failed to delete the znode at the previous step, but
// we delete the file anyway: a second attempt to delete the znode is likely to fail again.
ZNodeClearer.deleteMyEphemeralNodeOnDisk();
} else { } else {
msg = "Another master is the active master, " + currentMaster + msg = "Another master is the active master, " + currentMaster +
"; waiting to become the next active master"; "; waiting to become the next active master";
@ -249,6 +259,9 @@ class ActiveMasterManager extends ZooKeeperListener {
} }
if (activeMaster != null && activeMaster.equals(this.sn)) { if (activeMaster != null && activeMaster.equals(this.sn)) {
ZKUtil.deleteNode(watcher, watcher.getMasterAddressZNode()); ZKUtil.deleteNode(watcher, watcher.getMasterAddressZNode());
// We may have failed to delete the znode at the previous step, but
// we delete the file anyway: a second attempt to delete the znode is likely to fail again.
ZNodeClearer.deleteMyEphemeralNodeOnDisk();
} }
} catch (KeeperException e) { } catch (KeeperException e) {
LOG.error(this.watcher.prefix("Error deleting our own master address node"), e); LOG.error(this.watcher.prefix("Error deleting our own master address node"), e);

View File

@ -31,6 +31,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ZNodeClearer;
import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.LocalHBaseCluster; import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MasterNotRunningException; import org.apache.hadoop.hbase.MasterNotRunningException;
@ -47,9 +48,10 @@ public class HMasterCommandLine extends ServerCommandLine {
private static final Log LOG = LogFactory.getLog(HMasterCommandLine.class); private static final Log LOG = LogFactory.getLog(HMasterCommandLine.class);
private static final String USAGE = private static final String USAGE =
"Usage: Master [opts] start|stop\n" + "Usage: Master [opts] start|stop|clear\n" +
" start Start Master. If local mode, start Master and RegionServer in same JVM\n" + " start Start Master. If local mode, start Master and RegionServer in same JVM\n" +
" stop Start cluster shutdown; Master signals RegionServer shutdown\n" + " stop Start cluster shutdown; Master signals RegionServer shutdown\n" +
" clear Delete the master znode in ZooKeeper after a master crashes\n "+
" where [opts] are:\n" + " where [opts] are:\n" +
" --minServers=<servers> Minimum RegionServers needed to host user tables.\n" + " --minServers=<servers> Minimum RegionServers needed to host user tables.\n" +
" --backup Master should start in backup mode"; " --backup Master should start in backup mode";
@ -105,6 +107,8 @@ public class HMasterCommandLine extends ServerCommandLine {
return startMaster(); return startMaster();
} else if ("stop".equals(command)) { } else if ("stop".equals(command)) {
return stopMaster(); return stopMaster();
} else if ("clear".equals(command)) {
return (ZNodeClearer.clear(getConf()) ? 0 : -1);
} else { } else {
usage("Invalid command: " + command); usage("Invalid command: " + command);
return -1; return -1;

View File

@ -19,9 +19,6 @@
*/ */
package org.apache.hadoop.hbase.regionserver; package org.apache.hadoop.hbase.regionserver;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.lang.Thread.UncaughtExceptionHandler; import java.lang.Thread.UncaughtExceptionHandler;
@ -82,6 +79,7 @@ import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.UnknownRowLockException; import org.apache.hadoop.hbase.UnknownRowLockException;
import org.apache.hadoop.hbase.UnknownScannerException; import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.ZNodeClearer;
import org.apache.hadoop.hbase.catalog.CatalogTracker; import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaEditor; import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.catalog.MetaReader; import org.apache.hadoop.hbase.catalog.MetaReader;
@ -865,7 +863,7 @@ public class HRegionServer implements ClientProtocol,
} }
// We may have failed to delete the znode at the previous step, but // We may have failed to delete the znode at the previous step, but
// we delete the file anyway: a second attempt to delete the znode is likely to fail again. // we delete the file anyway: a second attempt to delete the znode is likely to fail again.
deleteMyEphemeralNodeOnDisk(); ZNodeClearer.deleteMyEphemeralNodeOnDisk();
this.zooKeeper.close(); this.zooKeeper.close();
LOG.info("stopping server " + this.serverNameFromMasterPOV + LOG.info("stopping server " + this.serverNameFromMasterPOV +
"; zookeeper connection closed."); "; zookeeper connection closed.");
@ -1054,7 +1052,7 @@ public class HRegionServer implements ClientProtocol,
createMyEphemeralNode(); createMyEphemeralNode();
// Save it in a file, this will allow to see if we crash // Save it in a file, this will allow to see if we crash
writeMyEphemeralNodeOnDisk(); ZNodeClearer.writeMyEphemeralNodeOnDisk(getMyEphemeralNodePath());
// Master sent us hbase.rootdir to use. Should be fully qualified // Master sent us hbase.rootdir to use. Should be fully qualified
// path with file system specification included. Set 'fs.defaultFS' // path with file system specification included. Set 'fs.defaultFS'
@ -1086,52 +1084,11 @@ public class HRegionServer implements ClientProtocol,
} }
} }
/**
 * Computes the path of this server's ephemeral znode.
 *
 * @return this server's name (as a string) joined onto {@code this.zooKeeper.rsZNode}
 *         by {@code ZKUtil.joinZNode}
 */
private String getMyEphemeralNodePath() {
  final String parentZNode = this.zooKeeper.rsZNode;
  final String serverName = getServerName().toString();
  return ZKUtil.joinZNode(parentZNode, serverName);
}
/**
 * Looks up the name of the file in which this server records its znode.
 *
 * @return the value of the HBASE_ZNODE_FILE environment variable, or null when it is not set
 */
private String getMyEphemeralNodeFileName() {
  final java.util.Map<String, String> environment = System.getenv();
  return environment.get("HBASE_ZNODE_FILE");
}
private void createMyEphemeralNode() throws KeeperException { private void createMyEphemeralNode() throws KeeperException {
ZKUtil.createEphemeralNodeAndWatch(this.zooKeeper, getMyEphemeralNodePath(), ZKUtil.createEphemeralNodeAndWatch(this.zooKeeper, getMyEphemeralNodePath(),
HConstants.EMPTY_BYTE_ARRAY); HConstants.EMPTY_BYTE_ARRAY);
} }
/**
 * Writes the path of this server's ephemeral znode, followed by a newline, to the file
 * named by {@link #getMyEphemeralNodeFileName()}.
 *
 * <p>When no file name is available, a warning is logged and nothing is written.
 *
 * @throws IOException if the file cannot be opened, written or closed
 */
private void writeMyEphemeralNodeOnDisk() throws IOException {
  final String znodeFile = getMyEphemeralNodeFileName();
  if (znodeFile != null) {
    final FileWriter fileWriter = new FileWriter(znodeFile);
    final BufferedWriter buffered = new BufferedWriter(fileWriter);
    try {
      buffered.write(getMyEphemeralNodePath() + "\n");
    } finally {
      // Close the underlying writer even if closing the buffered one fails.
      try {
        buffered.close();
      } finally {
        fileWriter.close();
      }
    }
    return;
  }
  LOG.warn("No filename given to save the znode used, it won't be saved " +
      "(Environment variable HBASE_ZNODE_FILE is not set).");
}
/**
 * Removes the file named by {@link #getMyEphemeralNodeFileName()}, if a name is available.
 *
 * <p>Best effort: the outcome of the deletion is not checked.
 */
private void deleteMyEphemeralNodeOnDisk(){
  final String znodeFile = getMyEphemeralNodeFileName();
  if (znodeFile != null) {
    new File(znodeFile).delete();
  }
}
private void deleteMyEphemeralNode() throws KeeperException { private void deleteMyEphemeralNode() throws KeeperException {
ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath()); ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath());
} }
@ -3915,4 +3872,8 @@ public class HRegionServer implements ClientProtocol,
return stoppable.isStopped(); return stoppable.isStopped();
} }
} }
/**
 * Computes the path of this server's ephemeral znode.
 *
 * @return this server's name (as a string) joined onto {@code this.zooKeeper.rsZNode}
 *         by {@code ZKUtil.joinZNode}
 */
private String getMyEphemeralNodePath() {
  final String parentZNode = this.zooKeeper.rsZNode;
  final String serverName = getServerName().toString();
  return ZKUtil.joinZNode(parentZNode, serverName);
}
} }

View File

@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos; import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos; import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
/** /**
* Manages the location of the current active Master for the RegionServer. * Manages the location of the current active Master for the RegionServer.
@ -153,4 +154,28 @@ public class MasterAddressTracker extends ZooKeeperNodeTracker {
mbuilder.setMaster(snbuilder.build()); mbuilder.setMaster(snbuilder.build());
return ProtobufUtil.prependPBMagic(mbuilder.build().toByteArray()); return ProtobufUtil.prependPBMagic(mbuilder.build().toByteArray());
} }
/**
 * Deletes the master znode, but only when the server name stored in it matches
 * the given one.
 *
 * <p>The znode's data and {@link Stat} are read without setting a watch, and the delete
 * is issued with the version from that read (presumably so that a znode rewritten in
 * the meantime is left alone -- confirm against {@code ZKUtil.deleteNode}).
 * ZooKeeper and deserialization failures are logged and reported as {@code false}
 * rather than propagated.
 *
 * @param zkw watcher used to reach ZooKeeper and to resolve the master znode path
 * @param content expected {@code ServerName.toString()} value of the master to remove
 * @return the result of the versioned delete when the stored server name matches;
 *         {@code false} when there is no data, the names differ, or an error occurred
 * @throws IllegalArgumentException if {@code content} is null
 */
public static boolean deleteIfEquals(ZooKeeperWatcher zkw, final String content) {
  if (content == null){
    throw new IllegalArgumentException("Content must not be null");
  }

  try {
    final String masterZNode = zkw.getMasterAddressZNode();
    Stat stat = new Stat();
    byte[] data = ZKUtil.getDataNoWatch(zkw, masterZNode, stat);
    if (data == null) {
      // Nothing was read (e.g. the znode is missing): there is nothing to compare
      // or delete, so do not rely on the parser tolerating a null input.
      return false;
    }
    ServerName sn = ServerName.parseFrom(data);
    if (sn != null && content.equals(sn.toString())) {
      return ZKUtil.deleteNode(zkw, masterZNode, stat.getVersion());
    }
  } catch (KeeperException e) {
    LOG.warn("Can't get or delete the master znode", e);
  } catch (DeserializationException e) {
    LOG.warn("Can't get or delete the master znode", e);
  }

  return false;
}
} }

View File

@ -26,16 +26,20 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Method;
import java.util.Random; import java.util.Random;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import junit.framework.Assert;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.*; import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.master.TestActiveMasterManager.NodeDeletionListener; import org.apache.hadoop.hbase.master.TestActiveMasterManager.NodeDeletionListener;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Threads;
import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher; import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.ZooDefs.Ids;
@ -313,6 +317,38 @@ public class TestZooKeeperNodeTracker {
public void process(WatchedEvent event) {} public void process(WatchedEvent event) {}
} }
/**
 * Exercises {@code MasterAddressTracker.deleteIfEquals} on the master address znode:
 * a znode without data and a znode holding a different server name must survive,
 * a znode holding the matching server name must be removed, and calling it again
 * once the znode is gone must not throw.
 */
@Test
public void testCleanZNode() throws Exception {
  final ZooKeeperWatcher watcher = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
      "testNodeTracker", new TestZooKeeperNodeTracker.StubAbortable());
  final ServerName sn = new ServerName("127.0.0.1:52",45L);

  // Make sure the parent znode exists before touching the master znode.
  final String parentZNode = TEST_UTIL.getConfiguration().get(
      HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT);
  ZKUtil.createAndFailSilent(watcher, parentZNode);

  final String masterZNode = watcher.getMasterAddressZNode();

  // Case 1: the znode exists but carries no data -- it must be left in place.
  ZKUtil.createAndFailSilent(watcher, masterZNode);
  MasterAddressTracker.deleteIfEquals(watcher, sn.toString());
  final byte[] afterEmpty = ZKUtil.getData(watcher, masterZNode);
  Assert.assertFalse(afterEmpty == null);

  // Case 2: the znode belongs to another server -- it must be left in place.
  ZKUtil.setData(watcher, masterZNode, MasterAddressTracker.toByteArray(sn));
  final String otherServer = new ServerName("127.0.0.2:52",45L).toString();
  MasterAddressTracker.deleteIfEquals(watcher, otherServer);
  final byte[] afterMismatch = ZKUtil.getData(watcher, masterZNode);
  Assert.assertFalse(afterMismatch == null);

  // Case 3: the znode belongs to the given server -- it must be deleted.
  ZKUtil.setData(watcher, masterZNode, MasterAddressTracker.toByteArray(sn));
  MasterAddressTracker.deleteIfEquals(watcher, sn.toString());
  final byte[] afterMatch = ZKUtil.getData(watcher, masterZNode);
  Assert.assertTrue(afterMatch == null);

  // Case 4: the znode no longer exists -- the call must not throw an exception.
  MasterAddressTracker.deleteIfEquals(watcher, sn.toString());
}
@org.junit.Rule @org.junit.Rule
public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu = public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
new org.apache.hadoop.hbase.ResourceCheckerJUnitRule(); new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();