diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 6b2a6b555bf..4537bc79802 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -176,6 +176,9 @@ public final class HConstants { /** Default value for ZooKeeper session timeout */ public static final int DEFAULT_ZK_SESSION_TIMEOUT = 180 * 1000; + /** Configuration key for whether to use ZK.multi */ + public static final String ZOOKEEPER_USEMULTI = "hbase.zookeeper.useMulti"; + /** Parameter name for port region server listens on. */ public static final String REGIONSERVER_PORT = "hbase.regionserver.port"; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java index 6e0c1cc6aa3..11d0e7f404f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.lang.management.ManagementFactory; import java.security.SecureRandom; import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; import java.util.Random; @@ -35,11 +36,16 @@ import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.zookeeper.AsyncCallback; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.Op; +import org.apache.zookeeper.OpResult; import org.apache.zookeeper.Watcher; +import org.apache.zookeeper.ZooDefs; import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.ZooKeeper.States; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Stat; +import org.apache.zookeeper.proto.CreateRequest; +import org.apache.zookeeper.proto.SetDataRequest; /** * A zookeeper that can handle 'recoverable' errors. @@ -494,6 +500,60 @@ public class RecoverableZooKeeper { retryCounter.useRetry(); } } + /** + * Convert Iterable of {@link ZKOp} we got into the ZooKeeper.Op + * instances to actually pass to multi (need to do this in order to appendMetaData). + */ + private Iterable prepareZKMulti(Iterable ops) + throws UnsupportedOperationException { + if(ops == null) return null; + + List preparedOps = new LinkedList(); + for (Op op : ops) { + if (op.getType() == ZooDefs.OpCode.create) { + CreateRequest create = (CreateRequest)op.toRequestRecord(); + preparedOps.add(Op.create(create.getPath(), appendMetaData(create.getData()), + create.getAcl(), create.getFlags())); + } else if (op.getType() == ZooDefs.OpCode.delete) { + // no need to appendMetaData for delete + preparedOps.add(op); + } else if (op.getType() == ZooDefs.OpCode.setData) { + SetDataRequest setData = (SetDataRequest)op.toRequestRecord(); + preparedOps.add(Op.setData(setData.getPath(), appendMetaData(setData.getData()), + setData.getVersion())); + } else { + throw new UnsupportedOperationException("Unexpected ZKOp type: " + op.getClass().getName()); + } + } + return preparedOps; + } + + /** + * Run multiple operations in a transactional manner. Retry before throwing exception + */ + public List multi(Iterable ops) + throws KeeperException, InterruptedException { + RetryCounter retryCounter = retryCounterFactory.create(); + Iterable multiOps = prepareZKMulti(ops); + while (true) { + try { + return zk.multi(multiOps); + } catch (KeeperException e) { + switch (e.code()) { + case CONNECTIONLOSS: + case SESSIONEXPIRED: + case OPERATIONTIMEOUT: + retryOrThrow(retryCounter, e, "multi"); + break; + + default: + throw e; + } + } + retryCounter.sleepUntilNextRetry(); + retryCounter.useRetry(); + } + } private String findPreviousSequentialNode(String path) throws KeeperException, InterruptedException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java index bd39faedb98..33697b53d25 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java @@ -23,16 +23,15 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.InetSocketAddress; -import java.net.InetAddress; import java.net.Socket; import java.util.ArrayList; +import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.HashMap; import java.util.Map; -import javax.security.auth.login.LoginException; import javax.security.auth.login.AppConfigurationEntry; import javax.security.auth.login.AppConfigurationEntry.LoginModuleControlFlag; @@ -50,9 +49,13 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Threads; +import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp.CreateAndFailSilent; +import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp.DeleteNodeFailSilent; +import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp.SetData; import org.apache.zookeeper.AsyncCallback; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.Op; import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooDefs.Ids; @@ -60,6 +63,9 @@ import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.client.ZooKeeperSaslClient; +import org.apache.zookeeper.proto.CreateRequest; +import org.apache.zookeeper.proto.DeleteRequest; +import org.apache.zookeeper.proto.SetDataRequest; import org.apache.zookeeper.server.ZooKeeperSaslServer; /** @@ -882,7 +888,13 @@ public class ZKUtil { */ public static void setData(ZooKeeperWatcher zkw, String znode, byte [] data) throws KeeperException, KeeperException.NoNodeException { - setData(zkw, znode, data, -1); + setData(zkw, (SetData)ZKUtilOp.setData(znode, data)); + } + + private static void setData(ZooKeeperWatcher zkw, SetData setData) + throws KeeperException, KeeperException.NoNodeException { + SetDataRequest sd = (SetDataRequest)toZooKeeperOp(zkw, setData).toRequestRecord(); + setData(zkw, sd.getPath(), sd.getData(), sd.getVersion()); } /** @@ -1088,14 +1100,20 @@ public class ZKUtil { * @throws KeeperException if unexpected zookeeper exception */ public static void createAndFailSilent(ZooKeeperWatcher zkw, - String znode) + String znode) throws KeeperException { + createAndFailSilent(zkw, + (CreateAndFailSilent)ZKUtilOp.createAndFailSilent(znode, new byte[0])); + } + + private static void createAndFailSilent(ZooKeeperWatcher zkw, CreateAndFailSilent cafs) throws KeeperException { + CreateRequest create = (CreateRequest)toZooKeeperOp(zkw, cafs).toRequestRecord(); + String znode = create.getPath(); try { RecoverableZooKeeper zk = zkw.getRecoverableZooKeeper(); waitForZKConnectionIfAuthenticating(zkw); if (zk.exists(znode, false) == null) { - zk.create(znode, new byte[0], createACL(zkw,znode), - CreateMode.PERSISTENT); + zk.create(znode, create.getData(), create.getAcl(), CreateMode.fromFlag(create.getFlags())); } } catch(KeeperException.NodeExistsException nee) { } catch(KeeperException.NoAuthException nee){ @@ -1181,14 +1199,22 @@ public class ZKUtil { */ public static void deleteNodeFailSilent(ZooKeeperWatcher zkw, String node) throws KeeperException { + deleteNodeFailSilent(zkw, + (DeleteNodeFailSilent)ZKUtilOp.deleteNodeFailSilent(node)); + } + + private static void deleteNodeFailSilent(ZooKeeperWatcher zkw, + DeleteNodeFailSilent dnfs) throws KeeperException { + DeleteRequest delete = (DeleteRequest)toZooKeeperOp(zkw, dnfs).toRequestRecord(); try { - zkw.getRecoverableZooKeeper().delete(node, -1); + zkw.getRecoverableZooKeeper().delete(delete.getPath(), delete.getVersion()); } catch(KeeperException.NoNodeException nne) { } catch(InterruptedException ie) { zkw.interruptedException(ie); } } + /** * Delete the specified node and all of it's children. *

@@ -1230,6 +1256,232 @@ public class ZKUtil { } } + /** + * Represents an action taken by ZKUtil, e.g. createAndFailSilent. + * These actions are higher-level than ZKOp actions, which represent + * individual actions in the ZooKeeper API, like create. + */ + public abstract static class ZKUtilOp { + private String path; + + private ZKUtilOp(String path) { + this.path = path; + } + + /** + * @return a createAndFailSilent ZKUtilOp + */ + public static ZKUtilOp createAndFailSilent(String path, byte[] data) { + return new CreateAndFailSilent(path, data); + } + + /** + * @return a deleteNodeFailSilent ZKUtilOP + */ + public static ZKUtilOp deleteNodeFailSilent(String path) { + return new DeleteNodeFailSilent(path); + } + + /** + * @return a setData ZKUtilOp + */ + public static ZKUtilOp setData(String path, byte [] data) { + return new SetData(path, data); + } + + /** + * @return path to znode where the ZKOp will occur + */ + public String getPath() { + return path; + } + + /** + * ZKUtilOp representing createAndFailSilent in ZooKeeper + * (attempt to create node, ignore error if already exists) + */ + public static class CreateAndFailSilent extends ZKUtilOp { + private byte [] data; + + private CreateAndFailSilent(String path, byte [] data) { + super(path); + this.data = data; + } + + public byte[] getData() { + return data; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof CreateAndFailSilent)) return false; + + CreateAndFailSilent op = (CreateAndFailSilent) o; + return getPath().equals(op.getPath()) && Arrays.equals(data, op.data); + } + + @Override + public int hashCode() { + int ret = 17 + getPath().hashCode() * 31; + return ret * 31 + Bytes.hashCode(data); + } + } + + /** + * ZKUtilOp representing deleteNodeFailSilent in ZooKeeper + * (attempt to delete node, ignore error if node doesn't exist) + */ + public static class DeleteNodeFailSilent extends ZKUtilOp { + private DeleteNodeFailSilent(String path) { + super(path); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof DeleteNodeFailSilent)) return false; + + return super.equals(o); + } + + @Override + public int hashCode() { + return getPath().hashCode(); + } + } + + /** + * ZKUtilOp representing setData in ZooKeeper + */ + public static class SetData extends ZKUtilOp { + private byte [] data; + + private SetData(String path, byte [] data) { + super(path); + this.data = data; + } + + public byte[] getData() { + return data; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof SetData)) return false; + + SetData op = (SetData) o; + return getPath().equals(op.getPath()) && Arrays.equals(data, op.data); + } + + @Override + public int hashCode() { + int ret = getPath().hashCode(); + return ret * 31 + Bytes.hashCode(data); + } + } + } + + /** + * Convert from ZKUtilOp to ZKOp + */ + private static Op toZooKeeperOp(ZooKeeperWatcher zkw, ZKUtilOp op) + throws UnsupportedOperationException { + if(op == null) return null; + + if (op instanceof CreateAndFailSilent) { + CreateAndFailSilent cafs = (CreateAndFailSilent)op; + return Op.create(cafs.getPath(), cafs.getData(), createACL(zkw, cafs.getPath()), + CreateMode.PERSISTENT); + } else if (op instanceof DeleteNodeFailSilent) { + DeleteNodeFailSilent dnfs = (DeleteNodeFailSilent)op; + return Op.delete(dnfs.getPath(), -1); + } else if (op instanceof SetData) { + SetData sd = (SetData)op; + return Op.setData(sd.getPath(), sd.getData(), -1); + } else { + throw new UnsupportedOperationException("Unexpected ZKUtilOp type: " + + op.getClass().getName()); + } + } + + /** + * If hbase.zookeeper.useMulti is true, use ZooKeeper's multi-update functionality. + * Otherwise, run the list of operations sequentially. + * + * If all of the following are true: + * - runSequentialOnMultiFailure is true + * - hbase.zookeeper.useMulti is true + * - on calling multi, we get a ZooKeeper exception that can be handled by a sequential call(*) + * Then: + * - we retry the operations one-by-one (sequentially) + * + * Note *: an example is receiving a NodeExistsException from a "create" call. Without multi, + * a user could call "createAndFailSilent" to ensure that a node exists if they don't care who + * actually created the node (i.e. the NodeExistsException from ZooKeeper is caught). + * This will cause all operations in the multi to fail, however, because + * the NodeExistsException that zk.create throws will fail the multi transaction. + * In this case, if the previous conditions hold, the commands are run sequentially, which should + * result in the correct final state, but means that the operations will not run atomically. + * + * @throws KeeperException + */ + public static void multiOrSequential(ZooKeeperWatcher zkw, List ops, + boolean runSequentialOnMultiFailure) throws KeeperException { + if (ops == null) return; + boolean useMulti = zkw.getConfiguration().getBoolean(HConstants.ZOOKEEPER_USEMULTI, false); + + if (useMulti) { + List zkOps = new LinkedList(); + for (ZKUtilOp op : ops) { + zkOps.add(toZooKeeperOp(zkw, op)); + } + try { + zkw.getRecoverableZooKeeper().multi(zkOps); + } catch (KeeperException ke) { + switch (ke.code()) { + case NODEEXISTS: + case NONODE: + case BADVERSION: + case NOAUTH: + // if we get an exception that could be solved by running sequentially + // (and the client asked us to), then break out and run sequentially + if (runSequentialOnMultiFailure) { + LOG.info("On call to ZK.multi, received exception: " + ke.toString() + "." + + " Attempting to run operations sequentially because" + + " runSequentialOnMultiFailure is: " + runSequentialOnMultiFailure + "."); + processSequentially(zkw, ops); + break; + } + default: + throw ke; + } + } catch (InterruptedException ie) { + zkw.interruptedException(ie); + } + } else { + // run sequentially + processSequentially(zkw, ops); + } + } + + private static void processSequentially(ZooKeeperWatcher zkw, List ops) + throws KeeperException, NoNodeException { + for (ZKUtilOp op : ops) { + if (op instanceof CreateAndFailSilent) { + createAndFailSilent(zkw, (CreateAndFailSilent) op); + } else if (op instanceof DeleteNodeFailSilent) { + deleteNodeFailSilent(zkw, (DeleteNodeFailSilent) op); + } else if (op instanceof SetData) { + setData(zkw, (SetData) op); + } else { + throw new UnsupportedOperationException("Unexpected ZKUtilOp type: " + + op.getClass().getName()); + } + } + } + // // ZooKeeper cluster information // diff --git a/hbase-server/src/main/resources/hbase-default.xml b/hbase-server/src/main/resources/hbase-default.xml index b62d30855fb..7e8dfaa4859 100644 --- a/hbase-server/src/main/resources/hbase-default.xml +++ b/hbase-server/src/main/resources/hbase-default.xml @@ -705,6 +705,18 @@ for more information. + + hbase.zookeeper.useMulti + false + Instructs HBase to make use of ZooKeeper's multi-update functionality. + This allows certain ZooKeeper operations to complete more quickly and prevents some issues + with rare Replication failure scenarios (see the release note of HBASE-2611 for an example). + IMPORTANT: only set this to true if all ZooKeeper servers in the cluster are on version 3.4+ + and will not be downgraded. ZooKeeper versions before 3.4 do not support multi-update and will + not fail gracefully if multi-update is invoked (see ZOOKEEPER-1495). + + +