From ce2fd2c58c5358f07defe13c8ae56e1bbfd59590 Mon Sep 17 00:00:00 2001 From: Lars Hofhansl Date: Tue, 16 Jun 2015 12:21:49 -0700 Subject: [PATCH] HBASE-13885 ZK watches leaks during snapshots. --- .../hadoop/hbase/procedure/ZKProcedureCoordinatorRpcs.java | 5 ++++- .../apache/hadoop/hbase/procedure/ZKProcedureMemberRpcs.java | 5 ++++- .../org/apache/hadoop/hbase/procedure/ZKProcedureUtil.java | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureCoordinatorRpcs.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureCoordinatorRpcs.java index 3e32ca2c6ac..fedd5956b22 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureCoordinatorRpcs.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureCoordinatorRpcs.java @@ -275,7 +275,10 @@ public class ZKProcedureCoordinatorRpcs implements ProcedureCoordinatorRpcs { ForeignException ee = null; try { byte[] data = ZKUtil.getData(zkProc.getWatcher(), abortNode); - if (!ProtobufUtil.isPBMagicPrefix(data)) { + if (data == null || data.length == 0) { + // ignore + return; + } else if (!ProtobufUtil.isPBMagicPrefix(data)) { LOG.warn("Got an error notification for op:" + abortNode + " but we can't read the information. Killing the procedure."); // we got a remote exception, but we can't describe it diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureMemberRpcs.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureMemberRpcs.java index 6ac2a479189..2e03a600229 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureMemberRpcs.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureMemberRpcs.java @@ -318,7 +318,10 @@ public class ZKProcedureMemberRpcs implements ProcedureMemberRpcs { // figure out the data we need to pass ForeignException ee; try { - if (!ProtobufUtil.isPBMagicPrefix(data)) { + if (data == null || data.length == 0) { + // ignore + return; + } else if (!ProtobufUtil.isPBMagicPrefix(data)) { String msg = "Illegally formatted data in abort node for proc " + opName + ". Killing the procedure."; LOG.error(msg); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureUtil.java index 342ab2265f6..9e0ef7fc6a9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure/ZKProcedureUtil.java @@ -283,6 +283,11 @@ public abstract class ZKProcedureUtil // TODO This is potentially racy since not atomic. update when we support zk that has multi LOG.info("Clearing all znodes for procedure " + procedureName + "including nodes " + acquiredZnode + " " + reachedZnode + " " + abortZnode); + + // Make sure we trigger the watches on these nodes by creating them. (HBASE-13885) + ZKUtil.createAndFailSilent(watcher, getAcquiredBarrierNode(procedureName)); + ZKUtil.createAndFailSilent(watcher, getAbortZNode(procedureName)); + ZKUtil.deleteNodeRecursively(watcher, getAcquiredBarrierNode(procedureName)); ZKUtil.deleteNodeRecursively(watcher, getReachedBarrierNode(procedureName)); ZKUtil.deleteNodeRecursively(watcher, getAbortZNode(procedureName));