HBASE-8435 Test for zk leak does not account for unsynchronized access to zk watcher: ADDENDUM

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1483117 13f79535-47bb-0310-9956-ffa450edef68
Author: Michael Stack
Date:   2013-05-15 22:52:19 +00:00
Commit: 86aad7e82c
Parent: 6ebd8e3f2a
3 changed files with 43 additions and 8 deletions

HConnectionManager.java

@@ -604,9 +604,8 @@ public class HConnectionManager {
      this.clusterId = this.registry.getClusterId();
      if (clusterId == null) {
        clusterId = HConstants.CLUSTER_ID_DEFAULT;
        LOG.debug("clusterid came back null, using default " + clusterId);
      }
      LOG.info("ClusterId is " + clusterId);
    }

    @Override

ZooKeeperRegistry.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
+import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
 import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.zookeeper.KeeperException;
@@ -69,10 +70,28 @@ class ZooKeeperRegistry implements Registry {
     }
   }

   private String clusterId = null;

   @Override
   public String getClusterId() {
-    // TODO Auto-generated method stub
-    return null;
+    if (this.clusterId != null) return this.clusterId;
+    // No synchronized here, worse case we will retrieve it twice, that's
+    // not an issue.
+    ZooKeeperKeepAliveConnection zkw = null;
+    try {
+      zkw = hci.getKeepAliveZooKeeperWatcher();
+      this.clusterId = ZKClusterId.readClusterIdZNode(zkw);
+      if (this.clusterId == null) {
+        LOG.info("ClusterId read in ZooKeeper is null");
+      }
+    } catch (KeeperException e) {
+      LOG.warn("Can't retrieve clusterId from Zookeeper", e);
+    } catch (IOException e) {
+      LOG.warn("Can't retrieve clusterId from Zookeeper", e);
+    } finally {
+      if (zkw != null) zkw.close();
+    }
+    return this.clusterId;
   }

   @Override
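The getClusterId() implementation above combines two idioms worth calling out: lazy initialization behind a deliberately unsynchronized null check (the worst case, as the comment says, is fetching the id twice), and acquiring the keep-alive watcher inside try/finally so it is always closed, which is the leak this change guards against. Below is a minimal, self-contained sketch of the same shape; Handle, acquire() and readId() are hypothetical stand-ins for ZooKeeperKeepAliveConnection, getKeepAliveZooKeeperWatcher() and ZKClusterId.readClusterIdZNode(), not HBase API.

public class LazyClusterId {
  // Unsynchronized on purpose: if two threads race past the null check, both
  // read the same value and both writes are harmless.
  private String clusterId = null;

  public String getClusterId() {
    if (this.clusterId != null) return this.clusterId;
    Handle h = null;
    try {
      h = Handle.acquire();         // like getKeepAliveZooKeeperWatcher()
      this.clusterId = h.readId();  // like ZKClusterId.readClusterIdZNode(zkw)
    } catch (Exception e) {
      // Warn and fall through; callers tolerate a null id, as in the patch.
    } finally {
      if (h != null) h.close();     // the point of the fix: the handle never leaks
    }
    return this.clusterId;
  }

  // Hypothetical stand-in for the keep-alive ZooKeeper connection.
  static final class Handle implements AutoCloseable {
    static Handle acquire() { return new Handle(); }
    String readId() { return "test-cluster-id"; }
    @Override public void close() { /* release the underlying connection */ }
  }
}

Since clusterId is not volatile, a thread may still see null after another thread has cached the id and will simply re-read it; String's immutability makes the duplicated work safe, which is exactly the trade the in-code comment accepts.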

TestHCM.java

@@ -803,13 +803,21 @@ public class TestHCM {
   }

   /**
-   * Tests that a closed connection does not have a live zookeeper
+   * Tests that a destroyed connection does not have a live zookeeper.
+   * Below is timing based. We put up a connection to a table and then close the connection while
+   * having a background thread running that is forcing close of the connection to try and
+   * provoke a close catastrophe; we are hoping for a car crash so we can see if we are leaking
+   * zk connections.
    * @throws Exception
    */
   @Test
   public void testDeleteForZKConnLeak() throws Exception {
     TEST_UTIL.createTable(TABLE_NAME4, FAM_NAM);
-    final Configuration config = TEST_UTIL.getConfiguration();
+    final Configuration config = HBaseConfiguration.create(TEST_UTIL.getConfiguration());
+    config.setInt("zookeeper.recovery.retry", 1);
+    config.setInt("zookeeper.recovery.retry.intervalmill", 1000);
+    config.setInt("hbase.rpc.timeout", 2000);
+    config.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 2);

     ThreadPoolExecutor pool = new ThreadPoolExecutor(1, 10,
       5, TimeUnit.SECONDS,
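The next hunk implements the strategy the rewritten javadoc describes: a background thread keeps force-deleting the shared connection while the foreground loop keeps opening and closing tables on it, hoping to collide. A distilled, self-contained sketch of that provocation pattern follows; SharedResource and RaceProvoker are illustrative names, not HBase API, and the 30/10 numbers mirror the test's loop count and sleep.

import java.util.concurrent.atomic.AtomicReference;

public class RaceProvoker {
  // Stand-in for an HConnection: using it after close() fails.
  static final class SharedResource {
    volatile boolean closed;
    void use() { if (closed) throw new IllegalStateException("used after close"); }
    void close() { closed = true; }
  }

  public static void main(String[] args) throws InterruptedException {
    AtomicReference<SharedResource> ref = new AtomicReference<>(new SharedResource());
    // Background closer, like the pool task calling deleteStaleConnection(conn).
    Thread closer = new Thread(() -> {
      while (!Thread.interrupted()) {
        ref.getAndSet(new SharedResource()).close();
        try { Thread.sleep(10); } catch (InterruptedException e) { return; }
      }
    });
    closer.start();
    for (int i = 0; i < 30; i++) {    // like the test's 30 HTable open/close cycles
      try {
        ref.get().use();              // may grab a resource the closer just killed
      } catch (Exception e) {
        // Expected occasionally: the test WANTS these collisions, then checks
        // afterwards that no ZooKeeper connections leaked.
      }
    }
    closer.interrupt();
    closer.join();
  }
}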
@@ -822,21 +830,30 @@ public class TestHCM {
         while (!Thread.interrupted()) {
           try {
             HConnection conn = HConnectionManager.getConnection(config);
+            LOG.info("Connection " + conn);
             HConnectionManager.deleteStaleConnection(conn);
+            LOG.info("Connection closed " + conn);
+            // TODO: This sleep time should be less than the time that it takes to open and close
+            // a table. Ideally we would do a few runs first to measure. For now this is
+            // timing based; hopefully we hit the bad condition.
+            Threads.sleep(10);
           } catch (Exception e) {
           }
         }
       }
     });

-    // use connection multiple times
-    for (int i = 0; i < 50; i++) {
+    // Use connection multiple times.
+    for (int i = 0; i < 30; i++) {
       HConnection c1 = null;
       try {
         c1 = HConnectionManager.getConnection(config);
+        LOG.info("HTable connection " + i + " " + c1);
         HTable table = new HTable(TABLE_NAME4, c1, pool);
         table.close();
+        LOG.info("HTable connection " + i + " closed " + c1);
       } catch (Exception e) {
+        LOG.info("We actually want this to happen!!!! So we can see if we are leaking zk", e);
       } finally {
         if (c1 != null) {
           if (c1.isClosed()) {