From f06d3bf977eed44f63b0b46822bc9e01188f075f Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Fri, 22 Aug 2014 14:02:06 +0000 Subject: [PATCH] SOLR-6405: ZooKeeper calls can easily not be retried enough on ConnectionLoss. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1619810 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 5 ++- .../apache/solr/cloud/ZkSolrClientTest.java | 38 +++++++++++++++++++ .../solr/common/cloud/ZkCmdExecutor.java | 13 ++++--- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index bd1ca18589d..147665001cd 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -340,7 +340,10 @@ Bug Fixes * SOLR-6402: OverseerCollectionProcessor should not exit for ZooKeeper ConnectionLoss. (Jessica Cheng via Mark Miller) - + +* SOLR-6405: ZooKeeper calls can easily not be retried enough on ConnectionLoss. + (Jessica Cheng, Mark Miller) + Optimizations --------------------- diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java index bf9ed47e519..6340f78052a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkSolrClientTest.java @@ -24,6 +24,8 @@ import java.util.concurrent.atomic.AtomicInteger; import junit.framework.Assert; import org.apache.solr.common.cloud.SolrZkClient; +import org.apache.solr.common.cloud.ZkCmdExecutor; +import org.apache.solr.common.cloud.ZkOperation; import org.apache.solr.util.AbstractSolrTestCase; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; @@ -198,6 +200,42 @@ public class ZkSolrClientTest extends AbstractSolrTestCase { } } } + + public void testZkCmdExectutor() throws Exception { + String zkDir = createTempDir("zkData").getAbsolutePath(); + ZkTestServer server = null; + + try { + server = new ZkTestServer(zkDir); + server.run(); + AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost()); + AbstractZkTestCase.makeSolrZkNode(server.getZkHost()); + + final int timeout = random().nextInt(10000) + 5000; + + ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor(timeout); + final long start = System.nanoTime(); + try { + zkCmdExecutor.retryOperation(new ZkOperation() { + @Override + public String execute() throws KeeperException, InterruptedException { + if (System.nanoTime() - start > TimeUnit.NANOSECONDS.convert(timeout, TimeUnit.MILLISECONDS)) { + throw new KeeperException.SessionExpiredException(); + } + throw new KeeperException.ConnectionLossException(); + } + }); + } catch(KeeperException.SessionExpiredException e) { + + } catch (Exception e) { + fail("Expected " + KeeperException.SessionExpiredException.class.getSimpleName() + " but got " + e.getClass().getSimpleName()); + } + } finally { + if (server != null) { + server.shutdown(); + } + } + } public void testMultipleWatchesAsync() throws Exception { try (ZkConnection conn = new ZkConnection ()) { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java index 65d853971a7..c9c89fdd153 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java @@ -27,9 +27,10 @@ import org.apache.zookeeper.data.ACL; public class ZkCmdExecutor { - private long retryDelay = 1500L; // 500 ms over for padding + private long retryDelay = 1500L; // 1500 ms over for padding private int retryCount; private List acl = ZooDefs.Ids.OPEN_ACL_UNSAFE; + private double timeouts; /** * TODO: At this point, this should probably take a SolrZkClient in @@ -40,8 +41,8 @@ public class ZkCmdExecutor { * with this class. */ public ZkCmdExecutor(int timeoutms) { - double timeouts = timeoutms / 1000.0; - this.retryCount = Math.round(0.5f * ((float)Math.sqrt(8.0f * timeouts + 1.0f) - 1.0f)); + timeouts = timeoutms / 1000.0; + this.retryCount = Math.round(0.5f * ((float)Math.sqrt(8.0f * timeouts + 1.0f) - 1.0f)) + 1; } public List getAcl() { @@ -84,7 +85,9 @@ public class ZkCmdExecutor { throw exception; } } - retryDelay(i); + if (i != retryCount -1) { + retryDelay(i); + } } } throw exception; @@ -116,7 +119,7 @@ public class ZkCmdExecutor { */ protected void retryDelay(int attemptCount) throws InterruptedException { if (attemptCount > 0) { - Thread.sleep(attemptCount * retryDelay); + Thread.sleep((attemptCount + 1) * retryDelay); } }