From 307ebd17cb078f784cac771c33decbd72987972e Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Sat, 15 Dec 2012 22:23:07 +0000 Subject: [PATCH] SOLR-4199: When doing zk retries due to connectionloss, rather than just retrying for 2 minutes, retry in proportion to the session timeout. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1422391 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 4 ++++ .../java/org/apache/solr/cloud/LeaderElector.java | 3 ++- .../java/org/apache/solr/cloud/ZkController.java | 15 +++++++++++---- .../apache/solr/common/cloud/SolrZkClient.java | 10 +++++++++- .../apache/solr/common/cloud/ZkCmdExecutor.java | 8 +++++--- .../apache/solr/common/cloud/ZkStateReader.java | 11 +++++++++-- 6 files changed, 40 insertions(+), 11 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 2fa2ac3ce5b..cf58c232508 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -175,6 +175,10 @@ Optimizations * SOLR-4063: Allow CoreContainer to load multiple SolrCores in parallel rather than just serially. (Mark Miller) + +* SOLR-4199: When doing zk retries due to connectionloss, rather than just + retrying for 2 minutes, retry in proportion to the session timeout. + (Mark Miller) Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java index 7be033fc37d..b684b0126cd 100644 --- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java +++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java @@ -62,10 +62,11 @@ public class LeaderElector { protected SolrZkClient zkClient; - private ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor(); + private ZkCmdExecutor zkCmdExecutor; public LeaderElector(SolrZkClient zkClient) { this.zkClient = zkClient; + zkCmdExecutor = new ZkCmdExecutor((int) (zkClient.getZkClientTimeout()/1000.0 + 3000)); } /** diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index a8d4da98d23..a9f6a7a0778 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -99,9 +99,7 @@ public final class ZkController { private final DistributedQueue overseerJobQueue; private final DistributedQueue overseerCollectionQueue; - // package private for tests - - static final String CONFIGS_ZKNODE = "/configs"; + public static final String CONFIGS_ZKNODE = "/configs"; public final static String COLLECTION_PARAM_PREFIX="collection."; public final static String CONFIGNAME_PROP="configName"; @@ -142,6 +140,8 @@ public final class ZkController { private int clientTimeout; + private volatile boolean isClosed; + public ZkController(final CoreContainer cc, String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout, String localHost, String locaHostPort, String localHostContext, String leaderVoteWait, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException, TimeoutException, IOException { @@ -241,7 +241,7 @@ public final class ZkController { this.overseerJobQueue = Overseer.getInQueue(zkClient); this.overseerCollectionQueue = Overseer.getCollectionQueue(zkClient); - cmdExecutor = new ZkCmdExecutor(); + cmdExecutor = new ZkCmdExecutor(zkClientTimeout); leaderElector = new LeaderElector(zkClient); zkStateReader = new ZkStateReader(zkClient); @@ -266,6 +266,9 @@ public final class ZkController { descriptor.getCloudDescriptor().isLeader = false; publish(descriptor, ZkStateReader.DOWN, updateLastPublished); } catch (Exception e) { + if (isClosed) { + return; + } try { Thread.sleep(1000); } catch (InterruptedException e1) { @@ -282,6 +285,9 @@ public final class ZkController { waitForLeaderToSeeDownState(descriptor, coreZkNodeName); } catch (Exception e) { SolrException.log(log, "", e); + if (isClosed) { + return; + } try { Thread.sleep(5000); } catch (InterruptedException e1) { @@ -307,6 +313,7 @@ public final class ZkController { * Closes the underlying ZooKeeper client. */ public void close() { + this.isClosed = true; if (cmdDistribExecutor != null) { try { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java index 5216af2effd..cb60ff09184 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/SolrZkClient.java @@ -71,11 +71,16 @@ public class SolrZkClient { private volatile SolrZooKeeper keeper; - private ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor(); + private ZkCmdExecutor zkCmdExecutor; private volatile boolean isClosed = false; private ZkClientConnectionStrategy zkClientConnectionStrategy; + private int zkClientTimeout; + public int getZkClientTimeout() { + return zkClientTimeout; + } + public SolrZkClient(String zkServerAddress, int zkClientTimeout) { this(zkServerAddress, zkClientTimeout, new DefaultConnectionStrategy(), null); } @@ -92,6 +97,9 @@ public class SolrZkClient { public SolrZkClient(String zkServerAddress, int zkClientTimeout, ZkClientConnectionStrategy strat, final OnReconnect onReconnect, int clientConnectTimeout) { this.zkClientConnectionStrategy = strat; + this.zkClientTimeout = zkClientTimeout; + // we must retry at least as long as the session timeout + zkCmdExecutor = new ZkCmdExecutor(zkClientTimeout); connManager = new ConnectionManager("ZooKeeperConnection Watcher:" + zkServerAddress, this, zkServerAddress, zkClientTimeout, strat, onReconnect); try { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java index 377531bd2a3..fa6e4d0ce22 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkCmdExecutor.java @@ -27,11 +27,13 @@ import org.apache.zookeeper.data.ACL; public class ZkCmdExecutor { - private long retryDelay = 1000L; - private int retryCount = 15; + private long retryDelay = 1300L; // 300 ms over for padding + private int retryCount; private List acl = ZooDefs.Ids.OPEN_ACL_UNSAFE; - public ZkCmdExecutor() { + public ZkCmdExecutor(int timeoutms) { + double timeouts = timeoutms / 1000.0; + this.retryCount = Math.round(0.5f * ((float)Math.sqrt(8.0f * timeouts + 1.0f) - 1.0f)); } public List getAcl() { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 3afea751a67..dd06f46fb39 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -127,14 +127,16 @@ public class ZkStateReader { private boolean closeClient = false; - private ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(); + private ZkCmdExecutor cmdExecutor; public ZkStateReader(SolrZkClient zkClient) { this.zkClient = zkClient; + initZkCmdExecutor(zkClient.getZkClientTimeout()); } - + public ZkStateReader(String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout) throws InterruptedException, TimeoutException, IOException { closeClient = true; + initZkCmdExecutor(zkClientTimeout); zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout, // on reconnect, reload cloud info new OnReconnect() { @@ -159,6 +161,11 @@ public class ZkStateReader { }); } + private void initZkCmdExecutor(int zkClientTimeout) { + // we must retry at least as long as the session timeout + cmdExecutor = new ZkCmdExecutor(zkClientTimeout); + } + // load and publish a new CollectionInfo public void updateClusterState(boolean immediate) throws KeeperException, InterruptedException { updateClusterState(immediate, false);