mirror of https://github.com/apache/lucene.git
SOLR-4199: When doing zk retries due to connectionloss, rather than just retrying for 2 minutes, retry in proportion to the session timeout.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1422391 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a6b2552873
commit
307ebd17cb
|
@ -175,6 +175,10 @@ Optimizations
|
|||
|
||||
* SOLR-4063: Allow CoreContainer to load multiple SolrCores in parallel rather
|
||||
than just serially. (Mark Miller)
|
||||
|
||||
* SOLR-4199: When doing zk retries due to connectionloss, rather than just
|
||||
retrying for 2 minutes, retry in proportion to the session timeout.
|
||||
(Mark Miller)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
|
|
@ -62,10 +62,11 @@ public class LeaderElector {
|
|||
|
||||
protected SolrZkClient zkClient;
|
||||
|
||||
private ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor();
|
||||
private ZkCmdExecutor zkCmdExecutor;
|
||||
|
||||
public LeaderElector(SolrZkClient zkClient) {
|
||||
this.zkClient = zkClient;
|
||||
zkCmdExecutor = new ZkCmdExecutor((int) (zkClient.getZkClientTimeout()/1000.0 + 3000));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -99,9 +99,7 @@ public final class ZkController {
|
|||
private final DistributedQueue overseerJobQueue;
|
||||
private final DistributedQueue overseerCollectionQueue;
|
||||
|
||||
// package private for tests
|
||||
|
||||
static final String CONFIGS_ZKNODE = "/configs";
|
||||
public static final String CONFIGS_ZKNODE = "/configs";
|
||||
|
||||
public final static String COLLECTION_PARAM_PREFIX="collection.";
|
||||
public final static String CONFIGNAME_PROP="configName";
|
||||
|
@ -142,6 +140,8 @@ public final class ZkController {
|
|||
|
||||
private int clientTimeout;
|
||||
|
||||
private volatile boolean isClosed;
|
||||
|
||||
public ZkController(final CoreContainer cc, String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout, String localHost, String locaHostPort,
|
||||
String localHostContext, String leaderVoteWait, final CurrentCoreDescriptorProvider registerOnReconnect) throws InterruptedException,
|
||||
TimeoutException, IOException {
|
||||
|
@ -241,7 +241,7 @@ public final class ZkController {
|
|||
|
||||
this.overseerJobQueue = Overseer.getInQueue(zkClient);
|
||||
this.overseerCollectionQueue = Overseer.getCollectionQueue(zkClient);
|
||||
cmdExecutor = new ZkCmdExecutor();
|
||||
cmdExecutor = new ZkCmdExecutor(zkClientTimeout);
|
||||
leaderElector = new LeaderElector(zkClient);
|
||||
zkStateReader = new ZkStateReader(zkClient);
|
||||
|
||||
|
@ -266,6 +266,9 @@ public final class ZkController {
|
|||
descriptor.getCloudDescriptor().isLeader = false;
|
||||
publish(descriptor, ZkStateReader.DOWN, updateLastPublished);
|
||||
} catch (Exception e) {
|
||||
if (isClosed) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch (InterruptedException e1) {
|
||||
|
@ -282,6 +285,9 @@ public final class ZkController {
|
|||
waitForLeaderToSeeDownState(descriptor, coreZkNodeName);
|
||||
} catch (Exception e) {
|
||||
SolrException.log(log, "", e);
|
||||
if (isClosed) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(5000);
|
||||
} catch (InterruptedException e1) {
|
||||
|
@ -307,6 +313,7 @@ public final class ZkController {
|
|||
* Closes the underlying ZooKeeper client.
|
||||
*/
|
||||
public void close() {
|
||||
this.isClosed = true;
|
||||
|
||||
if (cmdDistribExecutor != null) {
|
||||
try {
|
||||
|
|
|
@ -71,11 +71,16 @@ public class SolrZkClient {
|
|||
|
||||
private volatile SolrZooKeeper keeper;
|
||||
|
||||
private ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor();
|
||||
private ZkCmdExecutor zkCmdExecutor;
|
||||
|
||||
private volatile boolean isClosed = false;
|
||||
private ZkClientConnectionStrategy zkClientConnectionStrategy;
|
||||
private int zkClientTimeout;
|
||||
|
||||
public int getZkClientTimeout() {
|
||||
return zkClientTimeout;
|
||||
}
|
||||
|
||||
public SolrZkClient(String zkServerAddress, int zkClientTimeout) {
|
||||
this(zkServerAddress, zkClientTimeout, new DefaultConnectionStrategy(), null);
|
||||
}
|
||||
|
@ -92,6 +97,9 @@ public class SolrZkClient {
|
|||
public SolrZkClient(String zkServerAddress, int zkClientTimeout,
|
||||
ZkClientConnectionStrategy strat, final OnReconnect onReconnect, int clientConnectTimeout) {
|
||||
this.zkClientConnectionStrategy = strat;
|
||||
this.zkClientTimeout = zkClientTimeout;
|
||||
// we must retry at least as long as the session timeout
|
||||
zkCmdExecutor = new ZkCmdExecutor(zkClientTimeout);
|
||||
connManager = new ConnectionManager("ZooKeeperConnection Watcher:"
|
||||
+ zkServerAddress, this, zkServerAddress, zkClientTimeout, strat, onReconnect);
|
||||
try {
|
||||
|
|
|
@ -27,11 +27,13 @@ import org.apache.zookeeper.data.ACL;
|
|||
|
||||
|
||||
public class ZkCmdExecutor {
|
||||
private long retryDelay = 1000L;
|
||||
private int retryCount = 15;
|
||||
private long retryDelay = 1300L; // 300 ms over for padding
|
||||
private int retryCount;
|
||||
private List<ACL> acl = ZooDefs.Ids.OPEN_ACL_UNSAFE;
|
||||
|
||||
public ZkCmdExecutor() {
|
||||
public ZkCmdExecutor(int timeoutms) {
|
||||
double timeouts = timeoutms / 1000.0;
|
||||
this.retryCount = Math.round(0.5f * ((float)Math.sqrt(8.0f * timeouts + 1.0f) - 1.0f));
|
||||
}
|
||||
|
||||
public List<ACL> getAcl() {
|
||||
|
|
|
@ -127,14 +127,16 @@ public class ZkStateReader {
|
|||
|
||||
private boolean closeClient = false;
|
||||
|
||||
private ZkCmdExecutor cmdExecutor = new ZkCmdExecutor();
|
||||
private ZkCmdExecutor cmdExecutor;
|
||||
|
||||
public ZkStateReader(SolrZkClient zkClient) {
|
||||
this.zkClient = zkClient;
|
||||
initZkCmdExecutor(zkClient.getZkClientTimeout());
|
||||
}
|
||||
|
||||
|
||||
public ZkStateReader(String zkServerAddress, int zkClientTimeout, int zkClientConnectTimeout) throws InterruptedException, TimeoutException, IOException {
|
||||
closeClient = true;
|
||||
initZkCmdExecutor(zkClientTimeout);
|
||||
zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout,
|
||||
// on reconnect, reload cloud info
|
||||
new OnReconnect() {
|
||||
|
@ -159,6 +161,11 @@ public class ZkStateReader {
|
|||
});
|
||||
}
|
||||
|
||||
private void initZkCmdExecutor(int zkClientTimeout) {
|
||||
// we must retry at least as long as the session timeout
|
||||
cmdExecutor = new ZkCmdExecutor(zkClientTimeout);
|
||||
}
|
||||
|
||||
// load and publish a new CollectionInfo
|
||||
public void updateClusterState(boolean immediate) throws KeeperException, InterruptedException {
|
||||
updateClusterState(immediate, false);
|
||||
|
|
Loading…
Reference in New Issue