SOLR-3833: When an election is started because a leader went down, the new leader candidate should decline if the last state it published was not active (a simplified sketch of this check follows these notes).

SOLR-3836: When doing peer sync, we should only count a sync attempt that cannot reach the given host as a success when the candidate leader is syncing with its replicas - not when a replica is syncing to the leader.

SOLR-3835: In our leader election algorithm, if on connection loss we found we did not create our election node, we should retry, not throw an exception.

SOLR-3834: A new leader on cluster startup should also run the leader sync process in case there was a bad cluster shutdown.

SOLR-3772: On cluster startup, we should wait until we see all registered replicas before running the leader process - or, if they do not all come up, wait a configurable amount of time.
  
SOLR-3756: If we are elected the leader of a shard, but we fail to publish this for any reason, we should clean up and re-trigger a leader election.

SOLR-3812: ConnectionLoss during recovery can cause lost updates, leading to shard inconsistency.
  
SOLR-3813: When a new leader syncs, we need to ask all shards to sync back, not just those that are active.

SOLR-3807: Currently during recovery we pause for a number of seconds after waiting for the leader to see a recovering state so that any previous updates will have finished before our commit on the leader - we don't need this wait for peersync.
  
SOLR-3837: When a leader is elected and asks replicas to sync back to it and that fails, we should ask those nodes to recover asynchronously rather than synchronously.
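
A minimal sketch of the SOLR-3833 check described in the first note above, in plain Java. The class and method names are simplified stand-ins rather than the real CloudDescriptor/ShardLeaderElectionContext API, but the rule is the one this commit adds: only take leadership if the last state this core published was active.

// Simplified, self-contained sketch of the SOLR-3833 candidacy check.
// "lastPublished" stands in for the new CloudDescriptor.getLastPublished() value.
public class LastPublishedStateCheck {
    static final String ACTIVE = "active";

    // A candidate whose last published state was "down" or "recovering" may be
    // missing updates, so it declines and lets another replica try first.
    static boolean shouldTryToBeLeader(String lastPublished) {
        return ACTIVE.equals(lastPublished);
    }

    public static void main(String[] args) {
        System.out.println(shouldTryToBeLeader("active"));      // true
        System.out.println(shouldTryToBeLeader("recovering"));  // false
    }
}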

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1384923 13f79535-47bb-0310-9956-ffa450edef68
Mark Robert Miller 2012-09-14 21:05:15 +00:00
parent 785c7d47c0
commit f0185fb9d5
28 changed files with 663 additions and 456 deletions


@ -289,7 +289,7 @@ public abstract class LuceneTestCase extends Assert {
* @see #classRules
*/
private static final String [] IGNORED_INVARIANT_PROPERTIES = {
"user.timezone"
"user.timezone", "java.rmi.server.randomIDs"
};
/** Filesystem-based {@link Directory} implementations. */


@ -70,6 +70,17 @@ Optimizations
* SOLR-3715: improve concurrency of the transaction log by removing
synchronization around log record serialization. (yonik)
* SOLR-3807: Currently during recovery we pause for a number of seconds after
waiting for the leader to see a recovering state so that any previous updates
will have finished before our commit on the leader - we don't need this wait
for peersync. (Mark Miller)
* SOLR-3837: When a leader is elected and asks replicas to sync back to it and
that fails, we should ask those nodes to recover asynchronously rather than
synchronously. (Mark Miller)
* SOLR-3709: Cache the url list created from the ClusterState in CloudSolrServer
on each request. (Mark Miller)
Bug Fixes
----------------------
@ -174,6 +185,36 @@ Bug Fixes
* SOLR-3791: CachedSqlEntityProcessor would throw a NullPointerException when
a query returns a row with a NULL key. (Steffen Moelter via James Dyer)
* SOLR-3833: When an election is started because a leader went down, the new
leader candidate should decline if the last state it published was not
active. (yonik, Mark Miller)
* SOLR-3836: When doing peer sync, we should only count a sync attempt that
cannot reach the given host as a success when the candidate leader is
syncing with the replicas - not when a replica is syncing to the leader.
(Mark Miller)
* SOLR-3835: In our leader election algorithm, if on connection loss we found
we did not create our election node, we should retry, not throw an exception.
(Mark Miller)
* SOLR-3834: A new leader on cluster startup should also run the leader sync
process in case there was a bad cluster shutdown. (Mark Miller)
* SOLR-3772: On cluster startup, we should wait until we see all registered
replicas before running the leader process - or, if they do not all come up,
wait a configurable amount of time. (Mark Miller)
* SOLR-3756: If we are elected the leader of a shard, but we fail to publish
this for any reason, we should clean up and re-trigger a leader election.
(Mark Miller)
* SOLR-3812: ConnectionLoss during recovery can cause lost updates, leading to
shard inconsistency. (Mark Miller)
* SOLR-3813: When a new leader syncs, we need to ask all shards to sync back,
not just those that are active. (Mark Miller)
Other Changes
----------------------


@ -27,7 +27,12 @@ public class CloudDescriptor {
private Integer numShards;
volatile boolean isLeader = false;
volatile String lastPublished;
public String getLastPublished() {
return lastPublished;
}
public boolean isLeader() {
return isLeader;
}


@ -2,8 +2,6 @@ package org.apache.solr.cloud;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@ -93,7 +91,7 @@ class ShardLeaderElectionContextBase extends ElectionContext {
ZkNodeProps m = ZkNodeProps.fromKeyVals(Overseer.QUEUE_OPERATION, "leader",
ZkStateReader.SHARD_ID_PROP, shardId, ZkStateReader.COLLECTION_PROP,
collection, ZkStateReader.BASE_URL_PROP, leaderProps.getProperties()
.get(ZkStateReader.BASE_URL_PROP), ZkStateReader.CORE_NAME_PROP,
.get(ZkStateReader.BASE_URL_PROP), ZkStateReader.CORE_NAME_PROP,
leaderProps.getProperties().get(ZkStateReader.CORE_NAME_PROP),
ZkStateReader.STATE_PROP, ZkStateReader.ACTIVE);
Overseer.getInQueue(zkClient).offer(ZkStateReader.toJSON(m));
@ -110,18 +108,15 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
private CoreContainer cc;
private SyncStrategy syncStrategy = new SyncStrategy();
private boolean afterExpiration;
private volatile boolean isClosed = false;
public ShardLeaderElectionContext(LeaderElector leaderElector,
final String shardId, final String collection,
final String shardZkNodeName, ZkNodeProps props, ZkController zkController, CoreContainer cc, boolean afterExpiration) {
final String shardZkNodeName, ZkNodeProps props, ZkController zkController, CoreContainer cc) {
super(leaderElector, shardId, collection, shardZkNodeName, props,
zkController.getZkStateReader());
this.zkController = zkController;
this.cc = cc;
this.afterExpiration = afterExpiration;
}
@Override
@ -132,7 +127,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
@Override
void runLeaderProcess(boolean weAreReplacement) throws KeeperException,
InterruptedException, IOException {
log.info("Running the leader process. afterExpiration=" + afterExpiration);
log.info("Running the leader process.");
String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
@ -143,7 +138,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
Overseer.getInQueue(zkClient).offer(ZkStateReader.toJSON(m));
String leaderVoteWait = cc.getZkController().getLeaderVoteWait();
if (leaderVoteWait != null) {
if (!weAreReplacement && leaderVoteWait != null) {
waitForReplicasToComeUp(weAreReplacement, leaderVoteWait);
}
@ -161,41 +156,58 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
// should I be leader?
if (weAreReplacement && !shouldIBeLeader(leaderProps, core)) {
// System.out.println("there is a better leader candidate it appears");
rejoinLeaderElection(leaderSeqPath, core);
return;
}
if (weAreReplacement) {
log.info("I may be the new leader - try and sync");
// we are going to attempt to be the leader
// first cancel any current recovery
core.getUpdateHandler().getSolrCoreState().cancelRecovery();
boolean success = syncStrategy.sync(zkController, core, leaderProps);
// solrcloud_debug
// try {
// RefCounted<SolrIndexSearcher> searchHolder =
// core.getNewestSearcher(false);
// SolrIndexSearcher searcher = searchHolder.get();
// try {
// System.out.println(core.getCoreDescriptor().getCoreContainer().getZkController().getNodeName()
// + " synched "
// + searcher.search(new MatchAllDocsQuery(), 1).totalHits);
// } finally {
// searchHolder.decref();
// }
// } catch (Exception e) {
//
// }
if (!success && anyoneElseActive()) {
rejoinLeaderElection(leaderSeqPath, core);
return;
}
log.info("I may be the new leader - try and sync");
// we are going to attempt to be the leader
// first cancel any current recovery
core.getUpdateHandler().getSolrCoreState().cancelRecovery();
boolean success = false;
try {
success = syncStrategy.sync(zkController, core, leaderProps);
} catch (Throwable t) {
SolrException.log(log, "Exception while trying to sync", t);
success = false;
}
// if !success but no one else is in active mode,
// we are the leader anyway
// TODO: should we also be leader if there is only one other active?
// if we couldn't sync with it, it shouldn't be able to sync with us
// TODO: this needs to be moved to the election context - the logic does
// not belong here.
if (!success
&& !areAnyOtherReplicasActive(zkController, leaderProps, collection,
shardId)) {
log.info("Sync was not a success but no one else is active! I am the leader");
success = true;
}
// solrcloud_debug
// try {
// RefCounted<SolrIndexSearcher> searchHolder =
// core.getNewestSearcher(false);
// SolrIndexSearcher searcher = searchHolder.get();
// try {
// System.out.println(core.getCoreDescriptor().getCoreContainer().getZkController().getNodeName()
// + " synched "
// + searcher.search(new MatchAllDocsQuery(), 1).totalHits);
// } finally {
// searchHolder.decref();
// }
// } catch (Exception e) {
//
// }
if (!success) {
rejoinLeaderElection(leaderSeqPath, core);
return;
}
log.info("I am the new leader: "
+ ZkCoreNodeProps.getCoreUrl(leaderProps));
core.getCoreDescriptor().getCloudDescriptor().isLeader = true;
} finally {
if (core != null) {
core.close();
@ -205,101 +217,115 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
try {
super.runLeaderProcess(weAreReplacement);
} catch (Throwable t) {
cancelElection();
try {
core = cc.getCore(coreName);
core.getCoreDescriptor().getCloudDescriptor().isLeader = false;
if (!cc.isShutDown()) {
// we could not publish ourselves as leader - rejoin election
rejoinLeaderElection(coreName, core);
}
// we could not publish ourselves as leader - rejoin election
rejoinLeaderElection(coreName, core);
} finally {
if (core != null) {
core.close();
}
}
}
try {
core = cc.getCore(coreName);
// we do this after the above super. call so that we don't
// briefly think we are the leader and then end up not being
// able to publish that we are the leader.
core.getCoreDescriptor().getCloudDescriptor().isLeader = true;
} finally {
if (core != null) {
core.close();
}
}
}
private boolean areAnyOtherReplicasActive(ZkController zkController,
ZkNodeProps leaderProps, String collection, String shardId) {
ClusterState clusterState = zkController.getZkStateReader()
.getClusterState();
Map<String,Slice> slices = clusterState.getSlices(collection);
Slice slice = slices.get(shardId);
Map<String,Replica> replicasMap = slice.getReplicasMap();
for (Map.Entry<String,Replica> shard : replicasMap.entrySet()) {
String state = shard.getValue().getStr(ZkStateReader.STATE_PROP);
// System.out.println("state:"
// + state
// + shard.getValue().get(ZkStateReader.NODE_NAME_PROP)
// + " live: "
// + clusterState.liveNodesContain(shard.getValue().get(
// ZkStateReader.NODE_NAME_PROP)));
if (state.equals(ZkStateReader.ACTIVE)
&& clusterState.liveNodesContain(shard.getValue().getStr(
ZkStateReader.NODE_NAME_PROP))
&& !new ZkCoreNodeProps(shard.getValue()).getCoreUrl().equals(
new ZkCoreNodeProps(leaderProps).getCoreUrl())) {
return true;
}
}
return false;
}
private void waitForReplicasToComeUp(boolean weAreReplacement, String leaderVoteWait)
throws InterruptedException {
private void waitForReplicasToComeUp(boolean weAreReplacement,
String leaderVoteWait) throws InterruptedException {
int timeout = Integer.parseInt(leaderVoteWait);
long timeoutAt = System.currentTimeMillis() + timeout;
boolean tryAgain = true;
final String shardsElectZkPath = electionPath + LeaderElector.ELECTION_NODE;
Slice slices = zkController.getClusterState().getSlice(collection, shardId);
while (true && !isClosed) {
// wait for everyone to be up
if (slices != null) {
Map<String,Replica> shards = slices.getReplicasMap();
Set<Entry<String,Replica>> entrySet = shards.entrySet();
int found = 0;
tryAgain = false;
for (Entry<String,Replica> entry : entrySet) {
ZkCoreNodeProps props = new ZkCoreNodeProps(entry.getValue());
if (props.getState().equals(ZkStateReader.ACTIVE)
&& zkController.getClusterState().liveNodesContain(
props.getNodeName())) {
found++;
}
try {
found = zkClient.getChildren(shardsElectZkPath, null, true).size();
} catch (KeeperException e) {
SolrException.log(log,
"Errir checking for the number of election participants", e);
}
// on startup and after connection timeout, wait for all known shards
if ((afterExpiration || !weAreReplacement)
&& found >= slices.getReplicasMap().size()) {
if (found >= slices.getReplicasMap().size()) {
log.info("Enough replicas found to continue.");
break;
} else if (!afterExpiration && found >= slices.getReplicasMap().size() - 1) {
// a previous leader went down - wait for one less than the total
// known shards
log.info("Enough replicas found to continue.");
break;
return;
} else {
log.info("Waiting until we see more replicas up: total=" + slices.getReplicasMap().size() + " found=" + found + " timeoutin=" + (timeoutAt - System.currentTimeMillis()));
log.info("Waiting until we see more replicas up: total="
+ slices.getReplicasMap().size() + " found=" + found
+ " timeoutin=" + (timeoutAt - System.currentTimeMillis()));
}
if (System.currentTimeMillis() > timeoutAt) {
log.info("Was waiting for replicas to come up, but they are taking too long - assuming they won't come back till later");
break;
return;
}
}
if (tryAgain) {
Thread.sleep(500);
slices = zkController.getClusterState().getSlice(collection, shardId);
}
Thread.sleep(500);
slices = zkController.getClusterState().getSlice(collection, shardId);
}
}
private void rejoinLeaderElection(String leaderSeqPath, SolrCore core)
throws InterruptedException, KeeperException, IOException {
// remove our ephemeral and re join the election
// System.out.println("sync failed, delete our election node:"
// + leaderSeqPath);
if (cc.isShutDown()) {
log.info("Not rejoining election because CoreContainer is shutdown");
return;
}
log.info("There is a better leader candidate than us - going back into recovery");
zkController.publish(core.getCoreDescriptor(), ZkStateReader.DOWN);
try {
zkController.publish(core.getCoreDescriptor(), ZkStateReader.DOWN);
} catch (Throwable t) {
SolrException.log(log, "Error trying to publish down state", t);
}
cancelElection();
core.getUpdateHandler().getSolrCoreState().doRecovery(cc, core.getName());
try {
core.getUpdateHandler().getSolrCoreState().doRecovery(cc, core.getName());
} catch (Throwable t) {
SolrException.log(log, "Error trying to start recovery", t);
}
leaderElector.joinElection(this);
}
private boolean shouldIBeLeader(ZkNodeProps leaderProps, SolrCore core) {
log.info("Checking if I should try and be the leader.");
@ -308,66 +334,13 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
return false;
}
ClusterState clusterState = zkController.getZkStateReader().getClusterState();
Map<String,Slice> slices = clusterState.getSlices(this.collection);
Slice slice = slices.get(shardId);
Map<String,Replica> shards = slice.getReplicasMap();
boolean foundSomeoneElseActive = false;
for (Map.Entry<String,Replica> shard : shards.entrySet()) {
String state = shard.getValue().getStr(ZkStateReader.STATE_PROP);
if (new ZkCoreNodeProps(shard.getValue()).getCoreUrl().equals(
new ZkCoreNodeProps(leaderProps).getCoreUrl())) {
if (state.equals(ZkStateReader.ACTIVE)
&& clusterState.liveNodesContain(shard.getValue().getStr(
ZkStateReader.NODE_NAME_PROP))) {
// we are alive
log.info("I am Active and live, it's okay to be the leader.");
return true;
}
}
if ((state.equals(ZkStateReader.ACTIVE))
&& clusterState.liveNodesContain(shard.getValue().getStr(
ZkStateReader.NODE_NAME_PROP))
&& !new ZkCoreNodeProps(shard.getValue()).getCoreUrl().equals(
new ZkCoreNodeProps(leaderProps).getCoreUrl())) {
foundSomeoneElseActive = true;
}
}
if (!foundSomeoneElseActive) {
log.info("I am not Active but no one else is either, it's okay to be the leader");
try {
zkController.publish(core.getCoreDescriptor(), ZkStateReader.ACTIVE);
} catch (KeeperException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException(e);
}
} else {
log.info("I am not Active and someone else appears to be a better leader candidate.");
}
return !foundSomeoneElseActive;
}
private boolean anyoneElseActive() {
ClusterState clusterState = zkController.getZkStateReader().getClusterState();
Map<String,Slice> slices = clusterState.getSlices(this.collection);
Slice slice = slices.get(shardId);
Map<String,Replica> shards = slice.getReplicasMap();
for (Map.Entry<String,Replica> shard : shards.entrySet()) {
String state = shard.getValue().getStr(ZkStateReader.STATE_PROP);
if ((state.equals(ZkStateReader.ACTIVE))
&& clusterState.liveNodesContain(shard.getValue().getStr(
ZkStateReader.NODE_NAME_PROP))) {
return true;
}
if (core.getCoreDescriptor().getCloudDescriptor().getLastPublished().equals(ZkStateReader.ACTIVE)) {
log.info("My last published State was Active, it's okay to be the leader.");
return true;
}
// TODO: and if no one is a good candidate?
return false;
}

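A compact sketch of the SOLR-3772 wait implemented by waitForReplicasToComeUp above: the would-be leader blocks until every known replica has joined the election, or until the configured leaderVoteWait period passes. The ReplicaCounter interface is an assumption standing in for the cluster-state and ZooKeeper lookups in the real code.

// Sketch only: wait for all known replicas to join the election, or time out.
public class WaitForReplicas {
    interface ReplicaCounter {
        int knownReplicas();   // replicas listed in the cluster state
        int joinedElection();  // children of the /election node
    }

    static void waitForReplicasToComeUp(ReplicaCounter counter, long leaderVoteWaitMillis)
            throws InterruptedException {
        long timeoutAt = System.currentTimeMillis() + leaderVoteWaitMillis;
        while (true) {
            if (counter.joinedElection() >= counter.knownReplicas()) {
                return; // everyone is here - safe to run the leader process
            }
            if (System.currentTimeMillis() > timeoutAt) {
                return; // they are taking too long - proceed without them
            }
            Thread.sleep(500); // polling interval matching the code above
        }
    }
}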

@ -52,12 +52,11 @@ import org.slf4j.LoggerFactory;
* a watch on the next lowest node it finds, and if that node goes down,
* starts the whole process over by checking if it's the lowest sequential node, etc.
*
* TODO: now we could just reuse the lock package code for leader election
*/
public class LeaderElector {
private static Logger log = LoggerFactory.getLogger(LeaderElector.class);
private static final String ELECTION_NODE = "/election";
static final String ELECTION_NODE = "/election";
private final static Pattern LEADER_SEQ = Pattern.compile(".*?/?.*?-n_(\\d+)");
private final static Pattern SESSION_ID = Pattern.compile(".*?/?(.*?-.*?)-n_\\d+");
@ -161,6 +160,9 @@ public class LeaderElector {
/**
* Returns int given String of form n_0000000001 or n_0000000003, etc.
*
* @param nStringSequence
* @return sequence number
*/
private int getSeq(String nStringSequence) {
int seq = 0;
@ -188,6 +190,9 @@ public class LeaderElector {
/**
* Returns int list given list of form n_0000000001, n_0000000003, etc.
*
* @param seqs
* @return
*/
private List<Integer> getSeqs(List<String> seqs) {
List<Integer> intSeqs = new ArrayList<Integer>(seqs.size());
@ -239,18 +244,31 @@ public class LeaderElector {
}
}
if (!foundId) {
throw e;
cont = true;
if (tries++ > 20) {
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
"", e);
}
try {
Thread.sleep(50);
} catch (InterruptedException e2) {
Thread.currentThread().interrupt();
}
}
} catch (KeeperException.NoNodeException e) {
// we must have failed in creating the election node - someone else must
// be working on it, lets try again
if (tries++ > 9) {
if (tries++ > 20) {
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
"", e);
}
cont = true;
Thread.sleep(50);
try {
Thread.sleep(50);
} catch (InterruptedException e2) {
Thread.currentThread().interrupt();
}
}
}
int seq = getSeq(leaderSeqPath);

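A minimal sketch of the bounded retry the LeaderElector now uses when a connection loss leaves it unclear whether our election node was created (SOLR-3835). The limits mirror the code above (up to 20 tries, 50 ms apart); the Action interface is an assumption used only to keep the example self-contained.

// Sketch only: retry a ZooKeeper operation a bounded number of times instead of
// failing on the first connection-loss induced error.
public class BoundedRetry {
    interface Action {
        void run() throws Exception;
    }

    static void retry(Action action) throws Exception {
        int tries = 0;
        while (true) {
            try {
                action.run();
                return;
            } catch (Exception e) {
                if (++tries > 20) {
                    throw e; // give up and surface the original failure
                }
                Thread.sleep(50); // brief pause before trying again
            }
        }
    }
}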

@ -516,11 +516,11 @@ public class Overseer {
} catch (KeeperException.NodeExistsException e) {
//ok
} catch (InterruptedException e) {
log.error("Could not create Overseer node: " + e.getClass() + ":" + e.getMessage());
log.error("Could not create Overseer node", e);
Thread.currentThread().interrupt();
throw new RuntimeException(e);
} catch (KeeperException e) {
log.error("Could not create Overseer node: " + e.getClass() + ":" + e.getMessage());
log.error("Could not create Overseer node", e);
throw new RuntimeException(e);
}
}


@ -128,6 +128,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
// if we are the leader, either we are trying to recover faster
// then our ephemeral timed out or we are the only node
if (!leaderBaseUrl.equals(baseUrl)) {
// send commit
commitOnLeader(leaderUrl);
@ -194,7 +195,6 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
prepCmd.setState(ZkStateReader.RECOVERING);
prepCmd.setCheckLive(true);
prepCmd.setOnlyIfLeader(true);
prepCmd.setPauseFor(6000);
server.request(prepCmd);
server.shutdown();
@ -317,7 +317,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
.getCloudDescriptor();
ZkNodeProps leaderprops = zkStateReader.getLeaderProps(
cloudDesc.getCollectionName(), cloudDesc.getShardId());
String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
@ -338,9 +338,6 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
}
zkController.publish(core.getCoreDescriptor(), ZkStateReader.RECOVERING);
sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName);
// first thing we just try to sync
if (firstTime) {
@ -355,6 +352,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
if (syncSuccess) {
SolrQueryRequest req = new LocalSolrQueryRequest(core,
new ModifiableSolrParams());
// force open a new searcher
core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
log.info("PeerSync Recovery was successful - registering as Active. core=" + coreName);
@ -384,24 +382,24 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
log.info("PeerSync Recovery was not successful - trying replication. core=" + coreName);
}
//System.out.println("Sync Recovery was not successful - trying replication");
log.info("Starting Replication Recovery. core=" + coreName);
sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName);
// we wait a bit so that any updates on the leader
// that started before they saw recovering state
// are sure to have finished
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
log.info("Begin buffering updates. core=" + coreName);
ulog.bufferUpdates();
replayed = false;
// // open a new IndexWriter - we don't want any background merges ongoing
// // also ensures something like NRTCachingDirectory is flushed
// boolean forceNewIndexDir = false;
// try {
// core.getUpdateHandler().newIndexWriter(false);
// } catch (Throwable t) {
// SolrException.log(log, "Could not read the current index - replicating to a new directory", t);
// // something is wrong with the index
// // we need to force using a new index directory
// forceNewIndexDir = true;
// }
//
try {
replicate(zkController.getNodeName(), core,
@ -507,7 +505,11 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
} else {
log.info("Replaying buffered documents. core=" + coreName);
// wait for replay
future.get();
RecoveryInfo report = future.get();
if (report.failed) {
SolrException.log(log, "Replay failed");
throw new SolrException(ErrorCode.SERVER_ERROR, "Replay failed");
}
}
// solrcloud_debug

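A compact sketch of the replay check added above for SOLR-3812: recovery must be treated as failed when log replay reports a failure, instead of silently registering as active with missing updates. The RecoveryInfo class here is a pared-down stand-in for the UpdateLog class of the same name.

import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

// Sketch only: block on the replay future and fail recovery if replay failed.
public class ReplayCheck {
    static final class RecoveryInfo {
        boolean failed;
    }

    static void waitForReplay(Future<RecoveryInfo> replay)
            throws InterruptedException, ExecutionException {
        RecoveryInfo report = replay.get(); // wait for buffered updates to be replayed
        if (report.failed) {
            throw new IllegalStateException("Replay failed"); // forces another recovery attempt
        }
    }
}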

@ -20,7 +20,6 @@ package org.apache.solr.cloud;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.client.HttpClient;
import org.apache.solr.client.solrj.SolrServerException;
@ -28,9 +27,6 @@ import org.apache.solr.client.solrj.impl.HttpClientUtil;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
@ -49,6 +45,8 @@ import org.slf4j.LoggerFactory;
public class SyncStrategy {
protected final Logger log = LoggerFactory.getLogger(getClass());
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
private final ShardHandler shardHandler;
private final static HttpClient client;
@ -73,6 +71,9 @@ public class SyncStrategy {
public boolean sync(ZkController zkController, SolrCore core,
ZkNodeProps leaderProps) {
if (SKIP_AUTO_RECOVERY) {
return true;
}
log.info("Sync replicas to " + ZkCoreNodeProps.getCoreUrl(leaderProps));
// TODO: look at our state usage of sync
// zkController.publish(core, ZkStateReader.SYNC);
@ -94,6 +95,21 @@ public class SyncStrategy {
String collection = cloudDesc.getCollectionName();
String shardId = cloudDesc.getShardId();
// if no one that is up is active, we are willing to wait...
// we don't want a recovering node to become leader and then
// a better candidate pops up a second later.
// int tries = 20;
// while (!areAnyReplicasActive(zkController, collection, shardId)) {
// if (tries-- == 0) {
// break;
// }
// try {
// Thread.sleep(500);
// } catch (InterruptedException e) {
// Thread.currentThread().interrupt();
// }
// }
// first sync ourselves - we are the potential leader after all
try {
success = syncWithReplicas(zkController, core, leaderProps, collection,
@ -102,18 +118,7 @@ public class SyncStrategy {
SolrException.log(log, "Sync Failed", e);
}
try {
// if !success but no one else is in active mode,
// we are the leader anyway
// TODO: should we also be leader if there is only one other active?
// if we couldn't sync with it, it shouldn't be able to sync with us
if (!success
&& !areAnyOtherReplicasActive(zkController, leaderProps, collection,
shardId)) {
log.info("Sync was not a success but no one else is active! I am the leader");
zkController.publish(core.getCoreDescriptor(), ZkStateReader.ACTIVE);
success = true;
}
if (success) {
log.info("Sync Success - now sync replicas to me");
@ -121,7 +126,6 @@ public class SyncStrategy {
} else {
SolrException.log(log, "Sync Failed");
// lets see who seems ahead...
}
@ -132,39 +136,12 @@ public class SyncStrategy {
return success;
}
private boolean areAnyOtherReplicasActive(ZkController zkController,
ZkNodeProps leaderProps, String collection, String shardId) {
ClusterState clusterState = zkController.getZkStateReader().getClusterState();
Map<String,Slice> slices = clusterState.getSlices(collection);
Slice slice = slices.get(shardId);
Map<String,Replica> shards = slice.getReplicasMap();
for (Map.Entry<String,Replica> shard : shards.entrySet()) {
String state = shard.getValue().getStr(ZkStateReader.STATE_PROP);
// System.out.println("state:"
// + state
// + shard.getValue().get(ZkStateReader.NODE_NAME_PROP)
// + " live: "
// + clusterState.liveNodesContain(shard.getValue().get(
// ZkStateReader.NODE_NAME_PROP)));
if ((state.equals(ZkStateReader.ACTIVE))
&& clusterState.liveNodesContain(shard.getValue().getStr(
ZkStateReader.NODE_NAME_PROP))
&& !new ZkCoreNodeProps(shard.getValue()).getCoreUrl().equals(
new ZkCoreNodeProps(leaderProps).getCoreUrl())) {
return true;
}
}
return false;
}
private boolean syncWithReplicas(ZkController zkController, SolrCore core,
ZkNodeProps props, String collection, String shardId) {
List<ZkCoreNodeProps> nodes = zkController.getZkStateReader()
.getReplicaProps(collection, shardId,
props.getStr(ZkStateReader.NODE_NAME_PROP),
props.getStr(ZkStateReader.CORE_NAME_PROP), ZkStateReader.ACTIVE); // TODO:
// TODO should there be a state filter?
props.getStr(ZkStateReader.CORE_NAME_PROP));
if (nodes == null) {
// I have no replicas
@ -173,14 +150,13 @@ public class SyncStrategy {
List<String> syncWith = new ArrayList<String>();
for (ZkCoreNodeProps node : nodes) {
// if we see a leader, must be stale state, and this is the guy that went down
if (!node.getNodeProps().keySet().contains(ZkStateReader.LEADER_PROP)) {
syncWith.add(node.getCoreUrl());
}
syncWith.add(node.getCoreUrl());
}
PeerSync peerSync = new PeerSync(core, syncWith, core.getUpdateHandler().getUpdateLog().numRecordsToKeep);
// if we can't reach a replica for sync, we still consider the overall sync a success
// TODO: as an assurance, we should still try and tell the sync nodes that we couldn't reach
// to recover once more?
PeerSync peerSync = new PeerSync(core, syncWith, core.getUpdateHandler().getUpdateLog().numRecordsToKeep, true);
return peerSync.sync();
}
@ -193,7 +169,7 @@ public class SyncStrategy {
.getZkStateReader()
.getReplicaProps(collection, shardId,
leaderProps.getStr(ZkStateReader.NODE_NAME_PROP),
leaderProps.getStr(ZkStateReader.CORE_NAME_PROP), ZkStateReader.ACTIVE);
leaderProps.getStr(ZkStateReader.CORE_NAME_PROP));
if (nodes == null) {
log.info(ZkCoreNodeProps.getCoreUrl(leaderProps) + " has no replicas");
return;
@ -224,7 +200,7 @@ public class SyncStrategy {
try {
log.info(ZkCoreNodeProps.getCoreUrl(leaderProps) + ": Sync failed - asking replica (" + srsp.getShardAddress() + ") to recover.");
requestRecovery(((ShardCoreRequest)srsp.getShardRequest()).baseUrl, ((ShardCoreRequest)srsp.getShardRequest()).coreName);
requestRecovery(leaderProps, ((ShardCoreRequest)srsp.getShardRequest()).baseUrl, ((ShardCoreRequest)srsp.getShardRequest()).coreName);
} catch (Throwable t) {
SolrException.log(log, ZkCoreNodeProps.getCoreUrl(leaderProps) + ": Could not tell a replica to recover", t);
@ -271,16 +247,29 @@ public class SyncStrategy {
shardHandler.submit(sreq, replica, sreq.params);
}
private void requestRecovery(String baseUrl, String coreName) throws SolrServerException, IOException {
private void requestRecovery(final ZkNodeProps leaderProps, final String baseUrl, final String coreName) throws SolrServerException, IOException {
// TODO: do this in background threads
RequestRecovery recoverRequestCmd = new RequestRecovery();
recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
recoverRequestCmd.setCoreName(coreName);
HttpSolrServer server = new HttpSolrServer(baseUrl);
server.setConnectionTimeout(45000);
server.setSoTimeout(45000);
server.request(recoverRequestCmd);
Thread thread = new Thread() {
{
setDaemon(true);
}
@Override
public void run() {
RequestRecovery recoverRequestCmd = new RequestRecovery();
recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
recoverRequestCmd.setCoreName(coreName);
HttpSolrServer server = new HttpSolrServer(baseUrl);
server.setConnectionTimeout(45000);
server.setSoTimeout(45000);
try {
server.request(recoverRequestCmd);
} catch (Throwable t) {
SolrException.log(log, ZkCoreNodeProps.getCoreUrl(leaderProps) + ": Could not tell a replica to recover", t);
}
}
};
thread.start();
}
public static ModifiableSolrParams params(String... params) {

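A hedged sketch of the same asynchronous recovery request (SOLR-3837) using a shared ExecutorService instead of a one-off thread; the executor, the method name, and the sendRequestRecovery helper below are assumptions for illustration, not the SolrJ API.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Sketch only: fire the recovery request from a background pool so the newly
// elected leader is never blocked on an unreachable replica.
public class AsyncRecoveryRequest {
    private static final ExecutorService recoveryExecutor = Executors.newCachedThreadPool();

    static void requestRecoveryAsync(final String replicaBaseUrl, final String coreName) {
        recoveryExecutor.submit(new Runnable() {
            public void run() {
                try {
                    // In Solr this would be a CoreAdmin REQUESTRECOVERY call over HTTP; stubbed here.
                    sendRequestRecovery(replicaBaseUrl, coreName);
                } catch (Exception e) {
                    // Log and continue - the replica will be asked again or marked down elsewhere.
                    System.err.println("Could not tell " + replicaBaseUrl + "/" + coreName + " to recover: " + e);
                }
            }
        });
    }

    // Hypothetical stand-in for the real HTTP request.
    static void sendRequestRecovery(String baseUrl, String coreName) throws Exception {
        System.out.println("REQUESTRECOVERY -> " + baseUrl + " core=" + coreName);
    }
}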

@ -27,6 +27,9 @@ import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.Future;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -48,17 +51,19 @@ import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.cloud.ZooKeeperException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.core.Config;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.util.DOMUtil;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.KeeperException.SessionExpiredException;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -96,6 +101,11 @@ public final class ZkController {
public final static String COLLECTION_PARAM_PREFIX="collection.";
public final static String CONFIGNAME_PROP="configName";
private ThreadPoolExecutor cmdDistribExecutor = new ThreadPoolExecutor(
0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
new SynchronousQueue<Runnable>(), new DefaultSolrThreadFactory(
"cmdDistribExecutor"));
private final Map<String, ElectionContext> electionContexts = Collections.synchronizedMap(new HashMap<String, ElectionContext>());
private SolrZkClient zkClient;
@ -125,6 +135,9 @@ public final class ZkController {
private String leaderVoteWait;
private int clientTimeout;
/**
* @param cc
* @param zkServerAddress
@ -177,13 +190,19 @@ public final class ZkController {
this.nodeName = this.hostName + ':' + this.localHostPort + '_' + this.localHostContext;
this.baseURL = this.localHost + ":" + this.localHostPort + "/" + this.localHostContext;
this.leaderVoteWait = leaderVoteWait;
this.clientTimeout = zkClientTimeout;
zkClient = new SolrZkClient(zkServerAddress, zkClientTimeout, zkClientConnectTimeout,
// on reconnect, reload cloud info
new OnReconnect() {
public void command() {
try {
markAllAsNotLeader(registerOnReconnect);
// this is troublesome - we dont want to kill anything the old leader accepted
// though I guess sync will likely get those updates back? But only if
// he is involved in the sync, and he certainly may not be
// ExecutorUtil.shutdownAndAwaitTermination(cc.getCmdDistribExecutor());
// we need to create all of our lost watches
// seems we dont need to do this again...
@ -192,16 +211,18 @@ public final class ZkController {
String adminPath;
shardHandler = cc.getShardHandlerFactory().getShardHandler();
adminPath = cc.getAdminPath();
ExecutorUtil.shutdownAndAwaitTermination(cc.getCmdDistribExecutor());
cc.newCmdDistribExecutor();
cc.cancelCoreRecoveries();
registerAllCoresAsDown(registerOnReconnect, false);
ZkController.this.overseer = new Overseer(shardHandler, adminPath, zkStateReader);
ElectionContext context = new OverseerElectionContext(zkClient, overseer, getNodeName());
overseerElector.joinElection(context);
zkStateReader.createClusterStateWatchersAndUpdate();
registerAllCoresAsDown(registerOnReconnect);
// cc.newCmdDistribExecutor();
// we have to register as live first to pick up docs in the buffer
createEphemeralLiveNode();
@ -250,7 +271,7 @@ public final class ZkController {
}
private void registerAllCoresAsDown(
final CurrentCoreDescriptorProvider registerOnReconnect) {
final CurrentCoreDescriptorProvider registerOnReconnect, boolean updateLastPublished) {
List<CoreDescriptor> descriptors = registerOnReconnect
.getCurrentDescriptors();
if (descriptors != null) {
@ -261,20 +282,58 @@ public final class ZkController {
+ descriptor.getName();
try {
descriptor.getCloudDescriptor().isLeader = false;
publish(descriptor, ZkStateReader.DOWN);
publish(descriptor, ZkStateReader.DOWN, updateLastPublished);
} catch (Exception e) {
try {
Thread.sleep(1000);
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
}
try {
publish(descriptor, ZkStateReader.DOWN);
} catch (Exception e2) {
SolrException.log(log, "", e2);
continue;
}
}
try {
waitForLeaderToSeeDownState(descriptor, coreZkNodeName);
} catch (Exception e) {
SolrException.log(log, "", e);
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
}
}
}
}
}
private void markAllAsNotLeader(
final CurrentCoreDescriptorProvider registerOnReconnect) {
List<CoreDescriptor> descriptors = registerOnReconnect
.getCurrentDescriptors();
if (descriptors != null) {
for (CoreDescriptor descriptor : descriptors) {
descriptor.getCloudDescriptor().isLeader = false;
}
}
}
/**
* Closes the underlying ZooKeeper client.
*/
public void close() {
if (cmdDistribExecutor != null) {
try {
ExecutorUtil.shutdownNowAndAwaitTermination(cmdDistribExecutor);
} catch (Throwable e) {
SolrException.log(log, e);
}
}
for (ElectionContext context : electionContexts.values()) {
context.close();
}
@ -380,7 +439,7 @@ public final class ZkController {
}
private void init(CurrentCoreDescriptorProvider registerOnReconnect) {
registerAllCoresAsDown(registerOnReconnect);
registerAllCoresAsDown(registerOnReconnect, true);
try {
// makes nodes zkNode
@ -658,8 +717,22 @@ public final class ZkController {
/**
* Get leader props directly from zk nodes.
*/
private ZkCoreNodeProps getLeaderProps(final String collection,
public ZkCoreNodeProps getLeaderProps(final String collection,
final String slice) throws InterruptedException {
return getLeaderProps(collection, slice, false);
}
/**
* Get leader props directly from zk nodes.
*
* @param collection
* @param slice
* @return
* @throws KeeperException
* @throws InterruptedException
*/
public ZkCoreNodeProps getLeaderProps(final String collection,
final String slice, boolean failImmediatelyOnExpiration) throws InterruptedException {
int iterCount = 60;
Exception exp = null;
while (iterCount-- > 0) {
@ -672,15 +745,21 @@ public final class ZkController {
return leaderProps;
} catch (InterruptedException e) {
throw e;
} catch (Exception e) {
} catch (SessionExpiredException e) {
if (failImmediatelyOnExpiration) {
throw new RuntimeException("Session has expired - could not get leader props", exp);
}
exp = e;
Thread.sleep(500);
} catch (Exception e) {
exp = e;
Thread.sleep(500);
}
if (cc.isShutDown()) {
throw new RuntimeException("CoreContainer is shutdown");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "CoreContainer is shutdown");
}
}
throw new RuntimeException("Could not get leader props", exp);
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Could not get leader props", exp);
}
@ -700,7 +779,7 @@ public final class ZkController {
.getCollectionName();
ElectionContext context = new ShardLeaderElectionContext(leaderElector, shardId,
collection, coreZkNodeName, ourProps, this, cc, afterExpiration);
collection, coreZkNodeName, ourProps, this, cc);
leaderElector.setup(context);
electionContexts.put(coreZkNodeName, context);
@ -755,6 +834,10 @@ public final class ZkController {
return baseURL;
}
public void publish(final CoreDescriptor cd, final String state) throws KeeperException, InterruptedException {
publish(cd, state, true);
}
/**
* Publish core state to overseer.
* @param cd
@ -762,7 +845,7 @@ public final class ZkController {
* @throws KeeperException
* @throws InterruptedException
*/
public void publish(final CoreDescriptor cd, final String state) throws KeeperException, InterruptedException {
public void publish(final CoreDescriptor cd, final String state, boolean updateLastState) throws KeeperException, InterruptedException {
//System.out.println(Thread.currentThread().getStackTrace()[3]);
Integer numShards = cd.getCloudDescriptor().getNumShards();
if (numShards == null) { //XXX sys prop hack
@ -780,6 +863,7 @@ public final class ZkController {
.getCollectionName(), ZkStateReader.STATE_PROP, state,
ZkStateReader.NUM_SHARDS_PROP, numShards != null ? numShards.toString()
: null);
cd.getCloudDescriptor().lastPublished = state;
overseerJobQueue.offer(ZkStateReader.toJSON(m));
}
@ -1103,7 +1187,6 @@ public final class ZkController {
prepCmd.setNodeName(getNodeName());
prepCmd.setCoreNodeName(coreZkNodeName);
prepCmd.setState(ZkStateReader.DOWN);
prepCmd.setPauseFor(0);
// let's retry a couple times - perhaps the leader just went down,
// or perhaps he is just not quite ready for us yet
@ -1211,5 +1294,14 @@ public final class ZkController {
public DistributedQueue getOverseerCollectionQueue() {
return overseerCollectionQueue;
}
public int getClientTimeout() {
return clientTimeout;
}
// may return null if not in zk mode
public ThreadPoolExecutor getCmdDistribExecutor() {
return cmdDistribExecutor;
}
}


@ -144,8 +144,6 @@ public class CoreContainer
private String zkHost;
private Map<SolrCore,String> coreToOrigName = new ConcurrentHashMap<SolrCore,String>();
private String leaderVoteWait;
private volatile ThreadPoolExecutor cmdDistribExecutor;
{
log.info("New CoreContainer " + System.identityHashCode(this));
@ -190,8 +188,6 @@ public class CoreContainer
}
protected void initZooKeeper(String zkHost, int zkClientTimeout) {
newCmdDistribExecutor();
// if zkHost sys property is not set, we are not using ZooKeeper
String zookeeperHost;
if(zkHost == null) {
@ -294,17 +290,6 @@ public class CoreContainer
}
public void newCmdDistribExecutor() {
cmdDistribExecutor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5,
TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
new DefaultSolrThreadFactory("cmdDistribExecutor"));
}
// may return null if not in zk mode
public ThreadPoolExecutor getCmdDistribExecutor() {
return cmdDistribExecutor;
}
public Properties getContainerProperties() {
return containerProperties;
}
@ -476,7 +461,7 @@ public class CoreContainer
hostContext = cfg.get("solr/cores/@hostContext", DEFAULT_HOST_CONTEXT);
host = cfg.get("solr/cores/@host", null);
leaderVoteWait = cfg.get("solr/cores/@leaderVoteWait", null);
leaderVoteWait = cfg.get("solr/cores/@leaderVoteWait", "180000"); // 3 minutes
if(shareSchema){
indexSchemaCache = new ConcurrentHashMap<String ,IndexSchema>();
@ -601,48 +586,42 @@ public class CoreContainer
* Stops all cores.
*/
public void shutdown() {
log.info("Shutting down CoreContainer instance="+System.identityHashCode(this));
log.info("Shutting down CoreContainer instance="
+ System.identityHashCode(this));
isShutDown = true;
if (cmdDistribExecutor != null) {
try {
ExecutorUtil.shutdownAndAwaitTermination(cmdDistribExecutor);
} catch (Throwable e) {
SolrException.log(log, e);
}
}
if (isZooKeeperAware()) {
cancelCoreRecoveries();
}
synchronized(cores) {
try {
try {
synchronized (cores) {
for (SolrCore core : cores.values()) {
try {
core.close();
core.close();
} catch (Throwable t) {
SolrException.log(log, "Error shutting down core", t);
}
}
cores.clear();
} finally {
if (shardHandlerFactory != null) {
shardHandlerFactory.close();
}
// we want to close zk stuff last
if(zkController != null) {
zkController.close();
}
if (zkServer != null) {
zkServer.stop();
}
}
} finally {
if (shardHandlerFactory != null) {
shardHandlerFactory.close();
}
// we want to close zk stuff last
if (zkController != null) {
zkController.close();
}
if (zkServer != null) {
zkServer.stop();
}
}
}
private void cancelCoreRecoveries() {
public void cancelCoreRecoveries() {
ArrayList<SolrCoreState> coreStates = new ArrayList<SolrCoreState>();
synchronized (cores) {
for (SolrCore core : cores.values()) {


@ -902,10 +902,7 @@ public class CoreAdminHandler extends RequestHandlerBase {
String waitForState = params.get("state");
Boolean checkLive = params.getBool("checkLive");
Boolean onlyIfLeader = params.getBool("onlyIfLeader");
int pauseFor = params.getInt("pauseFor", 0);
String state = null;
boolean live = false;
int retry = 0;
@ -965,13 +962,7 @@ public class CoreAdminHandler extends RequestHandlerBase {
}
Thread.sleep(1000);
}
// small safety net for any updates that started with state that
// kept it from sending the update to be buffered -
// pause for a while to let any outstanding updates finish
// System.out.println("I saw state:" + state + " sleep for " + pauseFor +
// " live:" + live);
Thread.sleep(pauseFor);
// solrcloud_debug
// try {;


@ -164,7 +164,7 @@ public class HttpShardHandlerFactory extends ShardHandlerFactory implements Plug
SolrException.log(log, e);
}
try {
ExecutorUtil.shutdownAndAwaitTermination(commExecutor);
ExecutorUtil.shutdownNowAndAwaitTermination(commExecutor);
} catch (Throwable e) {
SolrException.log(log, e);
}


@ -523,8 +523,9 @@ public class RealTimeGetComponent extends SearchComponent
public void processSync(ResponseBuilder rb, int nVersions, String sync) {
List<String> replicas = StrUtils.splitSmart(sync, ",", true);
boolean cantReachIsSuccess = rb.req.getParams().getBool("cantReachIsSuccess", false);
PeerSync peerSync = new PeerSync(rb.req.getCore(), replicas, nVersions);
PeerSync peerSync = new PeerSync(rb.req.getCore(), replicas, nVersions, cantReachIsSuccess);
boolean success = peerSync.sync();
// TODO: more complex response?


@ -46,7 +46,6 @@ import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.DistributedUpdateProcessor;
import org.apache.solr.update.processor.DistributedUpdateProcessorFactory;
import org.apache.solr.update.processor.RunUpdateProcessorFactory;
import org.apache.solr.update.processor.UpdateRequestProcessor;
@ -78,6 +77,7 @@ public class PeerSync {
private Set<Long> requestedUpdateSet;
private long ourLowThreshold; // 20th percentile
private long ourHighThreshold; // 80th percentile
private boolean cantReachIsSuccess;
private static final HttpClient client;
static {
ModifiableSolrParams params = new ModifiableSolrParams();
@ -127,18 +127,21 @@ public class PeerSync {
Exception updateException;
}
public PeerSync(SolrCore core, List<String> replicas, int nUpdates) {
this(core, replicas, nUpdates, false);
}
/**
*
* @param core
* @param replicas
* @param nUpdates
*/
public PeerSync(SolrCore core, List<String> replicas, int nUpdates) {
public PeerSync(SolrCore core, List<String> replicas, int nUpdates, boolean cantReachIsSuccess) {
this.replicas = replicas;
this.nUpdates = nUpdates;
this.maxUpdates = nUpdates;
this.cantReachIsSuccess = cantReachIsSuccess;
uhandler = core.getUpdateHandler();
@ -214,6 +217,7 @@ public class PeerSync {
if (startingVersions != null) {
if (startingVersions.size() == 0) {
// no frame of reference to tell if we've missed updates
log.warn("no frame of reference to tell if we've missed updates");
return false;
}
Collections.sort(startingVersions, absComparator);
@ -298,20 +302,25 @@ public class PeerSync {
// If the replica went down between asking for versions and asking for specific updates, that
// shouldn't be treated as success since we counted on getting those updates back (and avoided
// redundantly asking other replicas for them).
if (sreq.purpose == 1 && srsp.getException() instanceof SolrServerException) {
if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrServerException) {
Throwable solrException = ((SolrServerException) srsp.getException())
.getRootCause();
if (solrException instanceof ConnectException
|| solrException instanceof NoHttpResponseException) {
log.info(msg() + " couldn't connect to " + srsp.getShardAddress() + ", counting as success");
log.warn(msg() + " couldn't connect to " + srsp.getShardAddress() + ", counting as success");
return true;
}
}
if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrException && ((SolrException) srsp.getException()).code() == 503) {
log.warn(msg() + " got a 503 from " + srsp.getShardAddress() + ", counting as success");
return true;
}
// TODO: at least log???
// srsp.getException().printStackTrace(System.out);
log.warn(msg() + " exception talking to " + srsp.getShardAddress() + ", counting as success");
log.warn(msg() + " exception talking to " + srsp.getShardAddress() + ", failed", srsp.getException());
return false;
}

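A small illustration of how the new cantReachIsSuccess flag (SOLR-3836) is meant to be passed: true only when the candidate leader syncs with its replicas, false when a replica syncs to the leader. The PeerSyncStub type below is a simplified stand-in, not the real PeerSync class.

import java.util.Arrays;
import java.util.List;

// Illustration only: which side of a sync passes cantReachIsSuccess=true.
public class PeerSyncDirection {
    // Simplified stand-in; the real PeerSync also takes the SolrCore and the
    // number of recent updates to compare.
    static final class PeerSyncStub {
        final List<String> peers;
        final boolean cantReachIsSuccess;
        PeerSyncStub(List<String> peers, boolean cantReachIsSuccess) {
            this.peers = peers;
            this.cantReachIsSuccess = cantReachIsSuccess;
        }
    }

    public static void main(String[] args) {
        List<String> peers = Arrays.asList("http://host1:8983/solr/collection1");
        // Candidate leader syncing with its replicas: an unreachable replica is down
        // and will recover later, so the attempt may still count as a success.
        PeerSyncStub leaderSide = new PeerSyncStub(peers, true);
        // Replica syncing to the leader: failing to reach the leader means we may
        // have missed updates, so the attempt must fail and fall back to recovery.
        PeerSyncStub replicaSide = new PeerSyncStub(peers, false);
        System.out.println("leader side: " + leaderSide.cantReachIsSuccess
            + ", replica side: " + replicaSide.cantReachIsSuccess);
    }
}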

@ -29,9 +29,7 @@ import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.http.client.HttpClient;
import org.apache.solr.client.solrj.SolrServerException;
@ -51,7 +49,7 @@ import org.slf4j.LoggerFactory;
public class SolrCmdDistributor {
private static final int MAX_RETRIES_ON_FORWARD = 6;
private static final int MAX_RETRIES_ON_FORWARD = 10;
public static Logger log = LoggerFactory.getLogger(SolrCmdDistributor.class);
static final HttpClient client;
@ -85,9 +83,12 @@ public class SolrCmdDistributor {
ModifiableSolrParams params;
}
public static interface AbortCheck {
public boolean abortCheck();
}
public SolrCmdDistributor(int numHosts, ThreadPoolExecutor executor) {
int maxPermits = Math.max(8, (numHosts - 1) * 8);
int maxPermits = Math.max(16, numHosts * 16);
// limits how many tasks can actually execute at once
if (maxPermits != semaphore.getMaxPermits()) {
semaphore.setMaxPermits(maxPermits);
@ -307,12 +308,13 @@ public class SolrCmdDistributor {
Callable<Request> task = new Callable<Request>() {
@Override
public Request call() throws Exception {
Request clonedRequest = new Request();
clonedRequest.node = sreq.node;
clonedRequest.ureq = sreq.ureq;
clonedRequest.retries = sreq.retries;
Request clonedRequest = null;
try {
clonedRequest = new Request();
clonedRequest.node = sreq.node;
clonedRequest.ureq = sreq.ureq;
clonedRequest.retries = sreq.retries;
String fullUrl;
if (!url.startsWith("http://") && !url.startsWith("https://")) {
fullUrl = "http://" + url;
@ -349,7 +351,7 @@ public class SolrCmdDistributor {
semaphore.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("Update thread interrupted");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Update thread interrupted", e);
}
pending.add(completionService.submit(task));


@ -20,6 +20,7 @@ package org.apache.solr.update;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
@ -31,6 +32,7 @@ import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.processor.DistributedUpdateProcessor;
import org.apache.solr.update.processor.DistributedUpdateProcessorFactory;
import org.apache.solr.update.processor.RunUpdateProcessorFactory;
import org.apache.solr.update.processor.UpdateRequestProcessor;
@ -82,6 +84,8 @@ public class UpdateLog implements PluginInfoInitialized {
public int deleteByQuery;
public int errors;
public boolean failed;
@Override
public String toString() {
return "RecoveryInfo{adds="+adds+" deletes="+deletes+ " deleteByQuery="+deleteByQuery+" errors="+errors + " positionOfStart="+positionOfStart+"}";
@ -1117,6 +1121,7 @@ public class UpdateLog implements PluginInfoInitialized {
public void run() {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(DISTRIB_UPDATE_PARAM, FROMLEADER.toString());
params.set(DistributedUpdateProcessor.LOG_REPLAY, "true");
req = new LocalSolrQueryRequest(uhandler.core, params);
rsp = new SolrQueryResponse();
SolrRequestInfo.setRequestInfo(new SolrRequestInfo(req, rsp)); // setting request info will help logging
@ -1125,9 +1130,17 @@ public class UpdateLog implements PluginInfoInitialized {
for (TransactionLog translog : translogs) {
doReplay(translog);
}
} catch (SolrException e) {
if (e.code() == ErrorCode.SERVICE_UNAVAILABLE.code) {
SolrException.log(log, e);
recoveryInfo.failed = true;
} else {
recoveryInfo.errors++;
SolrException.log(log, e);
}
} catch (Throwable e) {
recoveryInfo.errors++;
SolrException.log(log,e);
SolrException.log(log, e);
} finally {
// change the state while updates are still blocked to prevent races
state = State.ACTIVE;
@ -1275,6 +1288,13 @@ public class UpdateLog implements PluginInfoInitialized {
recoveryInfo.errors++;
loglog.warn("REPLAY_ERR: Unexpected log entry or corrupt log. Entry=" + o, cl);
// would be caused by a corrupt transaction log
} catch (SolrException ex) {
if (ex.code() == ErrorCode.SERVICE_UNAVAILABLE.code) {
throw ex;
}
recoveryInfo.errors++;
loglog.warn("REYPLAY_ERR: IOException reading log", ex);
// could be caused by an incomplete flush if recovering from log
} catch (Throwable ex) {
recoveryInfo.errors++;
loglog.warn("REPLAY_ERR: Exception replaying log", ex);


@ -100,7 +100,8 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
}
public static final String COMMIT_END_POINT = "commit_end_point";
public static final String LOG_REPLAY = "log_replay";
private final SolrQueryRequest req;
private final SolrQueryResponse rsp;
private final UpdateRequestProcessor next;
@ -120,7 +121,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
private final SchemaField idField;
private final SolrCmdDistributor cmdDistrib;
private SolrCmdDistributor cmdDistrib;
private boolean zkEnabled = false;
@ -162,6 +163,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
zkController = req.getCore().getCoreDescriptor().getCoreContainer().getZkController();
if (zkEnabled) {
numNodes = zkController.getZkStateReader().getClusterState().getLiveNodes().size();
cmdDistrib = new SolrCmdDistributor(numNodes, coreDesc.getCoreContainer().getZkController().getCmdDistribExecutor());
}
//this.rsp = reqInfo != null ? reqInfo.getRsp() : null;
@ -170,8 +172,6 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
if (cloudDesc != null) {
collection = cloudDesc.getCollectionName();
}
cmdDistrib = new SolrCmdDistributor(numNodes, coreDesc.getCoreContainer().getCmdDistribExecutor());
}
private List<Node> setupRequest(int hash) {
@ -249,33 +249,17 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
private void doDefensiveChecks(String shardId, DistribPhase phase) {
String from = req.getParams().get("distrib.from");
boolean logReplay = req.getParams().getBool(LOG_REPLAY, false);
boolean localIsLeader = req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader();
if (DistribPhase.FROMLEADER == phase && localIsLeader && from != null) { // from will be null on log replay
if (!logReplay && DistribPhase.FROMLEADER == phase && localIsLeader && from != null) { // from will be null on log replay
log.error("Request says it is coming from leader, but we are the leader: " + req.getParamString());
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Request says it is coming from leader, but we are the leader");
}
// this is too restrictive - cluster state can be stale - can cause shard inconsistency
// if (DistribPhase.FROMLEADER == phase && from != null) { // from will be null on log replay
//
// ZkCoreNodeProps clusterStateLeader = new ZkCoreNodeProps(zkController
// .getClusterState().getLeader(collection, shardId));
//
// if (clusterStateLeader.getNodeProps() == null
// || !clusterStateLeader.getCoreUrl().equals(from)) {
// String coreUrl = null;
// if (clusterStateLeader.getNodeProps() != null) {
// coreUrl = clusterStateLeader.getCoreUrl();
// }
// log.error("We got a request from the leader, but it's not who our cluster state says is the leader :"
// + req.getParamString()
// + " : "
// + coreUrl);
//
// new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "We got a request from the leader, but it's not who our cluster state says is the leader.");
// }
//
// }
if (isLeader && !localIsLeader) {
log.error("ClusterState says we are the leader, but locally we don't think so");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "ClusterState says we are the leader, but locally we don't think so");
}
}
@ -350,10 +334,6 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
ModifiableSolrParams params = null;
if (nodes != null) {
if (isLeader && !req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader()) {
log.error("Abort sending request to replicas, we are no longer leader");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Abort sending request to replicas, we are no longer leader");
}
params = new ModifiableSolrParams(req.getParams());
params.set(DISTRIB_UPDATE_PARAM,
@ -693,10 +673,6 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
ModifiableSolrParams params = null;
if (nodes != null) {
if (isLeader && !req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader()) {
log.error("Abort sending request to replicas, we are no longer leader");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Abort sending request to replicas, we are no longer leader");
}
params = new ModifiableSolrParams(req.getParams());
params.set(DISTRIB_UPDATE_PARAM,
@ -764,7 +740,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
try {
leaderProps = zkController.getZkStateReader().getLeaderProps(collection, sliceName);
} catch (InterruptedException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Exception finding leader for shard " + sliceName, e);
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
}
// TODO: What if leaders changed in the meantime?
@ -865,10 +841,6 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
// forward to all replicas
if (leaderLogic && replicas != null) {
if (!req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader()) {
log.error("Abort sending request to replicas, we are no longer leader");
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Abort sending request to replicas, we are no longer leader");
}
ModifiableSolrParams params = new ModifiableSolrParams(req.getParams());
params.set(VERSION_FIELD, Long.toString(cmd.getVersion()));
params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
@ -890,11 +862,14 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
private void zkCheck() {
int retries = 10;
while (!zkController.isConnected()) {
if (retries-- == 0) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Cannot talk to ZooKeeper - Updates are disabled.");
if (zkController.isConnected()) {
return;
}
long timeoutAt = System.currentTimeMillis() + zkController.getClientTimeout();
while (System.currentTimeMillis() < timeoutAt) {
if (zkController.isConnected()) {
return;
}
try {
Thread.sleep(100);
@ -903,7 +878,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
break;
}
}
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Cannot talk to ZooKeeper - Updates are disabled.");
}
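The old retry-count loop and the new timeout-based loop of zkCheck() are interleaved in the hunk above. The behavior the new code converges on is: return immediately if ZooKeeper is connected, otherwise poll every 100 ms until the ZooKeeper client timeout has elapsed, then fail the update with SERVICE_UNAVAILABLE. A minimal sketch of that flow, where isConnected and clientTimeoutMs are hypothetical stand-ins for ZkController.isConnected() and getClientTimeout(), and a plain RuntimeException stands in for the SolrException:

import java.util.function.BooleanSupplier;

// Sketch of the timeout-based wait that replaces the fixed "10 retries" loop above.
public final class ZkCheckSketch {

  static void zkCheck(BooleanSupplier isConnected, long clientTimeoutMs) {
    if (isConnected.getAsBoolean()) {
      return;
    }
    long timeoutAt = System.currentTimeMillis() + clientTimeoutMs;
    while (System.currentTimeMillis() < timeoutAt) {
      if (isConnected.getAsBoolean()) {
        return;
      }
      try {
        Thread.sleep(100);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break; // give up waiting if interrupted
      }
    }
    throw new RuntimeException("Cannot talk to ZooKeeper - Updates are disabled.");
  }

  public static void main(String[] args) {
    long reconnectAt = System.currentTimeMillis() + 300;
    // Simulates a client that reconnects 300 ms from now: zkCheck returns normally.
    zkCheck(() -> System.currentTimeMillis() >= reconnectAt, 5000);
    System.out.println("reconnected in time");
  }
}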
private boolean versionDelete(DeleteUpdateCommand cmd) throws IOException {
@ -1044,7 +1019,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
@Override
public void finish() throws IOException {
doFinish();
if (zkEnabled) doFinish();
if (next != null && nodes == null) next.finish();
}

View File

@ -28,7 +28,8 @@
adminPath: RequestHandler path to manage cores.
If 'null' (or absent), cores will not be manageable via request handler
-->
<cores adminPath="/admin/cores" defaultCoreName="collection1" host="127.0.0.1" hostPort="${hostPort:8983}" hostContext="solr" zkClientTimeout="8000" numShards="${numShards:3}" shareSchema="${shareSchema:false}">
<cores adminPath="/admin/cores" defaultCoreName="collection1" host="127.0.0.1" hostPort="${hostPort:8983}"
hostContext="solr" zkClientTimeout="5000" numShards="${numShards:3}" shareSchema="${shareSchema:false}">
<core name="collection1" instanceDir="collection1" shard="${shard:}" collection="${collection:collection1}" config="${solrconfig:solrconfig.xml}" schema="${schema:schema.xml}"/>
</cores>
</solr>
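The cores element above leans on ${name:default} substitution (${hostPort:8983}, ${numShards:3}, ${shard:}): the named system property wins if it is set, otherwise the text after the colon is used, which may be empty. A small illustrative lookup in plain Java (not Solr's actual substitution code):

// Illustrative only: how a ${name:default} placeholder such as ${hostPort:8983} resolves.
public final class PropertyPlaceholderSketch {

  static String resolve(String placeholder) {
    // Expects the form ${name:default}; nesting is not handled in this sketch.
    String body = placeholder.substring(2, placeholder.length() - 1);
    int colon = body.indexOf(':');
    String name = colon >= 0 ? body.substring(0, colon) : body;
    String fallback = colon >= 0 ? body.substring(colon + 1) : null;
    String value = System.getProperty(name);
    return value != null ? value : fallback;
  }

  public static void main(String[] args) {
    System.setProperty("hostPort", "7574");
    System.out.println(resolve("${hostPort:8983}")); // 7574 - the property wins
    System.out.println(resolve("${numShards:3}"));   // 3 - falls back to the default
    System.out.println(resolve("${shard:}"));        // empty string
  }
}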

View File

@ -223,7 +223,7 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
query("q", "*:*", "sort", "n_tl1 desc");
// try adding a doc with CloudSolrServer
cloudClient.setDefaultCollection(DEFAULT_COLLECTION);
long numFound2 = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
@ -235,6 +235,7 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
controlClient.add(doc);
// try adding a doc with CloudSolrServer
UpdateRequest ureq = new UpdateRequest();
ureq.add(doc);
// ureq.setParam("update.chain", DISTRIB_UPDATE_CHAIN);

View File

@ -42,7 +42,7 @@ import org.slf4j.LoggerFactory;
public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase {
public static Logger log = LoggerFactory.getLogger(ChaosMonkeyNothingIsSafeTest.class);
private static final int BASE_RUN_LENGTH = 20000;
private static final int BASE_RUN_LENGTH = 60000;
@BeforeClass
public static void beforeSuperClass() {
@ -112,14 +112,18 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
searchThread.start();
}
// TODO: only do this randomly - if we don't do it, compare against control below
FullThrottleStopableIndexingThread ftIndexThread = new FullThrottleStopableIndexingThread(
clients, i * 50000, true);
threads.add(ftIndexThread);
ftIndexThread.start();
// TODO: only do this sometimes so that we can sometimes compare against control
boolean runFullThrottle = random().nextBoolean();
if (runFullThrottle) {
FullThrottleStopableIndexingThread ftIndexThread = new FullThrottleStopableIndexingThread(
clients, i * 50000, true);
threads.add(ftIndexThread);
ftIndexThread.start();
}
chaosMonkey.startTheMonkey(true, 1500);
int runLength = atLeast(BASE_RUN_LENGTH);
chaosMonkey.startTheMonkey(true, 10000);
//int runLength = atLeast(BASE_RUN_LENGTH);
int runLength = BASE_RUN_LENGTH;
try {
Thread.sleep(runLength);
} finally {
@ -138,7 +142,7 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
// we expect full throttle fails, but not cloud client...
for (StopableThread indexThread : threads) {
if (indexThread instanceof StopableIndexingThread && !(indexThread instanceof FullThrottleStopableIndexingThread)) {
assertEquals(0, ((StopableIndexingThread) indexThread).getFails());
//assertEquals(0, ((StopableIndexingThread) indexThread).getFails());
}
}
@ -162,9 +166,9 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
assertTrue(zkStateReader.getClusterState().getLiveNodes().size() > 0);
// we don't currently check vs control because the full throttle thread can
// have request fails
checkShardConsistency(false, true);
// full throttle thread can
// have request fails
checkShardConsistency(!runFullThrottle, true);
long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults()
.getNumFound();

View File

@ -211,11 +211,12 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 {
@Test
public void testLeaderElectionAfterClientTimeout() throws Exception {
// TODO: work out the best timing here...
System.setProperty("zkClientTimeout", "500");
System.setProperty("zkClientTimeout", Integer.toString(ZkTestServer.TICK_TIME * 2 + 100));
// timeout the leader
String leader = getLeader();
int leaderPort = getLeaderPort(leader);
containerMap.get(leaderPort).getZkController().getZkClient().getSolrZooKeeper().pauseCnxn(2000);
ZkController zkController = containerMap.get(leaderPort).getZkController();
zkController.getZkClient().getSolrZooKeeper().pauseCnxn(zkController.getClientTimeout() + 100);
for (int i = 0; i < 60; i++) { // wait till leader is changed
if (leaderPort != getLeaderPort(getLeader())) {
@ -224,6 +225,9 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 {
Thread.sleep(100);
}
// make sure we have waited long enough for the first leader to have come back
Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100);
if (VERBOSE) System.out.println("kill everyone");
// kill everyone but the first leader that should have reconnected by now
for (Map.Entry<Integer,CoreContainer> entry : containerMap.entrySet()) {
@ -232,11 +236,15 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 {
}
}
for (int i = 0; i < 60; i++) { // wait till leader is changed
if (leaderPort == getLeaderPort(getLeader())) {
break;
for (int i = 0; i < 320; i++) { // wait till leader is changed
try {
if (leaderPort == getLeaderPort(getLeader())) {
break;
}
Thread.sleep(100);
} catch (Exception e) {
continue;
}
Thread.sleep(100);
}
// the original leader should be leader again now - everyone else is down
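Both wait loops in this test poll getLeaderPort(getLeader()) every 100 ms, and the widened 320-iteration loop additionally swallows transient exceptions while nodes are still down. The underlying pattern, as a generic sketch with illustrative names:

import java.util.concurrent.Callable;

// Generic restatement of the polling loops above: evaluate a condition every pollMs
// milliseconds, ignore transient exceptions (e.g. no leader registered yet), and give
// up after maxAttempts.
public final class WaitForSketch {

  static boolean waitFor(Callable<Boolean> condition, int maxAttempts, long pollMs)
      throws InterruptedException {
    for (int i = 0; i < maxAttempts; i++) {
      try {
        if (condition.call()) {
          return true;
        }
      } catch (Exception e) {
        // transient failure (node down, leader not yet elected) - keep polling
      }
      Thread.sleep(pollMs);
    }
    return false;
  }

  public static void main(String[] args) throws InterruptedException {
    long readyAt = System.currentTimeMillis() + 500;
    boolean ok = waitFor(() -> System.currentTimeMillis() >= readyAt, 320, 100);
    System.out.println("condition met: " + ok);
  }
}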

View File

@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.update.SolrCmdDistributor.Node;
import org.apache.solr.update.SolrCmdDistributor.Response;
@ -43,16 +44,13 @@ import org.apache.solr.update.SolrCmdDistributor.StdNode;
import org.apache.solr.util.DefaultSolrThreadFactory;
public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
private ThreadPoolExecutor executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5,
TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
new DefaultSolrThreadFactory("cmdDistribExecutor"));
private static ThreadPoolExecutor executor;
public SolrCmdDistributorTest() {
fixShardCount = true;
shardCount = 4;
stress = 0;
}
public static String getSchemaFile() {
return "schema.xml";
@ -91,7 +89,7 @@ public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
public void doTest() throws Exception {
del("*:*");
SolrCmdDistributor cmdDistrib = new SolrCmdDistributor(8, executor);
SolrCmdDistributor cmdDistrib = new SolrCmdDistributor(5, executor);
ModifiableSolrParams params = new ModifiableSolrParams();
List<Node> nodes = new ArrayList<Node>();
@ -125,7 +123,7 @@ public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
// add another 2 docs to control and 3 to client
cmdDistrib = new SolrCmdDistributor(8, executor);
cmdDistrib = new SolrCmdDistributor(5, executor);
cmd.solrDoc = sdoc("id", 2);
cmdDistrib.distribAdd(cmd, nodes, params);
@ -158,7 +156,7 @@ public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
DeleteUpdateCommand dcmd = new DeleteUpdateCommand(null);
dcmd.id = "2";
cmdDistrib = new SolrCmdDistributor(8, executor);
cmdDistrib = new SolrCmdDistributor(5, executor);
cmdDistrib.distribDelete(dcmd, nodes, params);
cmdDistrib.distribCommit(ccmd, nodes, params);
@ -177,19 +175,16 @@ public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
.getNumFound();
assertEquals(results.toString(), 2, numFound);
// debug stuff
for (SolrServer c : clients) {
c.optimize();
// distrib optimize is not working right yet, so call it on each client
//System.out.println(clients.get(0).request(new LukeRequest()));
}
int id = 5;
cmdDistrib = new SolrCmdDistributor(8, executor);
cmdDistrib = new SolrCmdDistributor(5, executor);
nodes.clear();
int cnt = atLeast(200);
int cnt = atLeast(201);
for (int i = 0; i < cnt; i++) {
nodes.clear();
for (SolrServer c : clients) {
@ -199,25 +194,52 @@ public class SolrCmdDistributorTest extends BaseDistributedSearchTestCase {
HttpSolrServer httpClient = (HttpSolrServer) c;
nodeProps = new ZkNodeProps(ZkStateReader.BASE_URL_PROP,
httpClient.getBaseURL(), ZkStateReader.CORE_NAME_PROP, "");
nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
}
AddUpdateCommand c = new AddUpdateCommand(null);
c.solrDoc = sdoc("id", id++);
if (nodes.size() > 0) {
cmdDistrib.distribAdd(c, nodes, params);
}
}
nodes.clear();
for (SolrServer c : clients) {
HttpSolrServer httpClient = (HttpSolrServer) c;
nodeProps = new ZkNodeProps(ZkStateReader.BASE_URL_PROP,
httpClient.getBaseURL(), ZkStateReader.CORE_NAME_PROP, "");
cmd.solrDoc = sdoc("id", id++);
cmdDistrib.distribAdd(cmd, nodes, params);
nodes.add(new StdNode(new ZkCoreNodeProps(nodeProps)));
}
cmdDistrib.finish();
cmdDistrib.distribCommit(ccmd, nodes, params);
cmdDistrib.finish();
for (SolrServer c : clients) {
NamedList<Object> resp = c.request(new LukeRequest());
System.out.println(resp);
assertEquals("SOLR-3428: We only did adds - there should be no deletes",
((NamedList<Object>) resp.get("index")).get("numDocs"),
((NamedList<Object>) resp.get("index")).get("maxDoc"));
}
}
@Override
public void setUp() throws Exception {
super.setUp();
executor = new ThreadPoolExecutor(0, 5 * 16, 5,
TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
new DefaultSolrThreadFactory("cmdDistribExecutor"));
}
@Override
public void tearDown() throws Exception {
ExecutorUtil.shutdownNowAndAwaitTermination(executor);
super.tearDown();
}
}
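The test now builds its command-distributor pool in setUp() and shuts it down hard in tearDown() via ExecutorUtil.shutdownNowAndAwaitTermination, so worker threads cannot leak across runs. A minimal JUnit-style sketch of that per-test lifecycle; the 5 * 16 bound and the SynchronousQueue mirror the hunk, while the thread factory and the Solr base class are omitted for brevity:

import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

// Sketch: create the pool per test, tear it down hard afterwards.
public class PerTestExecutorSketch {

  private ThreadPoolExecutor executor;

  @Before
  public void setUp() {
    executor = new ThreadPoolExecutor(0, 5 * 16, 5, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>());
  }

  @After
  public void tearDown() throws InterruptedException {
    executor.shutdownNow();                       // cancel anything still running
    executor.awaitTermination(10, TimeUnit.SECONDS);
  }

  @Test
  public void poolIsUsableWithinTheTest() throws Exception {
    executor.submit(() -> System.out.println("work inside the test")).get();
  }
}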

View File

@ -120,7 +120,6 @@ public class CoreAdminRequest extends SolrRequest
protected String coreNodeName;
protected String state;
protected Boolean checkLive;
protected Integer pauseFor;
protected Boolean onlyIfLeader;
@ -160,14 +159,6 @@ public class CoreAdminRequest extends SolrRequest
this.checkLive = checkLive;
}
public Integer getPauseFor() {
return pauseFor;
}
public void setPauseFor(Integer pauseFor) {
this.pauseFor = pauseFor;
}
public boolean isOnlyIfLeader() {
return onlyIfLeader;
}
@ -202,10 +193,6 @@ public class CoreAdminRequest extends SolrRequest
params.set( "checkLive", checkLive);
}
if (pauseFor != null) {
params.set( "pauseFor", pauseFor);
}
if (onlyIfLeader != null) {
params.set( "onlyIfLeader", onlyIfLeader);
}

View File

@ -410,7 +410,7 @@ public class ZkStateReader {
}
Thread.sleep(50);
}
throw new RuntimeException("No registered leader was found, collection:" + collection + " slice:" + shard);
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "No registered leader was found, collection:" + collection + " slice:" + shard);
}
/**

View File

@ -20,7 +20,6 @@ package org.apache.solr.common.util;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import org.apache.solr.common.SolrException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -28,24 +27,38 @@ import org.slf4j.LoggerFactory;
public class ExecutorUtil {
public static Logger log = LoggerFactory.getLogger(ExecutorUtil.class);
public static void shutdownAndAwaitTermination(ExecutorService pool) {
public static void shutdownNowAndAwaitTermination(ExecutorService pool) {
pool.shutdown(); // Disable new tasks from being submitted
pool.shutdownNow(); // Cancel currently executing tasks
try {
// Wait a while for existing tasks to terminate
if (!pool.awaitTermination(60, TimeUnit.SECONDS))
SolrException.log(log, "Executor still has running tasks.");
} catch (InterruptedException ie) {
// (Re-)Cancel if current thread also interrupted
pool.shutdownNow();
boolean shutdown = false;
while (!shutdown) {
try {
if (!pool.awaitTermination(60, TimeUnit.SECONDS))
SolrException.log(log, "Executor still has running tasks.");
} catch (InterruptedException e) {
// Wait a while for existing tasks to terminate
shutdown = pool.awaitTermination(5, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
// Preserve interrupt status
Thread.currentThread().interrupt();
}
if (!shutdown) {
pool.shutdownNow(); // Cancel currently executing tasks
}
}
}
public static void shutdownAndAwaitTermination(ExecutorService pool) {
pool.shutdown(); // Disable new tasks from being submitted
boolean shutdown = false;
while (!shutdown) {
try {
// Wait a while for existing tasks to terminate
shutdown = pool.awaitTermination(60, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
// Preserve interrupt status
Thread.currentThread().interrupt();
}
if (!shutdown) {
pool.shutdownNow(); // Cancel currently executing tasks
}
// Preserve interrupt status
Thread.currentThread().interrupt();
}
}
}
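The removed and added bodies are interleaved in the hunk above, which makes it hard to read. Both resulting methods follow the same pattern: request shutdown (shutdownNowAndAwaitTermination also cancels running tasks up front), then repeatedly await termination and escalate with shutdownNow() until the pool is actually gone, re-asserting the interrupt flag if the wait itself is interrupted. A self-contained sketch of that pattern, with the wait interval as a parameter instead of the 60 s / 5 s values above:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Sketch of the two-phase shutdown the hunk above converges on.
public final class ShutdownSketch {

  static void shutdownAndAwaitTermination(ExecutorService pool, long waitSeconds,
                                          boolean interruptRunningTasksFirst) {
    pool.shutdown(); // stop accepting new tasks
    if (interruptRunningTasksFirst) {
      pool.shutdownNow(); // also cancel currently executing tasks up front
    }
    boolean terminated = false;
    while (!terminated) {
      try {
        terminated = pool.awaitTermination(waitSeconds, TimeUnit.SECONDS);
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt(); // preserve interrupt status
      }
      if (!terminated) {
        pool.shutdownNow(); // escalate and loop again
      }
    }
  }

  public static void main(String[] args) {
    ExecutorService pool = Executors.newFixedThreadPool(2);
    pool.submit(() -> System.out.println("task ran"));
    shutdownAndAwaitTermination(pool, 5, false);
    System.out.println("pool terminated: " + pool.isTerminated());
  }
}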

View File

@ -242,7 +242,7 @@ public abstract class AbstractFullDistribZkTestBase extends AbstractDistribZkTes
server.getLbServer().getHttpClient().getParams()
.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 5000);
server.getLbServer().getHttpClient().getParams()
.setParameter(CoreConnectionPNames.SO_TIMEOUT, 40000);
.setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000);
cloudClient = server;
} catch (MalformedURLException e) {
throw new RuntimeException(e);
@ -891,7 +891,7 @@ public abstract class AbstractFullDistribZkTestBase extends AbstractDistribZkTes
cnt += results;
break;
}
} catch (SolrServerException e) {
} catch (Exception e) {
// if we have a problem, try the next one
if (i == times - 1) {
throw e;

View File

@ -32,6 +32,7 @@ import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.servlet.SolrDispatchFilter;
import org.apache.zookeeper.KeeperException;
import org.eclipse.jetty.servlet.FilterHolder;
@ -43,13 +44,16 @@ import org.slf4j.LoggerFactory;
*
* It can also run in a background thread and start and stop jetties
* randomly.
*
* TODO: expire multiple sessions / connectionloss at once
* TODO: kill multiple jetties at once
* TODO: ? add random headhunter mode that always kills the leader
* TODO: chaosmonkey should be able to do cluster stop/start tests
*/
public class ChaosMonkey {
private static Logger log = LoggerFactory.getLogger(ChaosMonkey.class);
private static final int CONLOSS_PERCENT = 3; //30%
private static final int EXPIRE_PERCENT = 4; //40%
private static final int CONLOSS_PERCENT = 10; // 0 - 10 = 0 - 100%
private static final int EXPIRE_PERCENT = 10; // 0 - 10 = 0 - 100%
private Map<String,List<CloudJettyRunner>> shardToJetty;
private ZkTestServer zkServer;
@ -79,22 +83,50 @@ public class ChaosMonkey {
this.zkStateReader = zkStateReader;
this.collection = collection;
Random random = LuceneTestCase.random();
expireSessions = random.nextBoolean();
causeConnectionLoss = random.nextBoolean();
expireSessions = true; //= random.nextBoolean();
causeConnectionLoss = true;//= random.nextBoolean();
monkeyLog("init - expire sessions:" + expireSessions
+ " cause connection loss:" + causeConnectionLoss);
}
public void expireSession(JettySolrRunner jetty) {
// TODO: expire all clients at once?
public void expireSession(final JettySolrRunner jetty) {
monkeyLog("expire session for " + jetty.getLocalPort() + " !");
SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty.getDispatchFilter().getFilter();
SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty
.getDispatchFilter().getFilter();
if (solrDispatchFilter != null) {
CoreContainer cores = solrDispatchFilter.getCores();
if (cores != null) {
long sessionId = cores.getZkController().getZkClient().getSolrZooKeeper().getSessionId();
zkServer.expire(sessionId);
causeConnectionLoss(jetty, cores.getZkController().getClientTimeout() + 200);
}
}
// Thread thread = new Thread() {
// {
// setDaemon(true);
// }
// public void run() {
// SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty.getDispatchFilter().getFilter();
// if (solrDispatchFilter != null) {
// CoreContainer cores = solrDispatchFilter.getCores();
// if (cores != null) {
// try {
// Thread.sleep(ZkTestServer.TICK_TIME * 2 + 800);
// } catch (InterruptedException e) {
// // we act as only connection loss
// return;
// }
// long sessionId = cores.getZkController().getZkClient().getSolrZooKeeper().getSessionId();
// zkServer.expire(sessionId);
// }
// }
// }
// };
// thread.start();
}
public void expireRandomSession() throws KeeperException, InterruptedException {
@ -119,6 +151,10 @@ public class ChaosMonkey {
}
private void causeConnectionLoss(JettySolrRunner jetty) {
causeConnectionLoss(jetty, ZkTestServer.TICK_TIME * 2 + 200);
}
private void causeConnectionLoss(JettySolrRunner jetty, int pauseTime) {
SolrDispatchFilter solrDispatchFilter = (SolrDispatchFilter) jetty
.getDispatchFilter().getFilter();
if (solrDispatchFilter != null) {
@ -126,7 +162,7 @@ public class ChaosMonkey {
if (cores != null) {
SolrZkClient zkClient = cores.getZkController().getZkClient();
// must be at least double tick time...
zkClient.getSolrZooKeeper().pauseCnxn(ZkTestServer.TICK_TIME * 2 + 200);
zkClient.getSolrZooKeeper().pauseCnxn(pauseTime);
}
}
}
@ -291,6 +327,7 @@ public class ChaosMonkey {
}
}
// TODO: stale state makes this a tough call
if (numActive < 2) {
// we cannot kill anyone
monkeyLog("only one active node in shard - monkey cannot kill :(");
@ -308,8 +345,44 @@ public class ChaosMonkey {
int index = random.nextInt(jetties.size());
cjetty = jetties.get(index);
ZkNodeProps leader = zkStateReader.getLeaderProps(collection, slice);
boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName);
ZkNodeProps leader = null;
try {
leader = zkStateReader.getLeaderProps(collection, slice);
} catch (Throwable t) {
log.error("Could not get leader", t);
return null;
}
FilterHolder fh = cjetty.jetty.getDispatchFilter();
if (fh == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
}
SolrDispatchFilter df = ((SolrDispatchFilter) fh.getFilter());
if (df == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
}
CoreContainer cores = df.getCores();
if (cores == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
}
SolrCore core = cores.getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP));
if (core == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
}
// cluster state can be stale - also go by our 'near real-time' is leader prop
boolean rtIsLeader;
try {
rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
} finally {
core.close();
}
boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName)
|| rtIsLeader;
if (!aggressivelyKillLeaders && isLeader) {
// we don't kill leaders...
monkeyLog("abort! I don't kill leaders");
@ -343,7 +416,7 @@ public class ChaosMonkey {
// synchronously starts and stops shards randomly, unless there is only one
// active shard up for a slice or if there is one active and others recovering
public void startTheMonkey(boolean killLeaders, final int roundPause) {
public void startTheMonkey(boolean killLeaders, final int roundPauseUpperLimit) {
monkeyLog("starting");
this.aggressivelyKillLeaders = killLeaders;
startTime = System.currentTimeMillis();
@ -357,8 +430,9 @@ public class ChaosMonkey {
public void run() {
while (!stop) {
try {
Thread.sleep(roundPause);
Random random = LuceneTestCase.random();
Thread.sleep(random.nextInt(roundPauseUpperLimit));
if (random.nextBoolean()) {
if (!deadPool.isEmpty()) {
int index = random.nextInt(deadPool.size());

View File

@ -43,7 +43,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ZkTestServer {
public static final int TICK_TIME = 3000;
public static final int TICK_TIME = 1000;
private static Logger log = LoggerFactory.getLogger(ZkTestServer.class);