SOLR-5860: Use leaderConflictResolveWait in WaitForState during recovery/startup, improve logging and force refresh cluster state every 15 seconds

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579954 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2014-03-21 15:14:20 +00:00
parent b2335520fe
commit d50899c506
3 changed files with 45 additions and 6 deletions

View File

@ -139,6 +139,10 @@ New Features
* SOLR-5865: Provide a MiniSolrCloudCluster to enable easier testing.
(Greg Chanan via Mark Miller)
* SOLR-5860: Use leaderConflictResolveWait in WaitForState during recovery/startup,
improve logging and force refresh cluster state every 15 seconds.
(Timothy Potter via shalin)
Bug Fixes
----------------------

View File

@ -299,6 +299,10 @@ public final class ZkController {
/**
 * Returns the value of this controller's {@code leaderVoteWait} field.
 *
 * NOTE(review): the unit is not visible from here; presumably a duration in
 * milliseconds -- confirm against where the field is assigned.
 *
 * @return the configured leader vote wait
 */
public int getLeaderVoteWait() {
return leaderVoteWait;
}
/**
 * Returns the value of this controller's {@code leaderConflictResolveWait} field.
 *
 * The wait-for-state handling in CoreAdminHandler treats this value as
 * milliseconds: it divides it by 1000 to derive the maximum number of
 * one-second retries before giving up.
 *
 * @return the configured leader conflict resolve wait
 */
public int getLeaderConflictResolveWait() {
return leaderConflictResolveWait;
}
public void forceOverSeer(){
try {

View File

@ -34,6 +34,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
@ -968,6 +969,7 @@ public class CoreAdminHandler extends RequestHandlerBase {
log.info("Going to wait for coreNodeName: " + coreNodeName + ", state: " + waitForState
+ ", checkLive: " + checkLive + ", onlyIfLeader: " + onlyIfLeader);
int maxTries = 0;
String state = null;
boolean live = false;
int retry = 0;
@ -991,10 +993,25 @@ public class CoreAdminHandler extends RequestHandlerBase {
CloudDescriptor cloudDescriptor = core.getCoreDescriptor()
.getCloudDescriptor();
if (retry == 15 || retry == 60) {
if (retry % 15 == 0) {
if (retry > 0 && log.isInfoEnabled())
log.info("After " + retry + " seconds, core " + cname + " (" +
cloudDescriptor.getShardId() + " of " +
cloudDescriptor.getCollectionName() + ") still does not have state: " +
waitForState + "; forcing ClusterState update from ZooKeeper");
// force a cluster state update
coreContainer.getZkController().getZkStateReader().updateClusterState(true);
}
if (maxTries == 0) {
// wait long enough for the leader conflict to work itself out plus a little extra
int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait();
maxTries = (int) Math.round(conflictWaitMs / 1000) + 3;
log.info("Will wait a max of " + maxTries + " seconds to see " + cname + " (" +
cloudDescriptor.getShardId() + " of " +
cloudDescriptor.getCollectionName() + ") have state: " + waitForState);
}
ClusterState clusterState = coreContainer.getZkController()
.getClusterState();
@ -1023,13 +1040,28 @@ public class CoreAdminHandler extends RequestHandlerBase {
}
}
}
if (retry++ == 120) {
if (retry++ == maxTries) {
String collection = null;
String leaderInfo = null;
String shardId = null;
try {
CloudDescriptor cloudDescriptor =
core.getCoreDescriptor().getCloudDescriptor();
collection = cloudDescriptor.getCollectionName();
shardId = cloudDescriptor.getShardId();
leaderInfo = coreContainer.getZkController().
getZkStateReader().getLeaderUrl(collection, shardId, 0);
} catch (Exception exc) {
leaderInfo = "Not available due to: " + exc;
}
throw new SolrException(ErrorCode.BAD_REQUEST,
"I was asked to wait on state " + waitForState + " for "
+ nodeName
+ shardId + " in " + collection + " on " + nodeName
+ " but I still do not see the requested state. I see state: "
+ state + " live:" + live);
+ state + " live:" + live + " leader from ZK: " + leaderInfo
);
}
if (coreContainer.isShutDown()) {
@ -1040,7 +1072,6 @@ public class CoreAdminHandler extends RequestHandlerBase {
// solrcloud_debug
if (log.isDebugEnabled()) {
try {
;
LocalSolrQueryRequest r = new LocalSolrQueryRequest(core,
new ModifiableSolrParams());
CommitUpdateCommand commitCmd = new CommitUpdateCommand(r, false);