SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures

This commit is contained in:
Ishan Chattopadhyaya 2019-08-29 17:20:46 +05:30
parent d606ffdea9
commit 12715da544
2 changed files with 30 additions and 18 deletions

View File

@ -1033,17 +1033,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
// If request is async wait for the core admin to complete before returning // If request is async wait for the core admin to complete before returning
if (asyncId != null) { if (asyncId != null) {
waitForAsyncCallsToComplete(results, true, msgOnError); waitForAsyncCallsToComplete(results); // TODO: Shouldn't we abort with msgOnError exception when failure?
shardAsyncIdByNode.clear(); shardAsyncIdByNode.clear();
} }
} }
private void waitForAsyncCallsToComplete(NamedList<Object> results) { private void waitForAsyncCallsToComplete(NamedList<Object> results) {
waitForAsyncCallsToComplete(results, false, null);
}
private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
boolean failed = false;
for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) { for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
final String node = nodeToAsync.getKey(); final String node = nodeToAsync.getKey();
final String shardAsyncId = nodeToAsync.getValue(); final String shardAsyncId = nodeToAsync.getValue();
@ -1055,14 +1050,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) { if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
log.error("Error from shard {}: {}", node, reqResult); log.error("Error from shard {}: {}", node, reqResult);
addFailure(results, node, reqResult); addFailure(results, node, reqResult);
failed = true;
} else { } else {
addSuccess(results, node, reqResult); addSuccess(results, node, reqResult);
} }
} }
if (failed && abortOnFailure && msgOnError != null) {
throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
}
} }
/** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/ /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/

View File

@ -43,6 +43,7 @@ import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker; import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
import org.apache.solr.cloud.overseer.OverseerAction; import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.CompositeIdRouter; import org.apache.solr.common.cloud.CompositeIdRouter;
import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.DocCollection;
@ -225,7 +226,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId); final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler); shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
SimpleOrderedMap<Object> getRangesResults = new SimpleOrderedMap<>(); SimpleOrderedMap<Object> getRangesResults = new SimpleOrderedMap<>();
shardRequestTracker.processResponses(getRangesResults, shardHandler, true, "SPLITSHARD failed to invoke SPLIT.getRanges core admin command"); String msgOnError = "SPLITSHARD failed to invoke SPLIT.getRanges core admin command";
shardRequestTracker.processResponses(getRangesResults, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
// Extract the recommended splits from the shard response (if it exists) // Extract the recommended splits from the shard response (if it exists)
// example response: getRangesResults={success={127.0.0.1:62086_solr={responseHeader={status=0,QTime=1},ranges=10-20,3a-3f}}} // example response: getRangesResults={success={127.0.0.1:62086_solr={responseHeader={status=0,QTime=1},ranges=10-20,3a-3f}}}
@ -339,7 +342,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
{ {
final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker(); final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard leaders"); String msgOnError = "SPLITSHARD failed to create subshard leaders";
syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
} }
t.stop(); t.stop();
t = timings.sub("waitForSubSliceLeadersAlive"); t = timings.sub("waitForSubSliceLeadersAlive");
@ -361,7 +366,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
shardRequestTracker.sendShardRequest(nodeName, p, shardHandler); shardRequestTracker.sendShardRequest(nodeName, p, shardHandler);
} }
shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD timed out waiting for subshard leaders to come up"); String msgOnError = "SPLITSHARD timed out waiting for subshard leaders to come up";
shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
} }
t.stop(); t.stop();
@ -386,7 +393,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId); final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler); shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to invoke SPLIT core admin command"); String msgOnError = "SPLITSHARD failed to invoke SPLIT core admin command";
shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
} }
t.stop(); t.stop();
@ -409,9 +418,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
shardRequestTracker.sendShardRequest(nodeName, params, shardHandler); shardRequestTracker.sendShardRequest(nodeName, params, shardHandler);
} }
shardRequestTracker.processResponses(results, shardHandler, true, String msgOnError = "SPLITSHARD failed while asking sub shard leaders to apply buffered updates";
"SPLITSHARD failed while asking sub shard leaders" + shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
" to apply buffered updates"); handleFailureOnAsyncRequest(results, msgOnError);
} }
t.stop(); t.stop();
@ -569,7 +578,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
{ {
final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker(); final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard replicas"); String msgOnError = "SPLITSHARD failed to create subshard replicas";
syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
} }
t.stop(); t.stop();
@ -599,6 +610,16 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
} }
} }
/**
 * Aborts the current operation if the accumulated results contain a failure.
 *
 * <p>In case of async requests, the ShardRequestTracker's processResponses() does not
 * abort on failure (as it should). Handling this here temporarily for now: callers invoke
 * this method right after processResponses() with the same results object.
 *
 * @param results the response accumulator; only its {@code "failure"} entry is inspected
 * @param msgOnError the message used for the exception raised when a failure is present
 * @throws SolrException with {@link ErrorCode#SERVER_ERROR} if {@code results} holds a
 *     non-null {@code "failure"} entry
 */
private void handleFailureOnAsyncRequest(NamedList<?> results, String msgOnError) {
  // Only the presence of the entry matters here; its content is not examined.
  if (results.get("failure") != null) {
    throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
  }
}
private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException { private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException {
// check that enough disk space is available on the parent leader node // check that enough disk space is available on the parent leader node
// otherwise the actual index splitting will always fail // otherwise the actual index splitting will always fail