SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures

This commit is contained in:
Ishan Chattopadhyaya 2019-08-29 17:20:46 +05:30
parent 43d489cb4a
commit f27665198a
2 changed files with 30 additions and 18 deletions

View File

@ -1033,17 +1033,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
// If request is async wait for the core admin to complete before returning
if (asyncId != null) {
waitForAsyncCallsToComplete(results, true, msgOnError);
waitForAsyncCallsToComplete(results); // TODO: Shouldn't we abort with msgOnError exception when failure?
shardAsyncIdByNode.clear();
}
}
private void waitForAsyncCallsToComplete(NamedList<Object> results) {
waitForAsyncCallsToComplete(results, false, null);
}
private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
boolean failed = false;
for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
final String node = nodeToAsync.getKey();
final String shardAsyncId = nodeToAsync.getValue();
@ -1055,14 +1050,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
log.error("Error from shard {}: {}", node, reqResult);
addFailure(results, node, reqResult);
failed = true;
} else {
addSuccess(results, node, reqResult);
}
}
if (failed && abortOnFailure && msgOnError != null) {
throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
}
}
/** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/

View File

@ -43,6 +43,7 @@ import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.CompositeIdRouter;
import org.apache.solr.common.cloud.DocCollection;
@ -225,7 +226,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
SimpleOrderedMap<Object> getRangesResults = new SimpleOrderedMap<>();
shardRequestTracker.processResponses(getRangesResults, shardHandler, true, "SPLITSHARD failed to invoke SPLIT.getRanges core admin command");
String msgOnError = "SPLITSHARD failed to invoke SPLIT.getRanges core admin command";
shardRequestTracker.processResponses(getRangesResults, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
// Extract the recommended splits from the shard response (if it exists)
// example response: getRangesResults={success={127.0.0.1:62086_solr={responseHeader={status=0,QTime=1},ranges=10-20,3a-3f}}}
@ -339,7 +342,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
{
final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard leaders");
String msgOnError = "SPLITSHARD failed to create subshard leaders";
syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
}
t.stop();
t = timings.sub("waitForSubSliceLeadersAlive");
@ -361,7 +366,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
shardRequestTracker.sendShardRequest(nodeName, p, shardHandler);
}
shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD timed out waiting for subshard leaders to come up");
String msgOnError = "SPLITSHARD timed out waiting for subshard leaders to come up";
shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
}
t.stop();
@ -386,7 +393,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to invoke SPLIT core admin command");
String msgOnError = "SPLITSHARD failed to invoke SPLIT core admin command";
shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
}
t.stop();
@ -409,9 +418,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
shardRequestTracker.sendShardRequest(nodeName, params, shardHandler);
}
shardRequestTracker.processResponses(results, shardHandler, true,
"SPLITSHARD failed while asking sub shard leaders" +
" to apply buffered updates");
String msgOnError = "SPLITSHARD failed while asking sub shard leaders to apply buffered updates";
shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
}
t.stop();
@ -569,7 +578,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
{
final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard replicas");
String msgOnError = "SPLITSHARD failed to create subshard replicas";
syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
handleFailureOnAsyncRequest(results, msgOnError);
}
t.stop();
@ -599,6 +610,16 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
}
/**
* In case of async requests, the ShardRequestTracker's processResponses() does not
* abort on failure (as it should). Handling this here temporarily for now.
*/
private void handleFailureOnAsyncRequest(NamedList results, String msgOnError) {
Object splitResultFailure = results.get("failure");
if (splitResultFailure != null) {
throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
}
}
private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException {
// check that enough disk space is available on the parent leader node
// otherwise the actual index splitting will always fail