mirror of https://github.com/apache/lucene.git
SOLR-13718: A more targeted fix for SPLITSHARD, thereby avoiding Backup/Restore test failures
parent 43d489cb4a
commit f27665198a
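The earlier SOLR-13718 fix made the shared async-wait path in OverseerCollectionMessageHandler abort on any sub-request failure, a path other collection commands (evidently including the Backup/Restore tests the title refers to) also go through. This commit reverts that and confines the abort to SPLITSHARD: each processResponses call site in SplitShardCmd now pulls its error message into a msgOnError variable and follows up with a new handleFailureOnAsyncRequest check. A minimal self-contained sketch of that shape (plain Java; a HashMap stands in for Solr's NamedList and RuntimeException for SolrException, so everything except the handleFailureOnAsyncRequest and msgOnError names is a stand-in):

    import java.util.HashMap;
    import java.util.Map;

    public class SplitFailureCheckSketch {
        // Mirrors the helper added at the bottom of this diff: abort with the
        // step-specific message if any sub-request recorded a failure.
        static void handleFailureOnAsyncRequest(Map<String, Object> results, String msgOnError) {
            if (results.get("failure") != null) {
                throw new RuntimeException(msgOnError); // SolrException(ErrorCode.SERVER_ERROR, ...) in Solr
            }
        }

        public static void main(String[] args) {
            Map<String, Object> results = new HashMap<>();
            // Every SPLITSHARD step in this diff now follows the same three-line shape:
            String msgOnError = "SPLITSHARD failed to invoke SPLIT core admin command";
            // ... processResponses(results, shardHandler, true, msgOnError) would run here,
            // recording any async failure under the "failure" key without throwing ...
            results.put("failure", "simulated: a node reported status=failed");
            handleFailureOnAsyncRequest(results, msgOnError); // throws, aborting the split
        }
    }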
solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java

@@ -1033,17 +1033,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
       // If request is async wait for the core admin to complete before returning
       if (asyncId != null) {
-        waitForAsyncCallsToComplete(results, true, msgOnError);
+        waitForAsyncCallsToComplete(results); // TODO: Shouldn't we abort with msgOnError exception when failure?
         shardAsyncIdByNode.clear();
       }
     }

     private void waitForAsyncCallsToComplete(NamedList<Object> results) {
-      waitForAsyncCallsToComplete(results, false, null);
-    }
-
-    private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
-      boolean failed = false;
       for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
         final String node = nodeToAsync.getKey();
         final String shardAsyncId = nodeToAsync.getValue();

@@ -1055,14 +1050,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
         if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
           log.error("Error from shard {}: {}", node, reqResult);
           addFailure(results, node, reqResult);
-          failed = true;
         } else {
           addSuccess(results, node, reqResult);
         }
       }
-      if (failed && abortOnFailure && msgOnError != null) {
-        throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
-      }
     }

     /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/
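With the abortOnFailure variant gone, the tracker's wait loop (the two hunks above, read together) only records each node's outcome via addFailure/addSuccess and returns normally even when a shard reports status=failed; the TODO in the first hunk flags that as a known gap. A self-contained model of that behavior (hypothetical names; plain Maps replace Solr's NamedList, and the status polling the diff elides between the hunks is reduced to a precomputed status map):

    import java.util.HashMap;
    import java.util.LinkedHashMap;
    import java.util.Map;

    public class WaitLoopSketch {
        // Models the simplified waitForAsyncCallsToComplete: classify every
        // node's final async status as failure or success, never throw.
        static void waitForAsyncCallsToComplete(Map<String, String> statusByNode,
                                                Map<String, Object> results) {
            for (Map.Entry<String, String> nodeToAsync : statusByNode.entrySet()) {
                if ("failed".equalsIgnoreCase(nodeToAsync.getValue())) {
                    results.put("failure", nodeToAsync.getKey() + " -> " + nodeToAsync.getValue());
                } else {
                    results.put("success", nodeToAsync.getKey() + " -> " + nodeToAsync.getValue());
                }
            }
        }

        public static void main(String[] args) {
            Map<String, String> statusByNode = new LinkedHashMap<>();
            statusByNode.put("127.0.0.1:8983_solr", "completed");
            statusByNode.put("127.0.0.1:7574_solr", "failed");
            Map<String, Object> results = new HashMap<>();
            waitForAsyncCallsToComplete(statusByNode, results);
            // No exception: the failure is only visible to callers who look for it.
            System.out.println("failure entry: " + results.get("failure"));
        }
    }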
solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java

@@ -43,6 +43,7 @@ import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.CompositeIdRouter;
 import org.apache.solr.common.cloud.DocCollection;

@@ -225,7 +226,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
     shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);
     SimpleOrderedMap<Object> getRangesResults = new SimpleOrderedMap<>();
-    shardRequestTracker.processResponses(getRangesResults, shardHandler, true, "SPLITSHARD failed to invoke SPLIT.getRanges core admin command");
+    String msgOnError = "SPLITSHARD failed to invoke SPLIT.getRanges core admin command";
+    shardRequestTracker.processResponses(getRangesResults, shardHandler, true, msgOnError);
+    handleFailureOnAsyncRequest(results, msgOnError);

     // Extract the recommended splits from the shard response (if it exists)
     // example response: getRangesResults={success={127.0.0.1:62086_solr={responseHeader={status=0,QTime=1},ranges=10-20,3a-3f}}}

@@ -339,7 +342,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {

     {
       final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
-      syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard leaders");
+      String msgOnError = "SPLITSHARD failed to create subshard leaders";
+      syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+      handleFailureOnAsyncRequest(results, msgOnError);
     }
     t.stop();
     t = timings.sub("waitForSubSliceLeadersAlive");

@@ -361,7 +366,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
        shardRequestTracker.sendShardRequest(nodeName, p, shardHandler);
      }

-      shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD timed out waiting for subshard leaders to come up");
+      String msgOnError = "SPLITSHARD timed out waiting for subshard leaders to come up";
+      shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+      handleFailureOnAsyncRequest(results, msgOnError);
     }
     t.stop();

@@ -386,7 +393,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
      final ShardRequestTracker shardRequestTracker = ocmh.asyncRequestTracker(asyncId);
      shardRequestTracker.sendShardRequest(parentShardLeader.getNodeName(), params, shardHandler);

-      shardRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to invoke SPLIT core admin command");
+      String msgOnError = "SPLITSHARD failed to invoke SPLIT core admin command";
+      shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+      handleFailureOnAsyncRequest(results, msgOnError);
     }
     t.stop();

@@ -409,9 +418,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
        shardRequestTracker.sendShardRequest(nodeName, params, shardHandler);
      }

-      shardRequestTracker.processResponses(results, shardHandler, true,
-          "SPLITSHARD failed while asking sub shard leaders" +
-          " to apply buffered updates");
+      String msgOnError = "SPLITSHARD failed while asking sub shard leaders to apply buffered updates";
+      shardRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+      handleFailureOnAsyncRequest(results, msgOnError);
     }
     t.stop();

@@ -569,7 +578,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {

     {
       final ShardRequestTracker syncRequestTracker = ocmh.syncRequestTracker();
-      syncRequestTracker.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard replicas");
+      String msgOnError = "SPLITSHARD failed to create subshard replicas";
+      syncRequestTracker.processResponses(results, shardHandler, true, msgOnError);
+      handleFailureOnAsyncRequest(results, msgOnError);
     }
     t.stop();

@@ -599,6 +610,16 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     }
   }

+  /**
+   * In case of async requests, the ShardRequestTracker's processResponses() does not
+   * abort on failure (as it should). Handling this here temporarily for now.
+   */
+  private void handleFailureOnAsyncRequest(NamedList results, String msgOnError) {
+    Object splitResultFailure = results.get("failure");
+    if (splitResultFailure != null) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
+    }
+  }
   private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException {
     // check that enough disk space is available on the parent leader node
     // otherwise the actual index splitting will always fail
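As the javadoc above concedes, handleFailureOnAsyncRequest is a stopgap until processResponses can abort for async requests itself, and its whole contract is a null check on the "failure" key of the results. A small sketch exercising both sides of that contract (again with plain Maps and RuntimeException standing in for NamedList and SolrException):

    import java.util.HashMap;
    import java.util.Map;

    public class HandleFailureContractSketch {
        static void handleFailureOnAsyncRequest(Map<String, Object> results, String msgOnError) {
            if (results.get("failure") != null) {
                throw new RuntimeException(msgOnError);
            }
        }

        public static void main(String[] args) {
            // No "failure" entry: the helper is a no-op and the split proceeds.
            handleFailureOnAsyncRequest(new HashMap<>(), "unused");

            // Any "failure" entry aborts with the step's message.
            Map<String, Object> results = new HashMap<>();
            results.put("failure", "127.0.0.1:8983_solr reported status=failed");
            try {
                handleFailureOnAsyncRequest(results, "SPLITSHARD failed to create subshard replicas");
            } catch (RuntimeException expected) {
                System.out.println("aborted: " + expected.getMessage());
            }
        }
    }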