SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss

When SPLITSHARD is issued asynchronously, exceptions in its sub-operations are not propagated, so the overall
  SPLITSHARD task proceeds as if there were no failures. The active parent shard is then marked inactive,
  which can leave two empty sub-shards and thus cause data loss.
This commit is contained in:
Ishan Chattopadhyaya 2019-08-29 10:04:08 +05:30
parent da02e9f83c
commit d606ffdea9
2 changed files with 12 additions and 1 deletions

View File

@ -129,6 +129,8 @@ Bug Fixes
* SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble) * SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble)
* SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss (Ishan Chattopadhyaya)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -1033,12 +1033,17 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
// If request is async wait for the core admin to complete before returning // If request is async wait for the core admin to complete before returning
if (asyncId != null) { if (asyncId != null) {
waitForAsyncCallsToComplete(results); waitForAsyncCallsToComplete(results, true, msgOnError);
shardAsyncIdByNode.clear(); shardAsyncIdByNode.clear();
} }
} }
private void waitForAsyncCallsToComplete(NamedList<Object> results) { private void waitForAsyncCallsToComplete(NamedList<Object> results) {
waitForAsyncCallsToComplete(results, false, null);
}
private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
boolean failed = false;
for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) { for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
final String node = nodeToAsync.getKey(); final String node = nodeToAsync.getKey();
final String shardAsyncId = nodeToAsync.getValue(); final String shardAsyncId = nodeToAsync.getValue();
@ -1050,10 +1055,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) { if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
log.error("Error from shard {}: {}", node, reqResult); log.error("Error from shard {}: {}", node, reqResult);
addFailure(results, node, reqResult); addFailure(results, node, reqResult);
failed = true;
} else { } else {
addSuccess(results, node, reqResult); addSuccess(results, node, reqResult);
} }
} }
if (failed && abortOnFailure && msgOnError != null) {
throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
}
} }
/** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/ /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/