mirror of https://github.com/apache/lucene.git
SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss
When SPLITSHARD is issued asynchronously, an exception in any of its sub-operations is not propagated, so the overall SPLITSHARD task proceeds as if there had been no failures. As a result, the active parent shard is marked inactive even though the split did not succeed, which can leave two empty sub-shards and thus cause data loss.
This commit is contained in:
parent
da02e9f83c
commit
d606ffdea9
|
@@ -129,6 +129,8 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble)
|
* SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble)
|
||||||
|
|
||||||
|
* SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss (Ishan Chattopadhyaya)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@@ -1033,12 +1033,17 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
|
||||||
|
|
||||||
// If request is async wait for the core admin to complete before returning
|
// If request is async wait for the core admin to complete before returning
|
||||||
if (asyncId != null) {
|
if (asyncId != null) {
|
||||||
waitForAsyncCallsToComplete(results);
|
waitForAsyncCallsToComplete(results, true, msgOnError);
|
||||||
shardAsyncIdByNode.clear();
|
shardAsyncIdByNode.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForAsyncCallsToComplete(NamedList<Object> results) {
|
private void waitForAsyncCallsToComplete(NamedList<Object> results) {
|
||||||
|
waitForAsyncCallsToComplete(results, false, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForAsyncCallsToComplete(NamedList<Object> results, boolean abortOnFailure, String msgOnError) {
|
||||||
|
boolean failed = false;
|
||||||
for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
|
for (Map.Entry<String,String> nodeToAsync:shardAsyncIdByNode) {
|
||||||
final String node = nodeToAsync.getKey();
|
final String node = nodeToAsync.getKey();
|
||||||
final String shardAsyncId = nodeToAsync.getValue();
|
final String shardAsyncId = nodeToAsync.getValue();
|
||||||
|
@@ -1050,10 +1055,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
|
||||||
if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
|
if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) {
|
||||||
log.error("Error from shard {}: {}", node, reqResult);
|
log.error("Error from shard {}: {}", node, reqResult);
|
||||||
addFailure(results, node, reqResult);
|
addFailure(results, node, reqResult);
|
||||||
|
failed = true;
|
||||||
} else {
|
} else {
|
||||||
addSuccess(results, node, reqResult);
|
addSuccess(results, node, reqResult);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (failed && abortOnFailure && msgOnError != null) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/
|
/** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/
|
||||||
|
|
Loading…
Reference in New Issue