From d606ffdea92513a29bd7d7a1af3cfdf556aae93c Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Thu, 29 Aug 2019 10:04:08 +0530 Subject: [PATCH] SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss When SPLITSHARD is issued asynchronously, any exception in a sub-operation isn't propagated and the overall SPLITSHARD task proceeds as if there were no failures. This results in marking the active parent shard inactive and can result in two empty sub-shards, thus causing data loss. --- solr/CHANGES.txt | 2 ++ .../collections/OverseerCollectionMessageHandler.java | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 030fc90624b..66f9132ffbc 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -129,6 +129,8 @@ Bug Fixes * SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble) +* SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss (Ishan Chattopadhyaya) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java index 6fbab131a5f..6ef7eb34ad1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java @@ -1033,12 +1033,17 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, // If request is async wait for the core admin to complete before returning if (asyncId != null) { - waitForAsyncCallsToComplete(results); + waitForAsyncCallsToComplete(results, true, msgOnError); shardAsyncIdByNode.clear(); } } private void waitForAsyncCallsToComplete(NamedList results) { + waitForAsyncCallsToComplete(results, false, null); + } + + private void waitForAsyncCallsToComplete(NamedList results, boolean abortOnFailure, String msgOnError) { + boolean failed = false; for (Map.Entry nodeToAsync:shardAsyncIdByNode) { final String node = nodeToAsync.getKey(); final String shardAsyncId = nodeToAsync.getValue(); @@ -1050,10 +1055,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) { log.error("Error from shard {}: {}", node, reqResult); addFailure(results, node, reqResult); + failed = true; } else { addSuccess(results, node, reqResult); } } + if (failed && abortOnFailure && msgOnError != null) { + throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError); + } } /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/