From a8d5bd34bf494da8f59baea52f6578bf3ba44ce8 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Thu, 29 Aug 2019 10:04:08 +0530 Subject: [PATCH] SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss When SPLITSHARD is issued asynchronously, any exception in a sub-operation isn't propagated and the overall SPLITSHARD task proceeds as if there were no failures. This results in marking the active parent shard inactive and can result in two empty sub-shards, thus causing data loss. --- solr/CHANGES.txt | 2 ++ .../collections/OverseerCollectionMessageHandler.java | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index f8498497219..8c0fe1a4c0a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -182,6 +182,8 @@ Bug Fixes * SOLR-13699 - maxChars no longer working on CopyField with javabin (Chris Troullis via noble) +* SOLR-13718: SPLITSHARD (async) with failures in underlying sub-operations can result in data loss (Ishan Chattopadhyaya) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java index 6fbab131a5f..6ef7eb34ad1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java @@ -1033,12 +1033,17 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, // If request is async wait for the core admin to complete before returning if (asyncId != null) { - waitForAsyncCallsToComplete(results); + waitForAsyncCallsToComplete(results, true, msgOnError); shardAsyncIdByNode.clear(); } } private void waitForAsyncCallsToComplete(NamedList results) { + waitForAsyncCallsToComplete(results, false, null); + } + + private void waitForAsyncCallsToComplete(NamedList results, boolean abortOnFailure, String msgOnError) { + boolean failed = false; for (Map.Entry nodeToAsync:shardAsyncIdByNode) { final String node = nodeToAsync.getKey(); final String shardAsyncId = nodeToAsync.getValue(); @@ -1050,10 +1055,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, if ("failed".equalsIgnoreCase(((String)reqResult.get("STATUS")))) { log.error("Error from shard {}: {}", node, reqResult); addFailure(results, node, reqResult); + failed = true; } else { addSuccess(results, node, reqResult); } } + if (failed && abortOnFailure && msgOnError != null) { + throw new SolrException(ErrorCode.SERVER_ERROR, msgOnError); + } } /** @deprecated consider to make it private after {@link CreateCollectionCmd} refactoring*/