From d57942045255cbfd5bc6236aec36323803fb21af Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Wed, 10 Jun 2020 08:41:03 +0200 Subject: [PATCH] Stop Serializing Exceptions in SnapshotInfo (#57866) (#57898) In ff9e8c622427d42a2d87b4ceb298d043ae3c4e6a we changed the format used when serializing snapshot failures in the cluster state and `SnapshotInfo`. This turned them from a short string holding all the nested exception messages into a multi kb stacktrace in many cases. This is not great if you snapshot a large number of shards that all fail for example and massively blows up the size of the GET snapshots response if there are snapshots with failures in there. This change reverts to the format used for exceptions before the above commit. Also, this change short circuits logging and serialization of the failure for an aborted snapshot where we don't care about the specific message at all and aligns the message to "aborted" in all cases (current if we aborted before any IO, it would have been "aborted" and an exception when aborting later during IO). --- .../SharedClusterSnapshotRestoreIT.java | 5 +-- .../blobstore/BlobStoreRepository.java | 7 ++-- .../snapshots/AbortedSnapshotException.java | 25 ++++++++++++++ .../snapshots/SnapshotShardsService.java | 33 +++++++++++++++++-- .../snapshots/SnapshotShardsServiceTests.java | 16 +++++++++ 5 files changed, 78 insertions(+), 8 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/snapshots/AbortedSnapshotException.java diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java index fbdae997b62..3c41309701e 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java @@ -136,6 +136,7 @@ import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.endsWith; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.hasSize; @@ -3181,7 +3182,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas SnapshotInfo snapshotInfo = waitForCompletion(repo, snapshot, TimeValue.timeValueSeconds(60)); assertEquals(1, snapshotInfo.shardFailures().size()); assertEquals(0, snapshotInfo.shardFailures().get(0).shardId()); - assertThat(snapshotInfo.shardFailures().get(0).reason(), containsString("IndexShardSnapshotFailedException[Aborted]")); + assertThat(snapshotInfo.shardFailures().get(0).reason(), is("aborted")); } public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception { @@ -3223,7 +3224,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas assertThat(getFailureCount("test-repo"), greaterThan(0L)); assertThat(createSnapshotResponse.getSnapshotInfo().shardFailures().size(), greaterThan(0)); for (SnapshotShardFailure shardFailure : createSnapshotResponse.getSnapshotInfo().shardFailures()) { - assertThat(shardFailure.reason(), containsString("Random IOException")); + assertThat(shardFailure.reason(), endsWith("; nested: IOException[Random IOException]")); } } } catch (SnapshotException | RepositoryException ex) { diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 979ba60e843..a2f7ee4158c 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -108,6 +108,7 @@ import org.elasticsearch.repositories.RepositoryStats; import org.elasticsearch.repositories.RepositoryVerificationException; import org.elasticsearch.snapshots.SnapshotCreationException; import org.elasticsearch.repositories.ShardGenerations; +import org.elasticsearch.snapshots.AbortedSnapshotException; import org.elasticsearch.snapshots.SnapshotException; import org.elasticsearch.snapshots.SnapshotId; import org.elasticsearch.snapshots.SnapshotInfo; @@ -1721,7 +1722,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp for (String fileName : fileNames) { if (snapshotStatus.isAborted()) { logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName); - throw new IndexShardSnapshotFailedException(shardId, "Aborted"); + throw new AbortedSnapshotException(); } logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName); @@ -1865,7 +1866,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) { if (store.tryIncRef() == false) { if (snapshotStatus.isAborted()) { - throw new IndexShardSnapshotFailedException(shardId, "Aborted"); + throw new AbortedSnapshotException(); } else { assert false : "Store should not be closed concurrently unless snapshot is aborted"; throw new IndexShardSnapshotFailedException(shardId, "Store got closed concurrently"); @@ -2199,7 +2200,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp if (snapshotStatus.isAborted()) { logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileInfo.physicalName()); - throw new IndexShardSnapshotFailedException(shardId, "Aborted"); + throw new AbortedSnapshotException(); } } }; diff --git a/server/src/main/java/org/elasticsearch/snapshots/AbortedSnapshotException.java b/server/src/main/java/org/elasticsearch/snapshots/AbortedSnapshotException.java new file mode 100644 index 00000000000..3e24b68059b --- /dev/null +++ b/server/src/main/java/org/elasticsearch/snapshots/AbortedSnapshotException.java @@ -0,0 +1,25 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.snapshots; + +public final class AbortedSnapshotException extends RuntimeException { + public AbortedSnapshotException() { + super("aborted"); + } +} diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java index cdeb978cf9a..a2a1d0628f7 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java @@ -24,7 +24,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.index.IndexCommit; -import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.Version; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionRequestValidationException; @@ -302,9 +301,15 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements @Override public void onFailure(Exception e) { - final String failure = ExceptionsHelper.stackTrace(e); + final String failure; + if (e instanceof AbortedSnapshotException) { + failure = "aborted"; + logger.debug(() -> new ParameterizedMessage("[{}][{}] aborted shard snapshot", shardId, snapshot), e); + } else { + failure = summarizeFailure(e); + logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e); + } snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure); - logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e); notifyFailedSnapshotShard(snapshot, shardId, failure); } }); @@ -312,6 +317,28 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements }); } + //package private for testing + static String summarizeFailure(Throwable t) { + if (t.getCause() == null) { + return t.getClass().getSimpleName() + "[" + t.getMessage() + "]"; + } else { + StringBuilder sb = new StringBuilder(); + while (t != null) { + sb.append(t.getClass().getSimpleName()); + if (t.getMessage() != null) { + sb.append("["); + sb.append(t.getMessage()); + sb.append("]"); + } + t = t.getCause(); + if (t != null) { + sb.append("; nested: "); + } + } + return sb.toString(); + } + } + /** * Creates shard snapshot * diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotShardsServiceTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotShardsServiceTests.java index f1b4928c0e2..5a513d16036 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotShardsServiceTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotShardsServiceTests.java @@ -24,8 +24,24 @@ import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.EqualsHashCodeTestUtils; +import java.io.IOException; + +import static org.hamcrest.Matchers.is; + public class SnapshotShardsServiceTests extends ESTestCase { + public void testSummarizeFailure() { + final RuntimeException wrapped = new RuntimeException("wrapped"); + assertThat(SnapshotShardsService.summarizeFailure(wrapped), is("RuntimeException[wrapped]")); + final RuntimeException wrappedWithNested = new RuntimeException("wrapped", new IOException("nested")); + assertThat(SnapshotShardsService.summarizeFailure(wrappedWithNested), + is("RuntimeException[wrapped]; nested: IOException[nested]")); + final RuntimeException wrappedWithTwoNested = + new RuntimeException("wrapped", new IOException("nested", new IOException("root"))); + assertThat(SnapshotShardsService.summarizeFailure(wrappedWithTwoNested), + is("RuntimeException[wrapped]; nested: IOException[nested]; nested: IOException[root]")); + } + public void testEqualsAndHashcodeUpdateIndexShardSnapshotStatusRequest() { EqualsHashCodeTestUtils.checkEqualsAndHashCode( new SnapshotShardsService.UpdateIndexShardSnapshotStatusRequest(