Stop Serializing Exceptions in SnapshotInfo (#57866) (#57898)

In ff9e8c622427d42a2d87b4ceb298d043ae3c4e6a we changed the format
used when serializing snapshot failures in the cluster state and
`SnapshotInfo`. This turned them from a short string holding all the
nested exception messages into a multi kb stacktrace in many cases.
This is not great if you snapshot a large number of shards that all fail
for example and massively blows up the size of the GET snapshots response
if there are snapshots with failures in there.
This change reverts to the format used for exceptions before the above commit.

Also, this change short circuits logging and serialization of the failure
for an aborted snapshot where we don't care about the specific message at all
and aligns the message to "aborted" in all cases (current if we aborted before any IO,
it would have been "aborted" and an exception when aborting later during IO).
This commit is contained in:
Armin Braun 2020-06-10 08:41:03 +02:00 committed by GitHub
parent 95bd7b63b0
commit d579420452
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 78 additions and 8 deletions

View File

@ -136,6 +136,7 @@ import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.endsWith;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.hasSize;
@ -3181,7 +3182,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
SnapshotInfo snapshotInfo = waitForCompletion(repo, snapshot, TimeValue.timeValueSeconds(60)); SnapshotInfo snapshotInfo = waitForCompletion(repo, snapshot, TimeValue.timeValueSeconds(60));
assertEquals(1, snapshotInfo.shardFailures().size()); assertEquals(1, snapshotInfo.shardFailures().size());
assertEquals(0, snapshotInfo.shardFailures().get(0).shardId()); assertEquals(0, snapshotInfo.shardFailures().get(0).shardId());
assertThat(snapshotInfo.shardFailures().get(0).reason(), containsString("IndexShardSnapshotFailedException[Aborted]")); assertThat(snapshotInfo.shardFailures().get(0).reason(), is("aborted"));
} }
public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception { public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
@ -3223,7 +3224,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
assertThat(getFailureCount("test-repo"), greaterThan(0L)); assertThat(getFailureCount("test-repo"), greaterThan(0L));
assertThat(createSnapshotResponse.getSnapshotInfo().shardFailures().size(), greaterThan(0)); assertThat(createSnapshotResponse.getSnapshotInfo().shardFailures().size(), greaterThan(0));
for (SnapshotShardFailure shardFailure : createSnapshotResponse.getSnapshotInfo().shardFailures()) { for (SnapshotShardFailure shardFailure : createSnapshotResponse.getSnapshotInfo().shardFailures()) {
assertThat(shardFailure.reason(), containsString("Random IOException")); assertThat(shardFailure.reason(), endsWith("; nested: IOException[Random IOException]"));
} }
} }
} catch (SnapshotException | RepositoryException ex) { } catch (SnapshotException | RepositoryException ex) {

View File

@ -108,6 +108,7 @@ import org.elasticsearch.repositories.RepositoryStats;
import org.elasticsearch.repositories.RepositoryVerificationException; import org.elasticsearch.repositories.RepositoryVerificationException;
import org.elasticsearch.snapshots.SnapshotCreationException; import org.elasticsearch.snapshots.SnapshotCreationException;
import org.elasticsearch.repositories.ShardGenerations; import org.elasticsearch.repositories.ShardGenerations;
import org.elasticsearch.snapshots.AbortedSnapshotException;
import org.elasticsearch.snapshots.SnapshotException; import org.elasticsearch.snapshots.SnapshotException;
import org.elasticsearch.snapshots.SnapshotId; import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo; import org.elasticsearch.snapshots.SnapshotInfo;
@ -1721,7 +1722,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
for (String fileName : fileNames) { for (String fileName : fileNames) {
if (snapshotStatus.isAborted()) { if (snapshotStatus.isAborted()) {
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName); logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName);
throw new IndexShardSnapshotFailedException(shardId, "Aborted"); throw new AbortedSnapshotException();
} }
logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName); logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
@ -1865,7 +1866,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) { private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) {
if (store.tryIncRef() == false) { if (store.tryIncRef() == false) {
if (snapshotStatus.isAborted()) { if (snapshotStatus.isAborted()) {
throw new IndexShardSnapshotFailedException(shardId, "Aborted"); throw new AbortedSnapshotException();
} else { } else {
assert false : "Store should not be closed concurrently unless snapshot is aborted"; assert false : "Store should not be closed concurrently unless snapshot is aborted";
throw new IndexShardSnapshotFailedException(shardId, "Store got closed concurrently"); throw new IndexShardSnapshotFailedException(shardId, "Store got closed concurrently");
@ -2199,7 +2200,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
if (snapshotStatus.isAborted()) { if (snapshotStatus.isAborted()) {
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId,
snapshotId, fileInfo.physicalName()); snapshotId, fileInfo.physicalName());
throw new IndexShardSnapshotFailedException(shardId, "Aborted"); throw new AbortedSnapshotException();
} }
} }
}; };

View File

@ -0,0 +1,25 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.snapshots;
public final class AbortedSnapshotException extends RuntimeException {
public AbortedSnapshotException() {
super("aborted");
}
}

View File

@ -24,7 +24,6 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexCommit;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version; import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRequestValidationException; import org.elasticsearch.action.ActionRequestValidationException;
@ -302,9 +301,15 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements
@Override @Override
public void onFailure(Exception e) { public void onFailure(Exception e) {
final String failure = ExceptionsHelper.stackTrace(e); final String failure;
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure); if (e instanceof AbortedSnapshotException) {
failure = "aborted";
logger.debug(() -> new ParameterizedMessage("[{}][{}] aborted shard snapshot", shardId, snapshot), e);
} else {
failure = summarizeFailure(e);
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e); logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
}
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
notifyFailedSnapshotShard(snapshot, shardId, failure); notifyFailedSnapshotShard(snapshot, shardId, failure);
} }
}); });
@ -312,6 +317,28 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements
}); });
} }
//package private for testing
static String summarizeFailure(Throwable t) {
if (t.getCause() == null) {
return t.getClass().getSimpleName() + "[" + t.getMessage() + "]";
} else {
StringBuilder sb = new StringBuilder();
while (t != null) {
sb.append(t.getClass().getSimpleName());
if (t.getMessage() != null) {
sb.append("[");
sb.append(t.getMessage());
sb.append("]");
}
t = t.getCause();
if (t != null) {
sb.append("; nested: ");
}
}
return sb.toString();
}
}
/** /**
* Creates shard snapshot * Creates shard snapshot
* *

View File

@ -24,8 +24,24 @@ import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.EqualsHashCodeTestUtils; import org.elasticsearch.test.EqualsHashCodeTestUtils;
import java.io.IOException;
import static org.hamcrest.Matchers.is;
public class SnapshotShardsServiceTests extends ESTestCase { public class SnapshotShardsServiceTests extends ESTestCase {
public void testSummarizeFailure() {
final RuntimeException wrapped = new RuntimeException("wrapped");
assertThat(SnapshotShardsService.summarizeFailure(wrapped), is("RuntimeException[wrapped]"));
final RuntimeException wrappedWithNested = new RuntimeException("wrapped", new IOException("nested"));
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithNested),
is("RuntimeException[wrapped]; nested: IOException[nested]"));
final RuntimeException wrappedWithTwoNested =
new RuntimeException("wrapped", new IOException("nested", new IOException("root")));
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithTwoNested),
is("RuntimeException[wrapped]; nested: IOException[nested]; nested: IOException[root]"));
}
public void testEqualsAndHashcodeUpdateIndexShardSnapshotStatusRequest() { public void testEqualsAndHashcodeUpdateIndexShardSnapshotStatusRequest() {
EqualsHashCodeTestUtils.checkEqualsAndHashCode( EqualsHashCodeTestUtils.checkEqualsAndHashCode(
new SnapshotShardsService.UpdateIndexShardSnapshotStatusRequest( new SnapshotShardsService.UpdateIndexShardSnapshotStatusRequest(