In ff9e8c622427d42a2d87b4ceb298d043ae3c4e6a we changed the format used when serializing snapshot failures in the cluster state and `SnapshotInfo`. This turned them from a short string holding all the nested exception messages into a multi kb stacktrace in many cases. This is not great if you snapshot a large number of shards that all fail for example and massively blows up the size of the GET snapshots response if there are snapshots with failures in there. This change reverts to the format used for exceptions before the above commit. Also, this change short circuits logging and serialization of the failure for an aborted snapshot where we don't care about the specific message at all and aligns the message to "aborted" in all cases (current if we aborted before any IO, it would have been "aborted" and an exception when aborting later during IO).
This commit is contained in:
parent
95bd7b63b0
commit
d579420452
|
@ -136,6 +136,7 @@ import static org.hamcrest.Matchers.anyOf;
|
|||
import static org.hamcrest.Matchers.containsInAnyOrder;
|
||||
import static org.hamcrest.Matchers.containsString;
|
||||
import static org.hamcrest.Matchers.empty;
|
||||
import static org.hamcrest.Matchers.endsWith;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.greaterThan;
|
||||
import static org.hamcrest.Matchers.hasSize;
|
||||
|
@ -3181,7 +3182,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
|||
SnapshotInfo snapshotInfo = waitForCompletion(repo, snapshot, TimeValue.timeValueSeconds(60));
|
||||
assertEquals(1, snapshotInfo.shardFailures().size());
|
||||
assertEquals(0, snapshotInfo.shardFailures().get(0).shardId());
|
||||
assertThat(snapshotInfo.shardFailures().get(0).reason(), containsString("IndexShardSnapshotFailedException[Aborted]"));
|
||||
assertThat(snapshotInfo.shardFailures().get(0).reason(), is("aborted"));
|
||||
}
|
||||
|
||||
public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
|
||||
|
@ -3223,7 +3224,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
|||
assertThat(getFailureCount("test-repo"), greaterThan(0L));
|
||||
assertThat(createSnapshotResponse.getSnapshotInfo().shardFailures().size(), greaterThan(0));
|
||||
for (SnapshotShardFailure shardFailure : createSnapshotResponse.getSnapshotInfo().shardFailures()) {
|
||||
assertThat(shardFailure.reason(), containsString("Random IOException"));
|
||||
assertThat(shardFailure.reason(), endsWith("; nested: IOException[Random IOException]"));
|
||||
}
|
||||
}
|
||||
} catch (SnapshotException | RepositoryException ex) {
|
||||
|
|
|
@ -108,6 +108,7 @@ import org.elasticsearch.repositories.RepositoryStats;
|
|||
import org.elasticsearch.repositories.RepositoryVerificationException;
|
||||
import org.elasticsearch.snapshots.SnapshotCreationException;
|
||||
import org.elasticsearch.repositories.ShardGenerations;
|
||||
import org.elasticsearch.snapshots.AbortedSnapshotException;
|
||||
import org.elasticsearch.snapshots.SnapshotException;
|
||||
import org.elasticsearch.snapshots.SnapshotId;
|
||||
import org.elasticsearch.snapshots.SnapshotInfo;
|
||||
|
@ -1721,7 +1722,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
|||
for (String fileName : fileNames) {
|
||||
if (snapshotStatus.isAborted()) {
|
||||
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName);
|
||||
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
|
||||
throw new AbortedSnapshotException();
|
||||
}
|
||||
|
||||
logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
|
||||
|
@ -1865,7 +1866,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
|||
private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) {
|
||||
if (store.tryIncRef() == false) {
|
||||
if (snapshotStatus.isAborted()) {
|
||||
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
|
||||
throw new AbortedSnapshotException();
|
||||
} else {
|
||||
assert false : "Store should not be closed concurrently unless snapshot is aborted";
|
||||
throw new IndexShardSnapshotFailedException(shardId, "Store got closed concurrently");
|
||||
|
@ -2199,7 +2200,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
|||
if (snapshotStatus.isAborted()) {
|
||||
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId,
|
||||
snapshotId, fileInfo.physicalName());
|
||||
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
|
||||
throw new AbortedSnapshotException();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.snapshots;
|
||||
|
||||
public final class AbortedSnapshotException extends RuntimeException {
|
||||
public AbortedSnapshotException() {
|
||||
super("aborted");
|
||||
}
|
||||
}
|
|
@ -24,7 +24,6 @@ import org.apache.logging.log4j.LogManager;
|
|||
import org.apache.logging.log4j.Logger;
|
||||
import org.apache.logging.log4j.message.ParameterizedMessage;
|
||||
import org.apache.lucene.index.IndexCommit;
|
||||
import org.elasticsearch.ExceptionsHelper;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
import org.elasticsearch.action.ActionRequestValidationException;
|
||||
|
@ -302,9 +301,15 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements
|
|||
|
||||
@Override
|
||||
public void onFailure(Exception e) {
|
||||
final String failure = ExceptionsHelper.stackTrace(e);
|
||||
final String failure;
|
||||
if (e instanceof AbortedSnapshotException) {
|
||||
failure = "aborted";
|
||||
logger.debug(() -> new ParameterizedMessage("[{}][{}] aborted shard snapshot", shardId, snapshot), e);
|
||||
} else {
|
||||
failure = summarizeFailure(e);
|
||||
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
|
||||
}
|
||||
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
|
||||
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
|
||||
notifyFailedSnapshotShard(snapshot, shardId, failure);
|
||||
}
|
||||
});
|
||||
|
@ -312,6 +317,28 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements
|
|||
});
|
||||
}
|
||||
|
||||
//package private for testing
|
||||
static String summarizeFailure(Throwable t) {
|
||||
if (t.getCause() == null) {
|
||||
return t.getClass().getSimpleName() + "[" + t.getMessage() + "]";
|
||||
} else {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (t != null) {
|
||||
sb.append(t.getClass().getSimpleName());
|
||||
if (t.getMessage() != null) {
|
||||
sb.append("[");
|
||||
sb.append(t.getMessage());
|
||||
sb.append("]");
|
||||
}
|
||||
t = t.getCause();
|
||||
if (t != null) {
|
||||
sb.append("; nested: ");
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates shard snapshot
|
||||
*
|
||||
|
|
|
@ -24,8 +24,24 @@ import org.elasticsearch.index.shard.ShardId;
|
|||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.EqualsHashCodeTestUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.hamcrest.Matchers.is;
|
||||
|
||||
public class SnapshotShardsServiceTests extends ESTestCase {
|
||||
|
||||
public void testSummarizeFailure() {
|
||||
final RuntimeException wrapped = new RuntimeException("wrapped");
|
||||
assertThat(SnapshotShardsService.summarizeFailure(wrapped), is("RuntimeException[wrapped]"));
|
||||
final RuntimeException wrappedWithNested = new RuntimeException("wrapped", new IOException("nested"));
|
||||
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithNested),
|
||||
is("RuntimeException[wrapped]; nested: IOException[nested]"));
|
||||
final RuntimeException wrappedWithTwoNested =
|
||||
new RuntimeException("wrapped", new IOException("nested", new IOException("root")));
|
||||
assertThat(SnapshotShardsService.summarizeFailure(wrappedWithTwoNested),
|
||||
is("RuntimeException[wrapped]; nested: IOException[nested]; nested: IOException[root]"));
|
||||
}
|
||||
|
||||
public void testEqualsAndHashcodeUpdateIndexShardSnapshotStatusRequest() {
|
||||
EqualsHashCodeTestUtils.checkEqualsAndHashCode(
|
||||
new SnapshotShardsService.UpdateIndexShardSnapshotStatusRequest(
|
||||
|
|
Loading…
Reference in New Issue