[Tests] Add debug information to CorruptedFileIT

This test failed but the cause is not obvious. This commit adds more
debug logging traces so that if it reproduces we could gather more
information.

Related #30577
This commit is contained in:
Tanguy Leroux 2018-05-17 10:57:25 +02:00
parent 8a89306aaf
commit 7915b5f7aa
2 changed files with 14 additions and 7 deletions

View File

@ -1147,6 +1147,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
// TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should // TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should
final Collection<String> fileNames; final Collection<String> fileNames;
try { try {
logger.trace("[{}] [{}] Loading store metadata using index commit [{}]", shardId, snapshotId, snapshotIndexCommit);
metadata = store.getMetadata(snapshotIndexCommit); metadata = store.getMetadata(snapshotIndexCommit);
fileNames = snapshotIndexCommit.getFileNames(); fileNames = snapshotIndexCommit.getFileNames();
} catch (IOException e) { } catch (IOException e) {
@ -1242,9 +1243,6 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
/** /**
* Snapshot individual file * Snapshot individual file
* <p>
* This is asynchronous method. Upon completion of the operation latch is getting counted down and any failures are
* added to the {@code failures} list
* *
* @param fileInfo file to be snapshotted * @param fileInfo file to be snapshotted
*/ */

View File

@ -470,7 +470,7 @@ public class CorruptedFileIT extends ESIntegTestCase {
* TODO once checksum verification on snapshotting is implemented this test needs to be fixed or split into several * TODO once checksum verification on snapshotting is implemented this test needs to be fixed or split into several
* parts... We should also corrupt files on the actual snapshot and check that we don't restore the corrupted shard. * parts... We should also corrupt files on the actual snapshot and check that we don't restore the corrupted shard.
*/ */
@TestLogging("org.elasticsearch.monitor.fs:DEBUG") @TestLogging("org.elasticsearch.repositories:TRACE,org.elasticsearch.snapshots:TRACE")
public void testCorruptFileThenSnapshotAndRestore() throws ExecutionException, InterruptedException, IOException { public void testCorruptFileThenSnapshotAndRestore() throws ExecutionException, InterruptedException, IOException {
int numDocs = scaledRandomIntBetween(100, 1000); int numDocs = scaledRandomIntBetween(100, 1000);
internalCluster().ensureAtLeastNumDataNodes(2); internalCluster().ensureAtLeastNumDataNodes(2);
@ -494,6 +494,7 @@ public class CorruptedFileIT extends ESIntegTestCase {
assertHitCount(countResponse, numDocs); assertHitCount(countResponse, numDocs);
ShardRouting shardRouting = corruptRandomPrimaryFile(false); ShardRouting shardRouting = corruptRandomPrimaryFile(false);
logger.info("--> shard {} has a corrupted file", shardRouting);
// we don't corrupt segments.gen since S/R doesn't snapshot this file // we don't corrupt segments.gen since S/R doesn't snapshot this file
// the other problem here why we can't corrupt segments.X files is that the snapshot flushes again before // the other problem here why we can't corrupt segments.X files is that the snapshot flushes again before
// it snapshots and that will write a new segments.X+1 file // it snapshots and that will write a new segments.X+1 file
@ -504,9 +505,12 @@ public class CorruptedFileIT extends ESIntegTestCase {
.put("compress", randomBoolean()) .put("compress", randomBoolean())
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES))); .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
logger.info("--> snapshot"); logger.info("--> snapshot");
CreateSnapshotResponse createSnapshotResponse = client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(true).setIndices("test").get(); final CreateSnapshotResponse createSnapshotResponse = client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap")
assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.PARTIAL)); .setWaitForCompletion(true)
logger.info("failed during snapshot -- maybe SI file got corrupted"); .setIndices("test")
.get();
final SnapshotState snapshotState = createSnapshotResponse.getSnapshotInfo().state();
logger.info("--> snapshot terminated with state " + snapshotState);
final List<Path> files = listShardFiles(shardRouting); final List<Path> files = listShardFiles(shardRouting);
Path corruptedFile = null; Path corruptedFile = null;
for (Path file : files) { for (Path file : files) {
@ -515,6 +519,11 @@ public class CorruptedFileIT extends ESIntegTestCase {
break; break;
} }
} }
if (snapshotState != SnapshotState.PARTIAL) {
logger.info("--> listing shard files for investigation");
files.forEach(f -> logger.info("path: {}", f.toAbsolutePath()));
}
assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.PARTIAL));
assertThat(corruptedFile, notNullValue()); assertThat(corruptedFile, notNullValue());
} }