[Tests] Add debug information to CorruptedFileIT
This test failed but the cause is not obvious. This commit adds more debug logging traces so that if it reproduces we could gather more information. Related #30577
This commit is contained in:
parent
8a89306aaf
commit
7915b5f7aa
|
@ -1147,6 +1147,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
||||||
// TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should
|
// TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should
|
||||||
final Collection<String> fileNames;
|
final Collection<String> fileNames;
|
||||||
try {
|
try {
|
||||||
|
logger.trace("[{}] [{}] Loading store metadata using index commit [{}]", shardId, snapshotId, snapshotIndexCommit);
|
||||||
metadata = store.getMetadata(snapshotIndexCommit);
|
metadata = store.getMetadata(snapshotIndexCommit);
|
||||||
fileNames = snapshotIndexCommit.getFileNames();
|
fileNames = snapshotIndexCommit.getFileNames();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
@ -1242,9 +1243,6 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Snapshot individual file
|
* Snapshot individual file
|
||||||
* <p>
|
|
||||||
* This is asynchronous method. Upon completion of the operation latch is getting counted down and any failures are
|
|
||||||
* added to the {@code failures} list
|
|
||||||
*
|
*
|
||||||
* @param fileInfo file to be snapshotted
|
* @param fileInfo file to be snapshotted
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -470,7 +470,7 @@ public class CorruptedFileIT extends ESIntegTestCase {
|
||||||
* TODO once checksum verification on snapshotting is implemented this test needs to be fixed or split into several
|
* TODO once checksum verification on snapshotting is implemented this test needs to be fixed or split into several
|
||||||
* parts... We should also corrupt files on the actual snapshot and check that we don't restore the corrupted shard.
|
* parts... We should also corrupt files on the actual snapshot and check that we don't restore the corrupted shard.
|
||||||
*/
|
*/
|
||||||
@TestLogging("org.elasticsearch.monitor.fs:DEBUG")
|
@TestLogging("org.elasticsearch.repositories:TRACE,org.elasticsearch.snapshots:TRACE")
|
||||||
public void testCorruptFileThenSnapshotAndRestore() throws ExecutionException, InterruptedException, IOException {
|
public void testCorruptFileThenSnapshotAndRestore() throws ExecutionException, InterruptedException, IOException {
|
||||||
int numDocs = scaledRandomIntBetween(100, 1000);
|
int numDocs = scaledRandomIntBetween(100, 1000);
|
||||||
internalCluster().ensureAtLeastNumDataNodes(2);
|
internalCluster().ensureAtLeastNumDataNodes(2);
|
||||||
|
@ -494,6 +494,7 @@ public class CorruptedFileIT extends ESIntegTestCase {
|
||||||
assertHitCount(countResponse, numDocs);
|
assertHitCount(countResponse, numDocs);
|
||||||
|
|
||||||
ShardRouting shardRouting = corruptRandomPrimaryFile(false);
|
ShardRouting shardRouting = corruptRandomPrimaryFile(false);
|
||||||
|
logger.info("--> shard {} has a corrupted file", shardRouting);
|
||||||
// we don't corrupt segments.gen since S/R doesn't snapshot this file
|
// we don't corrupt segments.gen since S/R doesn't snapshot this file
|
||||||
// the other problem here why we can't corrupt segments.X files is that the snapshot flushes again before
|
// the other problem here why we can't corrupt segments.X files is that the snapshot flushes again before
|
||||||
// it snapshots and that will write a new segments.X+1 file
|
// it snapshots and that will write a new segments.X+1 file
|
||||||
|
@ -504,9 +505,12 @@ public class CorruptedFileIT extends ESIntegTestCase {
|
||||||
.put("compress", randomBoolean())
|
.put("compress", randomBoolean())
|
||||||
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
|
.put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
|
||||||
logger.info("--> snapshot");
|
logger.info("--> snapshot");
|
||||||
CreateSnapshotResponse createSnapshotResponse = client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(true).setIndices("test").get();
|
final CreateSnapshotResponse createSnapshotResponse = client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap")
|
||||||
assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.PARTIAL));
|
.setWaitForCompletion(true)
|
||||||
logger.info("failed during snapshot -- maybe SI file got corrupted");
|
.setIndices("test")
|
||||||
|
.get();
|
||||||
|
final SnapshotState snapshotState = createSnapshotResponse.getSnapshotInfo().state();
|
||||||
|
logger.info("--> snapshot terminated with state " + snapshotState);
|
||||||
final List<Path> files = listShardFiles(shardRouting);
|
final List<Path> files = listShardFiles(shardRouting);
|
||||||
Path corruptedFile = null;
|
Path corruptedFile = null;
|
||||||
for (Path file : files) {
|
for (Path file : files) {
|
||||||
|
@ -515,6 +519,11 @@ public class CorruptedFileIT extends ESIntegTestCase {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (snapshotState != SnapshotState.PARTIAL) {
|
||||||
|
logger.info("--> listing shard files for investigation");
|
||||||
|
files.forEach(f -> logger.info("path: {}", f.toAbsolutePath()));
|
||||||
|
}
|
||||||
|
assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.PARTIAL));
|
||||||
assertThat(corruptedFile, notNullValue());
|
assertThat(corruptedFile, notNullValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue