Cleanup Stale Root Level Blobs in Snapshot Repository (#43542) (#44226)

* Cleans up all root level temp., snap-%s.dat, meta-%s.dat blobs that aren't referenced by any snapshot to deal with dangling blobs left behind by delete and snapshot finalization failures
   * The scenario that gets us here is a snapshot failing before it was finalized, or a delete failing right after it wrote the updated index-(N+1) that no longer references a snapshot but before it removed that snapshot's blobs
   * Not deleting other dangling blobs that don't follow the snap-, meta- or tempfile naming schemes, so as not to accidentally delete blobs not created by the snapshot logic
* Follow up to #42189
  * Same safety logic: get the list of all blobs before writing the index-N blob, delete things after the index-N blob was written
This commit is contained in:
Armin Braun 2019-07-11 19:35:15 +02:00 committed by GitHub
parent e9f9f00940
commit 2768662822
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 2 deletions

View File

@ -58,6 +58,7 @@ import org.elasticsearch.common.metrics.CounterMetric;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentFactory;
@ -100,6 +101,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@ -433,9 +435,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
return;
}
final SnapshotInfo finalSnapshotInfo = snapshot;
final List<String> snapMetaFilesToDelete =
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID()));
try {
blobContainer().deleteBlobsIgnoringIfNotExists(
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID())));
blobContainer().deleteBlobsIgnoringIfNotExists(snapMetaFilesToDelete);
} catch (IOException e) {
logger.warn(() -> new ParameterizedMessage("[{}] Unable to delete global metadata files", snapshotId), e);
}
@ -448,12 +451,56 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
snapshotId,
ActionListener.map(listener, v -> {
cleanupStaleIndices(foundIndices, survivingIndices);
cleanupStaleRootFiles(Sets.difference(rootBlobs, new HashSet<>(snapMetaFilesToDelete)), updatedRepositoryData);
return null;
})
);
}
}
/**
 * Removes stale root-level blobs from the repository, i.e. temporary blobs as well as
 * snap-{uuid}.dat and meta-{uuid}.dat blobs that are not referenced by any snapshot in
 * the given {@link RepositoryData}. Such blobs are left behind by snapshot finalization
 * or delete failures. Failures during the cleanup itself are logged but never rethrown
 * so they cannot break the surrounding snapshot operation.
 *
 * @param rootBlobNames  names of all blobs found at the repository root (listed before
 *                       the updated index-N blob was written, for safety)
 * @param repositoryData the updated repository data determining which snapshots survive
 */
private void cleanupStaleRootFiles(Set<String> rootBlobNames, RepositoryData repositoryData) {
    final Set<String> allSnapshotIds =
        repositoryData.getAllSnapshotIds().stream().map(SnapshotId::getUUID).collect(Collectors.toSet());
    final List<String> blobsToDelete =
        rootBlobNames.stream().filter(blob -> isStaleRootBlob(blob, allSnapshotIds)).collect(Collectors.toList());
    if (blobsToDelete.isEmpty()) {
        return;
    }
    try {
        logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), blobsToDelete);
        blobContainer().deleteBlobsIgnoringIfNotExists(blobsToDelete);
    } catch (IOException e) {
        logger.warn(() -> new ParameterizedMessage(
            "[{}] The following blobs are no longer part of any snapshot [{}] but failed to remove them",
            metadata.name(), blobsToDelete), e);
    } catch (Exception e) {
        // TODO: We shouldn't be blanket catching and suppressing all exceptions here and instead handle them safely upstream.
        // Currently this catch exists as a stop gap solution to tackle unexpected runtime exceptions from implementations
        // bubbling up and breaking the snapshot functionality.
        assert false : e;
        logger.warn(new ParameterizedMessage("[{}] Exception during cleanup of root level blobs", metadata.name()), e);
    }
}

/**
 * Determines whether a root-level blob is stale, i.e. safe to delete because no snapshot
 * references it. Only blobs that follow the temp-, snap- or meta- naming schemes are ever
 * considered stale, so blobs not created by the snapshot logic are never deleted.
 *
 * @param blob           name of the root-level blob
 * @param allSnapshotIds UUIDs of all snapshots referenced by the repository data
 * @return {@code true} if the blob is stale and may be deleted
 */
private boolean isStaleRootBlob(String blob, Set<String> allSnapshotIds) {
    // Temporary blobs are always leftovers of failed writes and safe to remove.
    if (FsBlobContainer.isTempBlobName(blob)) {
        return true;
    }
    if (blob.endsWith(".dat") == false) {
        // Unknown naming scheme: never delete blobs we did not create ourselves.
        return false;
    }
    final String foundUUID;
    if (blob.startsWith(SNAPSHOT_PREFIX)) {
        foundUUID = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length());
        assert snapshotFormat.blobName(foundUUID).equals(blob);
    } else if (blob.startsWith(METADATA_PREFIX)) {
        foundUUID = blob.substring(METADATA_PREFIX.length(), blob.length() - ".dat".length());
        assert globalMetaDataFormat.blobName(foundUUID).equals(blob);
    } else {
        return false;
    }
    // Stale iff the extracted snapshot UUID is not referenced by any surviving snapshot.
    return allSnapshotIds.contains(foundUUID) == false;
}
private void cleanupStaleIndices(Map<String, BlobContainer> foundIndices, Map<String, IndexId> survivingIndices) {
try {
final Set<String> survivingIndexIds = survivingIndices.values().stream()

View File

@ -236,6 +236,10 @@ public abstract class AbstractThirdPartyRepositoryTestCase extends ESSingleNodeT
final BlobStore blobStore = repo.blobStore();
blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo"))
.writeBlob("bar", new ByteArrayInputStream(new byte[0]), 0, false);
for (String prefix : Arrays.asList("snap-", "meta-")) {
blobStore.blobContainer(BlobPath.cleanPath())
.writeBlob(prefix + "foo.dat", new ByteArrayInputStream(new byte[0]), 0, false);
}
future.onResponse(null);
}
});
@ -256,6 +260,8 @@ public abstract class AbstractThirdPartyRepositoryTestCase extends ESSingleNodeT
future.onResponse(
blobStore.blobContainer(BlobPath.cleanPath().add("indices")).children().containsKey("foo")
&& blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")).blobExists("bar")
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("meta-foo.dat")
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("snap-foo.dat")
);
}
});