From 2768662822aff1f4d0dfa34b606c975ac8ef3bc7 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 11 Jul 2019 19:35:15 +0200 Subject: [PATCH] Cleanup Stale Root Level Blobs in Sn. Repository (#43542) (#44226) * Cleans up all root level temp., snap-%s.dat, meta-%s.dat blobs that aren't referenced by any snapshot to deal with dangling blobs left behind by delete and snapshot finalization failures * The scenario that get's us here is a snapshot failing before it was finalized or a delete failing right after it wrote the updated index-(N+1) that doesn't reference a snapshot anymore but then fails to remove that snapshot * Not deleting other dangling blobs since that don't follow the snap-, meta- or tempfile naming schemes to not accidentally delete blobs not created by the snapshot logic * Follow up to #42189 * Same safety logic, get list of all blobs before writing index-N blobs, delete things after index-N blobs was written --- .../blobstore/BlobStoreRepository.java | 51 ++++++++++++++++++- .../AbstractThirdPartyRepositoryTestCase.java | 6 +++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 2febef58d54..f5d17eca036 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -58,6 +58,7 @@ import org.elasticsearch.common.metrics.CounterMetric; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentFactory; @@ -100,6 +101,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -433,9 +435,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp return; } final SnapshotInfo finalSnapshotInfo = snapshot; + final List snapMetaFilesToDelete = + Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID())); try { - blobContainer().deleteBlobsIgnoringIfNotExists( - Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID()))); + blobContainer().deleteBlobsIgnoringIfNotExists(snapMetaFilesToDelete); } catch (IOException e) { logger.warn(() -> new ParameterizedMessage("[{}] Unable to delete global metadata files", snapshotId), e); } @@ -448,12 +451,56 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp snapshotId, ActionListener.map(listener, v -> { cleanupStaleIndices(foundIndices, survivingIndices); + cleanupStaleRootFiles(Sets.difference(rootBlobs, new HashSet<>(snapMetaFilesToDelete)), updatedRepositoryData); return null; }) ); } } + private void cleanupStaleRootFiles(Set rootBlobNames, RepositoryData repositoryData) { + final Set allSnapshotIds = + repositoryData.getAllSnapshotIds().stream().map(SnapshotId::getUUID).collect(Collectors.toSet()); + final List blobsToDelete = rootBlobNames.stream().filter( + blob -> { + if (FsBlobContainer.isTempBlobName(blob)) { + return true; + } + if (blob.endsWith(".dat")) { + final String foundUUID; + if (blob.startsWith(SNAPSHOT_PREFIX)) { + foundUUID = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length()); + assert snapshotFormat.blobName(foundUUID).equals(blob); + } else if (blob.startsWith(METADATA_PREFIX)) { + foundUUID = blob.substring(METADATA_PREFIX.length(), blob.length() - ".dat".length()); + assert globalMetaDataFormat.blobName(foundUUID).equals(blob); + } else { + return false; + } + return allSnapshotIds.contains(foundUUID) == false; + } + return false; + } + ).collect(Collectors.toList()); + if (blobsToDelete.isEmpty()) { + return; + } + try { + logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), blobsToDelete); + blobContainer().deleteBlobsIgnoringIfNotExists(blobsToDelete); + } catch (IOException e) { + logger.warn(() -> new ParameterizedMessage( + "[{}] The following blobs are no longer part of any snapshot [{}] but failed to remove them", + metadata.name(), blobsToDelete), e); + } catch (Exception e) { + // TODO: We shouldn't be blanket catching and suppressing all exceptions here and instead handle them safely upstream. + // Currently this catch exists as a stop gap solution to tackle unexpected runtime exceptions from implementations + // bubbling up and breaking the snapshot functionality. + assert false : e; + logger.warn(new ParameterizedMessage("[{}] Exception during cleanup of root level blobs", metadata.name()), e); + } + } + private void cleanupStaleIndices(Map foundIndices, Map survivingIndices) { try { final Set survivingIndexIds = survivingIndices.values().stream() diff --git a/test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java index e32c3945673..62d2e50cef8 100644 --- a/test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java @@ -236,6 +236,10 @@ public abstract class AbstractThirdPartyRepositoryTestCase extends ESSingleNodeT final BlobStore blobStore = repo.blobStore(); blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")) .writeBlob("bar", new ByteArrayInputStream(new byte[0]), 0, false); + for (String prefix : Arrays.asList("snap-", "meta-")) { + blobStore.blobContainer(BlobPath.cleanPath()) + .writeBlob(prefix + "foo.dat", new ByteArrayInputStream(new byte[0]), 0, false); + } future.onResponse(null); } }); @@ -256,6 +260,8 @@ public abstract class AbstractThirdPartyRepositoryTestCase extends ESSingleNodeT future.onResponse( blobStore.blobContainer(BlobPath.cleanPath().add("indices")).children().containsKey("foo") && blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")).blobExists("bar") + && blobStore.blobContainer(BlobPath.cleanPath()).blobExists("meta-foo.dat") + && blobStore.blobContainer(BlobPath.cleanPath()).blobExists("snap-foo.dat") ); } });