From 287e334ef30d22449605e1e94d5ea5d4e6be30a2 Mon Sep 17 00:00:00 2001 From: Andrey Ershov Date: Mon, 1 Apr 2019 11:49:05 +0300 Subject: [PATCH] Do not perform cleanup if Manifest write fails with dirty exception (#40519) Currently, if Manifest write is unsuccessful (i.e. WriteStateException is thrown) we perform cleanup of newly created metadata files. However, this is wrong. Consider the following sequence (caught by CI here https://github.com/elastic/elasticsearch/issues/39077): - cluster global data is written **successful** - the associated manifest write **fails** (during the fsync, ie files have been written) - deleting (revert) the manifest files, **fails**, metadata is therefore persisted - deleting (revert) the cluster global data is **successful** In this case, when trying to load metadata (after node restart because of dirty WriteStateException), the following exception will happen ``` java.io.IOException: failed to find global metadata [generation: 0] ``` because the manifest file is referencing missing global metadata file. This commit checks if thrown WriteStateException is dirty and if its we don't perform any cleanup, because new Manifest file might be created, but its deletion has failed. In the future, we might add more fine-grained check - perform the clean up if WriteStateException is dirty, but Manifest deletion is successful. Closes https://github.com/elastic/elasticsearch/issues/39077 (cherry picked from commit 1fac56916bb3c4f3333c639e59188dbe743e385b) --- .../java/org/elasticsearch/gateway/GatewayMetaState.java | 9 ++++++++- .../org/elasticsearch/gateway/GatewayMetaStateTests.java | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java index bd6fd908d49..30361fa70ee 100644 --- a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java +++ b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java @@ -320,7 +320,14 @@ public class GatewayMetaState implements ClusterStateApplier, CoordinationState. finished = true; return generation; } catch (WriteStateException e) { - rollback(); + // if Manifest write results in dirty WriteStateException it's not safe to remove + // new metadata files, because if Manifest was actually written to disk and its deletion + // fails it will reference these new metadata files. + // In the future, we might decide to add more fine grained check to understand if after + // WriteStateException Manifest deletion has actually failed. + if (e.isDirty() == false) { + rollback(); + } throw e; } } diff --git a/server/src/test/java/org/elasticsearch/gateway/GatewayMetaStateTests.java b/server/src/test/java/org/elasticsearch/gateway/GatewayMetaStateTests.java index 1f4e0bafe4a..22259b919ec 100644 --- a/server/src/test/java/org/elasticsearch/gateway/GatewayMetaStateTests.java +++ b/server/src/test/java/org/elasticsearch/gateway/GatewayMetaStateTests.java @@ -374,7 +374,6 @@ public class GatewayMetaStateTests extends ESAllocationTestCase { return builder.build(); } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/39077") public void testAtomicityWithFailures() throws IOException { try (NodeEnvironment env = newNodeEnvironment()) { MetaStateServiceWithFailures metaStateService =