From 44fdf2020aa762d985f739cb8f543860b08ec80b Mon Sep 17 00:00:00 2001
From: Nhat Nguyen
Date: Wed, 2 Oct 2019 17:30:55 -0400
Subject: [PATCH] Always flush in FullClusterRestartIT#testRecovery (#47465)

The pattern in the latest failure is similar to the one fixed in #46956,
but relates to synced flush. If peer recovery happens after indexing, and
indexing flushes some shard at the end, then a synced flush in the test
will not roll or commit the translog.

Closes #46712
---
 .../upgrades/FullClusterRestartIT.java        | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/qa/full-cluster-restart/src/test/java/org/elasticsearch/upgrades/FullClusterRestartIT.java b/qa/full-cluster-restart/src/test/java/org/elasticsearch/upgrades/FullClusterRestartIT.java
index 0c85134fb78..4980c5ea59c 100644
--- a/qa/full-cluster-restart/src/test/java/org/elasticsearch/upgrades/FullClusterRestartIT.java
+++ b/qa/full-cluster-restart/src/test/java/org/elasticsearch/upgrades/FullClusterRestartIT.java
@@ -733,28 +733,21 @@ public class FullClusterRestartIT extends AbstractFullClusterRestartTestCase {
 
         // make sure all recoveries are done
         ensureGreen(index);
-        // Recovering a synced-flush index from 5.x to 6.x might be subtle as a 5.x index commit does not have all 6.x commit tags.
+
+        // Force flush so we're sure that all translog are committed
+        Request flushRequest = new Request("POST", "/" + index + "/_flush");
+        flushRequest.addParameter("force", "true");
+        flushRequest.addParameter("wait_if_ongoing", "true");
+        assertOK(client().performRequest(flushRequest));
+
         if (randomBoolean()) {
-            // needs to call a replication action to sync the global checkpoint from primaries to replication.
-            assertOK(client().performRequest(new Request("POST", "/" + index + "/_refresh")));
-            // We have to spin synced-flush requests here because we fire the global checkpoint sync for the last write operation.
-            // A synced-flush request considers the global checkpoint sync as an going operation because it acquires a shard permit.
-            assertBusy(() -> {
-                try {
-                    Response resp = client().performRequest(new Request("POST", index + "/_flush/synced"));
-                    Map<String, Object> result = ObjectPath.createFromResponse(resp).evaluate("_shards");
-                    assertThat(result.get("successful"), equalTo(result.get("total")));
-                    assertThat(result.get("failed"), equalTo(0));
-                } catch (ResponseException ex) {
-                    throw new AssertionError(ex); // cause assert busy to retry
-                }
-            });
-        } else {
-            // Explicitly flush so we're sure to have a bunch of documents in the Lucene index
-            Request flushRequest = new Request("POST", "/" + index + "/_flush");
-            flushRequest.addParameter("force", "true");
-            flushRequest.addParameter("wait_if_ongoing", "true");
-            assertOK(client().performRequest(flushRequest));
+            // We had a bug before where we failed to perform peer recovery with sync_id from 5.x to 6.x.
+            // We added this synced flush so we can exercise different paths of recovery code.
+            try {
+                client().performRequest(new Request("POST", index + "/_flush/synced"));
+            } catch (ResponseException ignored) {
+                // synced flush is optional here
+            }
         }
         if (shouldHaveTranslog) {
            // Update a few documents so we are sure to have a translog
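
For reference, below is a minimal sketch (not part of the patch) of the flush sequence the updated test performs, written against the Elasticsearch low-level REST client. The class name, method name, and the RestClient/index parameters are illustrative assumptions; only the REST endpoints and parameters come from the diff above.

// Illustrative sketch only, not part of the patch.
import java.io.IOException;

import org.elasticsearch.client.Request;
import org.elasticsearch.client.ResponseException;
import org.elasticsearch.client.RestClient;

public class RecoveryFlushSketch {

    static void flushBeforeRestart(RestClient client, String index) throws IOException {
        // Forced flush: rolls and commits the translog on every shard, even shards that
        // indexing already flushed, so the restarted cluster always sees a fresh commit.
        Request flush = new Request("POST", "/" + index + "/_flush");
        flush.addParameter("force", "true");
        flush.addParameter("wait_if_ongoing", "true");
        client.performRequest(flush);

        // Optional synced flush: only exercises the sync_id recovery path; a failure is
        // tolerable because the forced flush above has already committed the translog.
        try {
            client.performRequest(new Request("POST", "/" + index + "/_flush/synced"));
        } catch (ResponseException ignored) {
            // best effort, matching the test's behavior
        }
    }
}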