From 79d0c4ed1830f63fe82109fe4ec316acac2b9f52 Mon Sep 17 00:00:00 2001 From: Andrei Dan Date: Mon, 21 Sep 2020 20:40:01 +0100 Subject: [PATCH] ILM: allow check-migration step to continue if tier setting unset (#62636) (#62724) This allows the `check-migration` step to move past the allocation check if the tier routing settings are manually unset. This helps a user unblock ILM in case a tier is removed (ie. if the warm tier is decommissioned this will allow users to resume the ILM policies stuck in `check-migration` waiting for the warm nodes to become available and the managed index to allocate. this allows the index to allocate on the other available tiers) (cherry picked from commit d7a1eaa7f51d0972d10c0df1d3cd77d6b755dd41) Signed-off-by: Andrei Dan --- .../core/ilm/DataTierMigrationRoutedStep.java | 22 ++++- .../ilm/DataTierMigrationRoutedStepTests.java | 56 ++++++++++- .../xpack/ilm/DataTiersMigrationsTests.java | 93 +++++++++++++++++++ 3 files changed, 162 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java index 7d451c042c0..1120a72c872 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java @@ -13,6 +13,7 @@ import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; @@ -25,8 +26,9 @@ import java.util.Locale; import java.util.Set; import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_ROLE; -import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING; +import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER_SETTING; import static org.elasticsearch.xpack.core.ilm.AllocationRoutedStep.getPendingAllocations; +import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo; /** * Checks whether all shards have been correctly routed in response to updating the allocation rules for an index in order @@ -70,11 +72,23 @@ public class DataTierMigrationRoutedStep extends ClusterStateWaitStep { logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists", getKey().getAction(), index.getName()); return new Result(false, null); } - String destinationTier = INDEX_ROUTING_INCLUDE_SETTING.get(idxMeta.getSettings()); + String destinationTier = INDEX_ROUTING_PREFER_SETTING.get(idxMeta.getSettings()); if (ActiveShardCount.ALL.enoughShardsActive(clusterState, index.getName()) == false) { - logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active", + if (Strings.isEmpty(destinationTier)) { + logger.debug("[{}] lifecycle action for index [{}] cannot make progress because not all shards are active", + getKey().getAction(), index.getName()); + } else { + logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active", getKey().getAction(), index.getName(), destinationTier); - return new Result(false, AllocationInfo.waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas())); + } + return new Result(false, waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas())); + } + + if (Strings.isEmpty(destinationTier)) { + logger.debug("index [{}] has no data tier routing setting configured and all its shards are active. considering the [{}] " + + "step condition met and continuing to the next step", index.getName(), getKey().getName()); + // the user removed the tier routing setting and all the shards are active so we'll cary on + return new Result(true, null); } int allocationPendingAllShards = getPendingAllocations(index, ALLOCATION_DECIDERS, clusterState); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java index 13ae30adba4..6518706544a 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java @@ -30,7 +30,7 @@ import java.util.Collections; import java.util.Set; import static java.util.Collections.emptyMap; -import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING; +import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER; import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.nullValue; @@ -95,7 +95,7 @@ public class DataTierMigrationRoutedStepTests extends AbstractStepTestCase roles) { return new DiscoveryNode(nodeId, buildNewFakeTransportAddress(), emptyMap(), roles, Version.CURRENT); } diff --git a/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java b/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java index 55429decaa3..1469a1a631e 100644 --- a/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java +++ b/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java @@ -6,11 +6,16 @@ package org.elasticsearch.xpack.ilm; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainResponse; import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; +import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; +import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider; import org.elasticsearch.xpack.core.DataTier; import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin; import org.elasticsearch.xpack.core.XPackSettings; @@ -29,6 +34,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Locale; +import java.util.concurrent.TimeUnit; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; @@ -153,4 +159,91 @@ public class DataTiersMigrationsTests extends ESIntegTestCase { assertThat(indexLifecycleExplainResponse.getStep(), is("complete")); }); } + + public void testUserOptsOutOfTierMigration() throws Exception { + internalCluster().startMasterOnlyNodes(1, Settings.EMPTY); + logger.info("starting hot data node"); + internalCluster().startNode(hotNode(Settings.EMPTY)); + + Phase hotPhase = new Phase("hot", TimeValue.ZERO, Collections.emptyMap()); + Phase warmPhase = new Phase("warm", TimeValue.ZERO, Collections.emptyMap()); + Phase coldPhase = new Phase("cold", TimeValue.ZERO, Collections.emptyMap()); + LifecyclePolicy lifecyclePolicy = new LifecyclePolicy( + policy, org.elasticsearch.common.collect.Map.of("hot", hotPhase, "warm", warmPhase, "cold", coldPhase) + ); + PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy); + PutLifecycleAction.Response putLifecycleResponse = client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get(); + assertAcked(putLifecycleResponse); + + Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 1).put(LifecycleSettings.LIFECYCLE_NAME, policy).build(); + CreateIndexResponse res = client().admin().indices().prepareCreate(managedIndex).setSettings(settings).get(); + assertTrue(res.isAcknowledged()); + + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("warm")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + }); + + Settings removeTierRoutingSetting = Settings.builder().putNull(DataTierAllocationDecider.INDEX_ROUTING_PREFER).build(); + UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(managedIndex).settings(removeTierRoutingSetting); + assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet()); + + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("warm")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + assertReplicaIsUnassigned(); + }, 30, TimeUnit.SECONDS); + + internalCluster().startNode(coldNode(Settings.EMPTY)); + + // the index should successfully allocate + ensureGreen(managedIndex); + + // the index is successfully allocated but the migrate action from the cold phase re-configured the tier migration setting to the + // cold tier so ILM is stuck in `check-migration` in the cold phase this time + // we have 2 options to resume the ILM execution: + // 1. start another cold node so both the primary and replica can relocate to the cold nodes + // 2. remove the tier routing setting from the index again (we're doing this below) + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("cold")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + }); + + // remove the tier routing setting again + assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet()); + + // wait for lifecycle to complete in the cold phase + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("cold")); + assertThat(indexLifecycleExplainResponse.getStep(), is("complete")); + }, 30, TimeUnit.SECONDS); + } + + private void assertReplicaIsUnassigned() { + ClusterAllocationExplainRequest explainReplicaShard = + new ClusterAllocationExplainRequest().setIndex(managedIndex).setPrimary(false).setShard(0); + ClusterAllocationExplainResponse response = client().admin().cluster().allocationExplain(explainReplicaShard).actionGet(); + assertThat(response.getExplanation().getShardState(), is(ShardRoutingState.UNASSIGNED)); + } }