[ML] Wait for .ml-config primary before assigning persistent tasks (#44170)

Now that ML job configs are stored in an index rather than
cluster state, availability of the .ml-config index is very
important to the operation of ML.  When a cluster starts up
the ML persistent tasks will be considered for node
assignment very early on.  It is best in this case if
assignment is deferred until after the .ml-config index is
available.

The introduction of data frame analytics jobs has made this
problem worse.  Anomaly detection jobs already waited for
the primary shards of the .ml-state, .ml-anomalies-shared
and .ml-meta indices to be available before doing node
assignment, and by coincidence this would probably lead to
the primary shards of .ml-config also being searchable.
Data frame analytics jobs, by contrast, had no index checks
at all prior to this change.

This fixes problem 2 of #44156
This commit is contained in:
David Roberts 2019-07-11 11:42:03 +01:00
parent c0ed64bb92
commit 5886aefeed
5 changed files with 124 additions and 3 deletions

View File

@ -127,9 +127,11 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
static String[] indicesOfInterest(String resultsIndex) { static String[] indicesOfInterest(String resultsIndex) {
if (resultsIndex == null) { if (resultsIndex == null) {
return new String[]{AnomalyDetectorsIndex.jobStateIndexPattern(), MlMetaIndex.INDEX_NAME}; return new String[]{AnomalyDetectorsIndex.jobStateIndexPattern(), MlMetaIndex.INDEX_NAME,
AnomalyDetectorsIndex.configIndexName()};
} }
return new String[]{AnomalyDetectorsIndex.jobStateIndexPattern(), resultsIndex, MlMetaIndex.INDEX_NAME}; return new String[]{AnomalyDetectorsIndex.jobStateIndexPattern(), resultsIndex, MlMetaIndex.INDEX_NAME,
AnomalyDetectorsIndex.configIndexName()};
} }
static List<String> verifyIndicesPrimaryShardsAreActive(String resultsWriteIndex, ClusterState clusterState) { static List<String> verifyIndicesPrimaryShardsAreActive(String resultsWriteIndex, ClusterState clusterState) {

View File

@ -15,6 +15,7 @@ import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksRequest; import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksRequest;
import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksResponse; import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksResponse;
import org.elasticsearch.action.support.ActionFilters; import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.IndicesOptions;
import org.elasticsearch.action.support.master.AcknowledgedResponse; import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.action.support.master.TransportMasterNodeAction; import org.elasticsearch.action.support.master.TransportMasterNodeAction;
import org.elasticsearch.client.Client; import org.elasticsearch.client.Client;
@ -23,6 +24,7 @@ import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.block.ClusterBlockLevel; import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Nullable; import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.Inject;
@ -48,6 +50,7 @@ import org.elasticsearch.xpack.core.ml.action.StartDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState;
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.MachineLearning; import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.dataframe.DataFrameAnalyticsManager; import org.elasticsearch.xpack.ml.dataframe.DataFrameAnalyticsManager;
@ -59,6 +62,8 @@ import org.elasticsearch.xpack.ml.job.JobNodeSelector;
import org.elasticsearch.xpack.ml.process.MlMemoryTracker; import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
@ -374,6 +379,19 @@ public class TransportStartDataFrameAnalyticsAction
} }
} }
/**
 * Reports which of the requested indices do not currently have all of their primary shards active.
 *
 * The supplied names are resolved against the cluster state using
 * {@link IndicesOptions#lenientExpandOpen()}, and only the concrete indices returned by that
 * resolution are inspected.
 *
 * @param clusterState the cluster state providing both the index resolution and the routing table
 * @param indexNames   the index names or expressions to check
 * @return the concrete index names that either have no routing table entry or have at least one
 *         primary shard that is not active; empty if every resolved index is fully available
 */
static List<String> verifyIndicesPrimaryShardsAreActive(ClusterState clusterState, String... indexNames) {
    String[] resolvedNames =
        new IndexNameExpressionResolver().concreteIndexNames(clusterState, IndicesOptions.lenientExpandOpen(), indexNames);
    List<String> notReady = new ArrayList<>(resolvedNames.length);
    for (String resolvedName : resolvedNames) {
        IndexRoutingTable indexRouting = clusterState.getRoutingTable().index(resolvedName);
        // An index counts as ready only when it has a routing table entry and every primary is active
        boolean ready = indexRouting != null && indexRouting.allPrimaryShardsActive();
        if (ready) {
            continue;
        }
        notReady.add(resolvedName);
    }
    return notReady;
}
public static class TaskExecutor extends PersistentTasksExecutor<StartDataFrameAnalyticsAction.TaskParams> { public static class TaskExecutor extends PersistentTasksExecutor<StartDataFrameAnalyticsAction.TaskParams> {
private final Client client; private final Client client;
@ -421,11 +439,20 @@ public class TransportStartDataFrameAnalyticsAction
String id = params.getId(); String id = params.getId();
List<String> unavailableIndices = verifyIndicesPrimaryShardsAreActive(clusterState, AnomalyDetectorsIndex.configIndexName());
if (unavailableIndices.size() != 0) {
String reason = "Not opening data frame analytics job [" + id +
"], because not all primary shards are active for the following indices [" + String.join(",", unavailableIndices) + "]";
LOGGER.debug(reason);
return new PersistentTasksCustomMetaData.Assignment(null, reason);
}
boolean isMemoryTrackerRecentlyRefreshed = memoryTracker.isRecentlyRefreshed(); boolean isMemoryTrackerRecentlyRefreshed = memoryTracker.isRecentlyRefreshed();
if (isMemoryTrackerRecentlyRefreshed == false) { if (isMemoryTrackerRecentlyRefreshed == false) {
boolean scheduledRefresh = memoryTracker.asyncRefresh(); boolean scheduledRefresh = memoryTracker.asyncRefresh();
if (scheduledRefresh) { if (scheduledRefresh) {
String reason = "Not opening job [" + id + "] because job memory requirements are stale - refresh requested"; String reason = "Not opening data frame analytics job [" + id +
"] because job memory requirements are stale - refresh requested";
LOGGER.debug(reason); LOGGER.debug(reason);
return new PersistentTasksCustomMetaData.Assignment(null, reason); return new PersistentTasksCustomMetaData.Assignment(null, reason);
} }

View File

@ -435,6 +435,10 @@ public class MlMemoryTracker implements LocalNodeMasterListener {
}, e -> { }, e -> {
if (e instanceof ResourceNotFoundException) { if (e instanceof ResourceNotFoundException) {
// TODO: does this also happen if the .ml-config index exists but is unavailable? // TODO: does this also happen if the .ml-config index exists but is unavailable?
// However, note that we wait for the .ml-config index to be available earlier on in the
// job assignment process, so that scenario should be very rare, i.e. somebody has closed
// the .ml-config index (which would be unexpected and unsupported for an internal index)
// during the memory refresh.
logger.trace("[{}] anomaly detector job deleted during ML memory update", jobId); logger.trace("[{}] anomaly detector job deleted during ML memory update", jobId);
} else { } else {
logger.error("[" + jobId + "] failed to get anomaly detector job during ML memory update", e); logger.error("[" + jobId + "] failed to get anomaly detector job during ML memory update", e);

View File

@ -202,6 +202,7 @@ public class TransportOpenJobActionTests extends ESTestCase {
private void addIndices(MetaData.Builder metaData, RoutingTable.Builder routingTable) { private void addIndices(MetaData.Builder metaData, RoutingTable.Builder routingTable) {
List<String> indices = new ArrayList<>(); List<String> indices = new ArrayList<>();
indices.add(AnomalyDetectorsIndex.configIndexName());
indices.add(AnomalyDetectorsIndexFields.STATE_INDEX_PREFIX); indices.add(AnomalyDetectorsIndexFields.STATE_INDEX_PREFIX);
indices.add(MlMetaIndex.INDEX_NAME); indices.add(MlMetaIndex.INDEX_NAME);
indices.add(AuditorField.NOTIFICATIONS_INDEX); indices.add(AuditorField.NOTIFICATIONS_INDEX);

View File

@ -0,0 +1,87 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.action;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.AliasMetaData;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
import java.util.List;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.empty;
/**
 * Unit tests for the static helpers of {@link TransportStartDataFrameAnalyticsAction}.
 */
public class TransportStartDataFrameAnalyticsActionTests extends ESTestCase {

    /**
     * Checks that an index whose single primary shard is started is reported as available, and that
     * the same index is reported as unavailable once its routing table entry is removed or its
     * primary shard is only initializing (one of the two is picked at random).
     */
    public void testVerifyIndicesPrimaryShardsAreActive() {

        // At present the only critical index is the config index
        String indexName = AnomalyDetectorsIndex.configIndexName();

        // Index metadata: one shard, no replicas
        IndexMetaData.Builder indexMetaData = IndexMetaData.builder(indexName);
        indexMetaData.settings(Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
        );
        // NOTE(review): indexName is always the config index here, so this branch looks like it can
        // never be taken - confirm before removing it (its imports would become unused).
        if (indexName.equals(AnomalyDetectorsIndexFields.STATE_INDEX_PREFIX)) {
            indexMetaData.putAlias(new AliasMetaData.Builder(AnomalyDetectorsIndex.jobStateIndexWriteAlias()));
        }
        MetaData.Builder healthyMetaData = MetaData.builder();
        healthyMetaData.put(indexMetaData);

        // Routing table in which the primary shard has been started
        RoutingTable.Builder healthyRouting = RoutingTable.builder();
        addSinglePrimary(healthyRouting, indexName, true);

        ClusterState.Builder csBuilder = ClusterState.builder(new ClusterName("_name"));
        csBuilder.routingTable(healthyRouting.build());
        csBuilder.metaData(healthyMetaData);
        ClusterState cs = csBuilder.build();

        assertThat(TransportStartDataFrameAnalyticsAction.verifyIndicesPrimaryShardsAreActive(cs, indexName), empty());

        // Now degrade the routing table while keeping the index metadata in place
        MetaData.Builder degradedMetaData = new MetaData.Builder(cs.metaData());
        RoutingTable.Builder degradedRouting = new RoutingTable.Builder(cs.routingTable());
        if (randomBoolean()) {
            // No routing table entry at all for the index
            degradedRouting.remove(indexName);
        } else {
            // Primary shard assigned to a node but not yet started
            addSinglePrimary(degradedRouting, indexName, false);
        }
        csBuilder.routingTable(degradedRouting.build());
        csBuilder.metaData(degradedMetaData);

        List<String> result = TransportStartDataFrameAnalyticsAction.verifyIndicesPrimaryShardsAreActive(csBuilder.build(), indexName);
        assertThat(result, contains(indexName));
    }

    /**
     * Adds a routing table entry for {@code indexName} consisting of shard 0 with a single primary
     * that has been initialized on "node_id" and, if {@code started} is true, moved to started.
     */
    private static void addSinglePrimary(RoutingTable.Builder routingTable, String indexName, boolean started) {
        Index index = new Index(indexName, "_uuid");
        ShardId shardId = new ShardId(index, 0);
        ShardRouting primary = ShardRouting.newUnassigned(shardId, true, RecoverySource.EmptyStoreRecoverySource.INSTANCE,
            new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, ""));
        primary = primary.initialize("node_id", null, 0L);
        if (started) {
            primary = primary.moveToStarted();
        }
        routingTable.add(IndexRoutingTable.builder(index)
            .addIndexShard(new IndexShardRoutingTable.Builder(shardId).addShard(primary).build()));
    }
}