[7.x][ML] Remove index.unassigned.node_left.delayed_timeout setting from M… (#51740) (#51764)

This setting was introduced with the purpose of reducing the time taken by
tests that shut nodes down, such as `MlDistributedFailureIT` and
`NetworkDisruptionIT`. However, it is unfortunate to have to set it
to an explicit value in production. In addition, and most importantly,
dynamically choosing the value for this setting makes it impossible to adopt static index template configs
that we register via `IndexTemplateRegistry`, which we need to use in order to start
registering ILM policies for the ML indices.

This commit removes this setting from our templates. I ran the tests a few times and could
not see execution time differing significantly.

Backport of #51740
This commit is contained in:
Dimitris Athanasiou 2020-01-31 20:28:29 +02:00 committed by GitHub
parent 5ca51562ec
commit 55b5c8f703
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 24 additions and 16 deletions

View File

@ -22,7 +22,6 @@ import org.elasticsearch.cluster.metadata.IndexTemplateMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Module;
@ -894,13 +893,6 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
@Override
public UnaryOperator<Map<String, IndexTemplateMetaData>> getIndexTemplateMetaDataUpgrader() {
return templates -> {
final TimeValue delayedNodeTimeOutSetting;
// Whether we are using native process is a good way to detect whether we are in dev / test mode:
if (MachineLearningField.AUTODETECT_PROCESS.get(settings)) {
delayedNodeTimeOutSetting = UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(settings);
} else {
delayedNodeTimeOutSetting = TimeValue.timeValueNanos(0);
}
try (XContentBuilder auditMapping = ElasticsearchMappings.auditMessageMapping()) {
IndexTemplateMetaData notificationMessageTemplate =
@ -912,8 +904,7 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
// Our indexes are small and one shard puts the
// least possible burden on Elasticsearch
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1")
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), delayedNodeTimeOutSetting))
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1"))
.build();
templates.put(AuditorField.NOTIFICATIONS_INDEX, notificationMessageTemplate);
} catch (IOException e) {
@ -928,8 +919,7 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
// Our indexes are small and one shard puts the
// least possible burden on Elasticsearch
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1")
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), delayedNodeTimeOutSetting))
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1"))
.version(Version.CURRENT.id)
.putMapping(SINGLE_MAPPING_NAME, Strings.toString(docMapping))
.build();
@ -947,7 +937,6 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
// least possible burden on Elasticsearch
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1")
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), delayedNodeTimeOutSetting)
.put(IndexSettings.MAX_RESULT_WINDOW_SETTING.getKey(),
AnomalyDetectorsIndex.CONFIG_INDEX_MAX_RESULTS_WINDOW))
.version(Version.CURRENT.id)
@ -964,8 +953,7 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
.patterns(Collections.singletonList(AnomalyDetectorsIndex.jobStateIndexPattern()))
// TODO review these settings
.settings(Settings.builder()
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1")
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), delayedNodeTimeOutSetting))
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1"))
.putMapping(SINGLE_MAPPING_NAME, Strings.toString(stateMapping))
.version(Version.CURRENT.id)
.build();
@ -981,7 +969,6 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin, Analys
.patterns(Collections.singletonList(AnomalyDetectorsIndex.jobResultsIndexPrefix() + "*"))
.settings(Settings.builder()
.put(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "0-1")
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), delayedNodeTimeOutSetting)
// Sacrifice durability for performance: in the event of power
// failure we can lose the last 5 seconds of changes, but it's
// much faster

View File

@ -68,6 +68,8 @@ public class BasicDistributedJobsIT extends BaseMlIntegTestCase {
client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
awaitJobOpenedAndAssigned(job.getId(), null);
setMlIndicesDelayedNodeLeftTimeoutToZero();
ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
internalCluster().stopRandomDataNode();
ensureStableCluster(3);
@ -109,6 +111,9 @@ public class BasicDistributedJobsIT extends BaseMlIntegTestCase {
OpenJobAction.Request openJobRequest = new OpenJobAction.Request(job.getId());
client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
awaitJobOpenedAndAssigned(job.getId(), null);
setMlIndicesDelayedNodeLeftTimeoutToZero();
StartDatafeedAction.Request startDataFeedRequest = new StartDatafeedAction.Request(config.getId(), 0L);
client().execute(StartDatafeedAction.INSTANCE, startDataFeedRequest);

View File

@ -418,6 +418,8 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
assertEquals(JobState.OPENED, statsResponse.getResponse().results().get(0).getState());
}, 20, TimeUnit.SECONDS);
setMlIndicesDelayedNodeLeftTimeoutToZero();
StartDatafeedAction.Request startDatafeedRequest = new StartDatafeedAction.Request(config.getId(), 0L);
client().execute(StartDatafeedAction.INSTANCE, startDatafeedRequest).get();
}

View File

@ -52,6 +52,9 @@ public class NetworkDisruptionIT extends BaseMlIntegTestCase {
OpenJobAction.Request openJobRequest = new OpenJobAction.Request(job.getId());
AcknowledgedResponse openJobResponse = client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
assertTrue(openJobResponse.isAcknowledged());
setMlIndicesDelayedNodeLeftTimeoutToZero();
ensureGreen();
// Record which node the job starts off on

View File

@ -7,6 +7,7 @@ package org.elasticsearch.xpack.ml.support;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.admin.indices.recovery.RecoveryResponse;
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
@ -17,6 +18,7 @@ import org.elasticsearch.analysis.common.CommonAnalysisPlugin;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
@ -402,4 +404,13 @@ public abstract class BaseMlIntegTestCase extends ESIntegTestCase {
});
return jobNode.get();
}
/**
 * Sets the delayed node-left timeout to 0 on all indices matching {@code .ml-*},
 * so that tests are not held up by delayed allocation.
 */
protected void setMlIndicesDelayedNodeLeftTimeoutToZero() {
    final String delayedTimeoutKey = UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey();
    final Settings zeroDelaySettings = Settings.builder().put(delayedTimeoutKey, 0).build();
    final UpdateSettingsRequest updateRequest = new UpdateSettingsRequest(".ml-*").settings(zeroDelaySettings);
    // Block until the settings update has been applied before the test continues.
    client().admin().indices().updateSettings(updateRequest).actionGet();
}
}