ML: update set_upgrade_mode, add logging (#38372) (#38538)

* ML: update set_upgrade_mode, add logging * Attempt to fix datafeed isolation Also renamed a few methods/variables for clarity and added some comments
2019-02-08 12:56:04 -06:00 · 2019-02-08 12:56:04 -06:00 · 24a8ea06f5
parent 9fd99f18a0
commit 24a8ea06f5
5 changed files with 67 additions and 19 deletions
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/messages/Messages.java
@ -79,6 +79,7 @@ public final class Messages {
    public static final String JOB_AUDIT_DATAFEED_STARTED_FROM_TO = "Datafeed started (from: {0} to: {1}) with frequency [{2}]";
    public static final String JOB_AUDIT_DATAFEED_STARTED_REALTIME = "Datafeed started in real-time";
    public static final String JOB_AUDIT_DATAFEED_STOPPED = "Datafeed stopped";
+    public static final String JOB_AUDIT_DATAFEED_ISOLATED = "Datafeed isolated";
    public static final String JOB_AUDIT_DELETING = "Deleting job by task with id ''{0}''";
    public static final String JOB_AUDIT_DELETING_FAILED = "Error deleting job: {0}";
    public static final String JOB_AUDIT_DELETED = "Job deleted";
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlLifeCycleService.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlLifeCycleService.java
@ -45,7 +45,7 @@ public class MlLifeCycleService {
                // datafeeds, so they get reallocated.  We have to do this first, otherwise the datafeeds
                // could fail if they send data to a dead autodetect process.
                if (datafeedManager != null) {
-                    datafeedManager.isolateAllDatafeedsOnThisNode();
+                    datafeedManager.isolateAllDatafeedsOnThisNodeBeforeShutdown();
                }
                NativeController nativeController = NativeControllerHolder.getNativeController(environment);
                if (nativeController != null) {
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportSetUpgradeModeAction.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportSetUpgradeModeAction.java
@ -263,6 +263,9 @@ public class TransportSetUpgradeModeAction extends TransportMasterNodeAction<Set
            .sorted(Comparator.comparing(PersistentTask::getTaskName))
            .collect(Collectors.toList());

+        logger.info("Un-assigning persistent tasks : " +
+            datafeedAndJobTasks.stream().map(PersistentTask::getId).collect(Collectors.joining(", ", "[ ", " ]")));
+
        TypedChainTaskExecutor<PersistentTask<?>> chainTaskExecutor =
            new TypedChainTaskExecutor<>(client.threadPool().executor(executor()),
                r -> true,
@ -287,6 +290,7 @@ public class TransportSetUpgradeModeAction extends TransportMasterNodeAction<Set
                                  ActionListener<List<IsolateDatafeedAction.Response>> listener) {
        Set<String> datafeedsToIsolate = MlTasks.startedDatafeedIds(tasksCustomMetaData);

+        logger.info("Isolating datafeeds: " + datafeedsToIsolate.toString());
        TypedChainTaskExecutor<IsolateDatafeedAction.Response> isolateDatafeedsExecutor =
            new TypedChainTaskExecutor<>(client.threadPool().executor(executor()), r -> true, ex -> true);

--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedManager.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/datafeed/DatafeedManager.java
@ -64,7 +64,6 @@ public class DatafeedManager {
    private final DatafeedJobBuilder datafeedJobBuilder;
    private final TaskRunner taskRunner = new TaskRunner();
    private final AutodetectProcessManager autodetectProcessManager;
-    private volatile boolean isolated;

    public DatafeedManager(ThreadPool threadPool, Client client, ClusterService clusterService, DatafeedJobBuilder datafeedJobBuilder,
                           Supplier<Long> currentTimeSupplier, Auditor auditor, AutodetectProcessManager autodetectProcessManager) {
@ -130,18 +129,20 @@ public class DatafeedManager {
     * This is used before the JVM is killed.  It differs from stopAllDatafeedsOnThisNode in that it leaves
     * the datafeed tasks in the "started" state, so that they get restarted on a different node.
     */
-    public void isolateAllDatafeedsOnThisNode() {
-        isolated = true;
+    public void isolateAllDatafeedsOnThisNodeBeforeShutdown() {
        Iterator<Holder> iter = runningDatafeedsOnThisNode.values().iterator();
        while (iter.hasNext()) {
            Holder next = iter.next();
            next.isolateDatafeed();
-            next.setRelocating();
+            // TODO: it's not ideal that this "isolate" method does something a bit different to the one below
+            next.setNodeIsShuttingDown();
            iter.remove();
        }
    }

    public void isolateDatafeed(long allocationId) {
+        // This calls get() rather than remove() because we expect that the persistent task will
+        // be removed shortly afterwards and that operation needs to be able to find the holder
        Holder holder = runningDatafeedsOnThisNode.get(allocationId);
        if (holder != null) {
            holder.isolateDatafeed();
@ -195,7 +196,7 @@ public class DatafeedManager {
                    holder.stop("general_lookback_failure", TimeValue.timeValueSeconds(20), e);
                    return;
                }
-                if (isolated == false) {
+                if (holder.isIsolated() == false) {
                    if (next != null) {
                        doDatafeedRealtime(next, holder.datafeedJob.getJobId(), holder);
                    } else {
@ -298,7 +299,7 @@ public class DatafeedManager {
        private final ProblemTracker problemTracker;
        private final Consumer<Exception> finishHandler;
        volatile Scheduler.Cancellable cancellable;
-        private volatile boolean isRelocating;
+        private volatile boolean isNodeShuttingDown;

        Holder(TransportStartDatafeedAction.DatafeedTask task, String datafeedId, DatafeedJob datafeedJob,
               ProblemTracker problemTracker, Consumer<Exception> finishHandler) {
@ -324,7 +325,7 @@ public class DatafeedManager {
        }

        public void stop(String source, TimeValue timeout, Exception e) {
-            if (isRelocating) {
+            if (isNodeShuttingDown) {
                return;
            }

@ -344,11 +345,12 @@ public class DatafeedManager {
                    if (cancellable != null) {
                        cancellable.cancel();
                    }
-                    auditor.info(datafeedJob.getJobId(), Messages.getMessage(Messages.JOB_AUDIT_DATAFEED_STOPPED));
+                    auditor.info(datafeedJob.getJobId(),
+                            Messages.getMessage(isIsolated() ? Messages.JOB_AUDIT_DATAFEED_ISOLATED : Messages.JOB_AUDIT_DATAFEED_STOPPED));
                    finishHandler.accept(e);
                    logger.info("[{}] datafeed [{}] for job [{}] has been stopped{}", source, datafeedId, datafeedJob.getJobId(),
                            acquired ? "" : ", but there may be pending tasks as the timeout [" + timeout.getStringRep() + "] expired");
-                    if (autoCloseJob) {
+                    if (autoCloseJob && isIsolated() == false) {
                        closeJob();
                    }
                    if (acquired) {
@ -361,16 +363,18 @@ public class DatafeedManager {
        }

        /**
-         * This stops a datafeed WITHOUT updating the corresponding persistent task.  It must ONLY be called
-         * immediately prior to shutting down a node.  Then the datafeed task can remain "started", and be
-         * relocated to a different node.  Calling this method at any other time will ruin the datafeed.
+         * This stops a datafeed WITHOUT updating the corresponding persistent task.  When called it
+         * will stop the datafeed from sending data to its job as quickly as possible.  The caller
+         * must do something sensible with the corresponding persistent task.  If the node is shutting
+         * down the task will automatically get reassigned.  Otherwise the caller must take action to
+         * remove or reassign the persistent task, or the datafeed will be left in limbo.
         */
        public void isolateDatafeed() {
            datafeedJob.isolate();
        }

-        public void setRelocating() {
-            isRelocating = true;
+        public void setNodeIsShuttingDown() {
+            isNodeShuttingDown = true;
        }

        private Long executeLookBack(long startTime, Long endTime) throws Exception {
--- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/set_upgrade_mode.yml
+++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/set_upgrade_mode.yml
@ -6,6 +6,10 @@ setup:
      indices.create:
        index: airline-data
        body:
+          settings:
+            index:
+              number_of_replicas: 0
+              number_of_shards: 1
          mappings:
            properties:
              time:
@ -53,10 +57,9 @@ setup:
        job_id: set-upgrade-mode-job

  - do:
-      headers:
-        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
-      ml.start_datafeed:
-        datafeed_id: set-upgrade-mode-job-datafeed
+      cluster.health:
+        index: airline-data
+        wait_for_status: green

 ---
 teardown:
@ -70,6 +73,10 @@ teardown:

 ---
 "Test setting upgrade_mode to false when it is already false":
+  - do:
+      ml.start_datafeed:
+        datafeed_id: set-upgrade-mode-job-datafeed
+
  - do:
      ml.set_upgrade_mode:
        enabled: false
@ -92,6 +99,22 @@ teardown:

 ---
 "Setting upgrade_mode to enabled":
+  - do:
+      ml.start_datafeed:
+        datafeed_id: set-upgrade-mode-job-datafeed
+
+  - do:
+      cat.tasks: {}
+  - match:
+      $body: |
+        /.+job.+/
+
+  - do:
+      cat.tasks: {}
+  - match:
+      $body: |
+        /.+datafeed.+/
+
  - do:
      ml.info: {}
  - match: { upgrade_mode: false }
@ -125,6 +148,22 @@ teardown:

 ---
 "Setting upgrade mode to disabled from enabled":
+  - do:
+      ml.start_datafeed:
+        datafeed_id: set-upgrade-mode-job-datafeed
+
+  - do:
+      cat.tasks: {}
+  - match:
+      $body: |
+        /.+job.+/
+
+  - do:
+      cat.tasks: {}
+  - match:
+      $body: |
+        /.+datafeed.+/
+
  - do:
      ml.set_upgrade_mode:
        enabled: true