[Transform] simplify class structure of indexer (#46306)

simplify transform task and indexer

 - remove redundant transform id
 - move the client data frame indexer (and its builder) into a separate file (see the builder sketch below)
Hendrik Muhs 2019-09-06 14:29:46 +02:00
parent bb7bff5e30
commit c2194aa7e1
6 changed files with 739 additions and 677 deletions
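
Below is a rough wiring sketch (not part of the diff) of how the simplified builder is used after this change; the local variable names are placeholders for the instances that DataFrameTransformPersistentTasksExecutor already holds. The point is that the builder no longer takes a transform id: the indexer derives its id from the transform config via getJobId().

// Illustrative only - mirrors the wiring done in DataFrameTransformPersistentTasksExecutor.
ClientDataFrameIndexerBuilder indexerBuilder =
    new ClientDataFrameIndexerBuilder()               // no transform id argument anymore
        .setAuditor(auditor)
        .setClient(client)
        .setTransformsCheckpointService(dataFrameTransformsCheckpointService)
        .setTransformsConfigManager(transformsConfigManager)
        .setTransformConfig(transformConfig)          // the id is taken from the config
        .setFieldMappings(fieldMappings);

// Built against the parent task; getJobId() resolves to transformConfig.getId(),
// replacing the removed transformId field.
ClientDataFrameIndexer indexer = indexerBuilder.build(dataFrameTransformTask);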

@@ -0,0 +1,591 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.dataframe.transforms;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.bulk.BulkAction;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchAction;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.logging.LoggerMessageFormat;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xpack.core.ClientHelper;
import org.elasticsearch.xpack.core.dataframe.DataFrameMessages;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerPosition;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerTransformStats;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformCheckpoint;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformConfig;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformProgress;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformState;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformStoredDoc;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformTaskState;
import org.elasticsearch.xpack.core.dataframe.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.indexing.IndexerState;
import org.elasticsearch.xpack.dataframe.checkpoint.CheckpointProvider;
import org.elasticsearch.xpack.dataframe.notifications.DataFrameAuditor;
import org.elasticsearch.xpack.dataframe.persistence.DataFrameTransformsConfigManager;
import org.elasticsearch.xpack.dataframe.persistence.SeqNoPrimaryTermAndIndex;
import org.elasticsearch.xpack.dataframe.transforms.pivot.AggregationResultUtils;
import java.time.Instant;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
class ClientDataFrameIndexer extends DataFrameIndexer {
private static final Logger logger = LogManager.getLogger(ClientDataFrameIndexer.class);
private long logEvery = 1;
private long logCount = 0;
private final Client client;
private final DataFrameTransformsConfigManager transformsConfigManager;
private final CheckpointProvider checkpointProvider;
private final DataFrameTransformTask transformTask;
private final AtomicInteger failureCount;
private volatile boolean auditBulkFailures = true;
// Indicates that the source has changed for the current run
private volatile boolean hasSourceChanged = true;
// Keeps track of the last exception that was written to our audit, keeps us from spamming the audit index
private volatile String lastAuditedExceptionMessage = null;
private final AtomicBoolean oldStatsCleanedUp = new AtomicBoolean(false);
private volatile Instant changesLastDetectedAt;
ClientDataFrameIndexer(DataFrameTransformsConfigManager transformsConfigManager,
CheckpointProvider checkpointProvider,
AtomicReference<IndexerState> initialState,
DataFrameIndexerPosition initialPosition,
Client client,
DataFrameAuditor auditor,
DataFrameIndexerTransformStats initialStats,
DataFrameTransformConfig transformConfig,
Map<String, String> fieldMappings,
DataFrameTransformProgress transformProgress,
DataFrameTransformCheckpoint lastCheckpoint,
DataFrameTransformCheckpoint nextCheckpoint,
DataFrameTransformTask parentTask) {
super(ExceptionsHelper.requireNonNull(parentTask, "parentTask")
.getThreadPool()
.executor(ThreadPool.Names.GENERIC),
ExceptionsHelper.requireNonNull(auditor, "auditor"),
transformConfig,
fieldMappings,
ExceptionsHelper.requireNonNull(initialState, "initialState"),
initialPosition,
initialStats == null ? new DataFrameIndexerTransformStats() : initialStats,
transformProgress,
lastCheckpoint,
nextCheckpoint);
this.transformsConfigManager = ExceptionsHelper.requireNonNull(transformsConfigManager, "transformsConfigManager");
this.checkpointProvider = ExceptionsHelper.requireNonNull(checkpointProvider, "checkpointProvider");
this.client = ExceptionsHelper.requireNonNull(client, "client");
this.transformTask = parentTask;
this.failureCount = new AtomicInteger(0);
}
@Override
protected void onStart(long now, ActionListener<Boolean> listener) {
if (transformTask.getTaskState() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to start while failed.", getJobId());
listener.onFailure(new ElasticsearchException("Attempted to start a failed transform [{}].", getJobId()));
return;
}
// On each run, we need to get the total number of docs and reset the count of processed docs
// Since multiple checkpoints can be executed in the task while it is running on the same node, we need to gather
// the progress here, and not in the executor.
ActionListener<Void> updateConfigListener = ActionListener.wrap(
updateConfigResponse -> {
if (initialRun()) {
createCheckpoint(ActionListener.wrap(cp -> {
nextCheckpoint = cp;
// If nextCheckpoint > 1, this means that we are now on the checkpoint AFTER the batch checkpoint
// Consequently, the idea of percent complete no longer makes sense.
if (nextCheckpoint.getCheckpoint() > 1) {
progress = new DataFrameTransformProgress(null, 0L, 0L);
super.onStart(now, listener);
return;
}
TransformProgressGatherer.getInitialProgress(this.client, buildFilterQuery(), getConfig(), ActionListener.wrap(
newProgress -> {
logger.trace("[{}] reset the progress from [{}] to [{}].", getJobId(), progress, newProgress);
progress = newProgress;
super.onStart(now, listener);
},
failure -> {
progress = null;
logger.warn(new ParameterizedMessage("[{}] unable to load progress information for task.",
getJobId()),
failure);
super.onStart(now, listener);
}
));
}, listener::onFailure));
} else {
super.onStart(now, listener);
}
},
listener::onFailure
);
// If we are continuous, we will want to verify we have the latest stored configuration
ActionListener<Void> changedSourceListener = ActionListener.wrap(
r -> {
if (isContinuous()) {
transformsConfigManager.getTransformConfiguration(getJobId(), ActionListener.wrap(
config -> {
transformConfig = config;
logger.debug("[{}] successfully refreshed data frame transform config from index.", getJobId());
updateConfigListener.onResponse(null);
},
failure -> {
String msg = DataFrameMessages.getMessage(
DataFrameMessages.FAILED_TO_RELOAD_TRANSFORM_CONFIGURATION,
getJobId());
logger.error(msg, failure);
// If the transform config index or the transform config is gone, something serious occurred
// We are in an unknown state and should fail out
if (failure instanceof ResourceNotFoundException) {
updateConfigListener.onFailure(new TransformConfigReloadingException(msg, failure));
} else {
auditor.warning(getJobId(), msg);
updateConfigListener.onResponse(null);
}
}
));
} else {
updateConfigListener.onResponse(null);
}
},
listener::onFailure
);
// If we are not on the initial batch checkpoint and it's the first pass of whatever continuous checkpoint we are on,
// we should verify if there are local changes based on the sync config. If not, do not proceed further and exit.
if (transformTask.getCheckpoint() > 0 && initialRun()) {
sourceHasChanged(ActionListener.wrap(
hasChanged -> {
hasSourceChanged = hasChanged;
if (hasChanged) {
changesLastDetectedAt = Instant.now();
logger.debug("[{}] source has changed, triggering new indexer run.", getJobId());
changedSourceListener.onResponse(null);
} else {
logger.trace("[{}] source has not changed, finish indexer early.", getJobId());
// No changes, stop executing
listener.onResponse(false);
}
},
failure -> {
// If we failed determining if the source changed, it's safer to assume there were changes.
// We should allow the failure path to complete as normal
hasSourceChanged = true;
listener.onFailure(failure);
}
));
} else {
hasSourceChanged = true;
changedSourceListener.onResponse(null);
}
}
public CheckpointProvider getCheckpointProvider() {
return checkpointProvider;
}
Instant getChangesLastDetectedAt() {
return changesLastDetectedAt;
}
@Override
public synchronized boolean maybeTriggerAsyncJob(long now) {
if (transformTask.getTaskState() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] schedule was triggered for transform but task is failed. Ignoring trigger.", getJobId());
return false;
}
// ignore trigger if indexer is running, prevents log spam in A2P indexer
IndexerState indexerState = getState();
if (IndexerState.INDEXING.equals(indexerState) || IndexerState.STOPPING.equals(indexerState)) {
logger.debug("[{}] indexer for transform has state [{}]. Ignoring trigger.", getJobId(), indexerState);
return false;
}
return super.maybeTriggerAsyncJob(now);
}
@Override
protected void doNextSearch(SearchRequest request, ActionListener<SearchResponse> nextPhase) {
if (transformTask.getTaskState() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to search while failed.", getJobId());
nextPhase.onFailure(new ElasticsearchException("Attempted to do a search request for failed transform [{}].",
getJobId()));
return;
}
ClientHelper.executeWithHeadersAsync(transformConfig.getHeaders(), ClientHelper.DATA_FRAME_ORIGIN, client,
SearchAction.INSTANCE, request, nextPhase);
}
@Override
protected void doNextBulk(BulkRequest request, ActionListener<BulkResponse> nextPhase) {
if (transformTask.getTaskState() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to bulk index while failed.", getJobId());
nextPhase.onFailure(new ElasticsearchException("Attempted to do a bulk index request for failed transform [{}].",
getJobId()));
return;
}
ClientHelper.executeWithHeadersAsync(transformConfig.getHeaders(),
ClientHelper.DATA_FRAME_ORIGIN,
client,
BulkAction.INSTANCE,
request,
ActionListener.wrap(bulkResponse -> {
if (bulkResponse.hasFailures()) {
int failureCount = 0;
for(BulkItemResponse item : bulkResponse.getItems()) {
if (item.isFailed()) {
failureCount++;
}
// TODO gather information on irrecoverable failures and update isIrrecoverableFailure
}
if (auditBulkFailures) {
auditor.warning(getJobId(),
"Experienced at least [" +
failureCount +
"] bulk index failures. See the logs of the node running the transform for details. " +
bulkResponse.buildFailureMessage());
auditBulkFailures = false;
}
// This calls AsyncTwoPhaseIndexer#finishWithIndexingFailure
// It increments the indexing failure, and then calls the `onFailure` logic
nextPhase.onFailure(
new BulkIndexingException("Bulk index experienced failures. " +
"See the logs of the node running the transform for details."));
} else {
auditBulkFailures = true;
nextPhase.onResponse(bulkResponse);
}
}, nextPhase::onFailure));
}
@Override
protected void doSaveState(IndexerState indexerState, DataFrameIndexerPosition position, Runnable next) {
if (transformTask.getTaskState() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to save state and stats while failed.", getJobId());
// If we are failed, we should call next to allow failure handling to occur if necessary.
next.run();
return;
}
if (indexerState.equals(IndexerState.ABORTING)) {
// If we're aborting, just invoke `next` (which is likely an onFailure handler)
next.run();
return;
}
// This means that the indexer was triggered to discover changes, found none, and exited early.
// If the state is `STOPPED` this means that DataFrameTransformTask#stop was called while we were checking for changes.
// Allow the stop call path to continue
if (hasSourceChanged == false && indexerState.equals(IndexerState.STOPPED) == false) {
next.run();
return;
}
DataFrameTransformTaskState taskState = transformTask.getTaskState();
if (indexerState.equals(IndexerState.STARTED)
&& transformTask.getCheckpoint() == 1
&& this.isContinuous() == false) {
// set both to stopped so they are persisted as such
indexerState = IndexerState.STOPPED;
auditor.info(transformConfig.getId(), "Data frame finished indexing all data, initiating stop");
logger.info("[{}] data frame transform finished indexing all data, initiating stop.", transformConfig.getId());
}
// If we are `STOPPED` on a `doSaveState` call, that indicates we transitioned to `STOPPED` from `STOPPING`
// OR we called `doSaveState` manually as the indexer was not actively running.
// Since we save the state to an index, we should make sure that our task state is in parity with the indexer state
if (indexerState.equals(IndexerState.STOPPED)) {
// We don't want to adjust the stored taskState because as soon as it is `STOPPED` a user could call
// .start again.
taskState = DataFrameTransformTaskState.STOPPED;
}
final DataFrameTransformState state = new DataFrameTransformState(
taskState,
indexerState,
position,
transformTask.getCheckpoint(),
transformTask.getStateReason(),
getProgress());
logger.debug("[{}] updating persistent state of transform to [{}].", transformConfig.getId(), state.toString());
// This could be `null` but the putOrUpdateTransformStoredDoc handles that case just fine
SeqNoPrimaryTermAndIndex seqNoPrimaryTermAndIndex = transformTask.getSeqNoPrimaryTermAndIndex();
// Persist the current state and stats in the internal index. The interval of this method being
// called is controlled by AsyncTwoPhaseIndexer#onBulkResponse which calls doSaveState every so
// often when doing bulk indexing calls or at the end of one indexing run.
transformsConfigManager.putOrUpdateTransformStoredDoc(
new DataFrameTransformStoredDoc(getJobId(), state, getStats()),
seqNoPrimaryTermAndIndex,
ActionListener.wrap(
r -> {
transformTask.updateSeqNoPrimaryTermAndIndex(seqNoPrimaryTermAndIndex, r);
// for auto stop shutdown the task
if (state.getTaskState().equals(DataFrameTransformTaskState.STOPPED)) {
transformTask.shutdown();
}
// Only do this clean-up once; if it succeeded, there is no reason to run the query again.
if (oldStatsCleanedUp.compareAndSet(false, true)) {
transformsConfigManager.deleteOldTransformStoredDocuments(getJobId(), ActionListener.wrap(
nil -> {
logger.trace("[{}] deleted old transform stats and state document", getJobId());
next.run();
},
e -> {
String msg = LoggerMessageFormat.format("[{}] failed deleting old transform configurations.",
getJobId());
logger.warn(msg, e);
// If we have failed, we should attempt the clean up again later
oldStatsCleanedUp.set(false);
next.run();
}
));
} else {
next.run();
}
},
statsExc -> {
logger.error(new ParameterizedMessage("[{}] updating stats of transform failed.",
transformConfig.getId()),
statsExc);
auditor.warning(getJobId(),
"Failure updating stats of transform: " + statsExc.getMessage());
// for auto stop shutdown the task
if (state.getTaskState().equals(DataFrameTransformTaskState.STOPPED)) {
transformTask.shutdown();
}
next.run();
}
));
}
@Override
protected void onFailure(Exception exc) {
// the failure handler must not throw an exception due to internal problems
try {
handleFailure(exc);
} catch (Exception e) {
logger.error(
new ParameterizedMessage("[{}] data frame transform encountered an unexpected internal exception: ", getJobId()),
e);
}
}
@Override
protected void onFinish(ActionListener<Void> listener) {
try {
// This indicates an early exit since no changes were found.
// So, don't treat this like a checkpoint being completed, as no work was done.
if (hasSourceChanged == false) {
listener.onResponse(null);
return;
}
// TODO: needs cleanup; super is called with a listener, but listener.onResponse is called below
// super.onFinish() fortunately ignores the listener
super.onFinish(listener);
long checkpoint = transformTask.incrementCheckpoint();
lastCheckpoint = getNextCheckpoint();
nextCheckpoint = null;
// Reset our failure count as we have finished and may start again with a new checkpoint
failureCount.set(0);
transformTask.setStateReason(null);
// With bucket_selector we could have read all the buckets and completed the transform
// but not "see" all the buckets since they were filtered out. Consequently, progress would
// show less than 100% even though we are done.
// NOTE: this method is called in the same thread as the processing thread.
// Theoretically, there should not be a race condition with updating progress here.
// NOTE 2: getPercentComplete should only NOT be null on the first (batch) checkpoint
if (progress != null && progress.getPercentComplete() != null && progress.getPercentComplete() < 100.0) {
progress.incrementDocsProcessed(progress.getTotalDocs() - progress.getDocumentsProcessed());
}
// If the last checkpoint is now greater than 1, that means that we have just processed the first
// continuous checkpoint and should start recording the exponential averages
if (lastCheckpoint != null && lastCheckpoint.getCheckpoint() > 1) {
long docsIndexed = 0;
long docsProcessed = 0;
// This should not happen as we simply create a new one when we reach continuous checkpoints
// but this is a paranoid `null` check
if (progress != null) {
docsIndexed = progress.getDocumentsIndexed();
docsProcessed = progress.getDocumentsProcessed();
}
long durationMs = System.currentTimeMillis() - lastCheckpoint.getTimestamp();
getStats().incrementCheckpointExponentialAverages(durationMs < 0 ? 0 : durationMs, docsIndexed, docsProcessed);
}
if (shouldAuditOnFinish(checkpoint)) {
auditor.info(getJobId(),
"Finished indexing for data frame transform checkpoint [" + checkpoint + "].");
}
logger.debug(
"[{}] finished indexing for data frame transform checkpoint [{}].", getJobId(), checkpoint);
auditBulkFailures = true;
listener.onResponse(null);
} catch (Exception e) {
listener.onFailure(e);
}
}
/**
* Indicates if an audit message should be written when onFinish is called for the given checkpoint
* We audit the first checkpoint, and then every 10 checkpoints until completedCheckpoint == 99
* Then we audit every 100, until completedCheckpoint == 999
*
* Then we always audit every 1_000 checkpoints
*
* @param completedCheckpoint The checkpoint that was just completed
* @return {@code true} if an audit message should be written
*/
protected boolean shouldAuditOnFinish(long completedCheckpoint) {
if (++logCount % logEvery != 0) {
return false;
}
if (completedCheckpoint == 0) {
return true;
}
int log10Checkpoint = (int) Math.floor(Math.log10(completedCheckpoint));
logEvery = log10Checkpoint >= 3 ? 1_000 : (int)Math.pow(10.0, log10Checkpoint);
logCount = 0;
return true;
}
@Override
protected void onStop() {
auditor.info(transformConfig.getId(), "Data frame transform has stopped.");
logger.info("[{}] data frame transform has stopped.", transformConfig.getId());
}
@Override
protected void onAbort() {
auditor.info(transformConfig.getId(), "Received abort request, stopping data frame transform.");
logger.info("[{}] data frame transform received abort request. Stopping indexer.", transformConfig.getId());
transformTask.shutdown();
}
@Override
protected void createCheckpoint(ActionListener<DataFrameTransformCheckpoint> listener) {
checkpointProvider.createNextCheckpoint(getLastCheckpoint(), ActionListener.wrap(
checkpoint -> transformsConfigManager.putTransformCheckpoint(checkpoint,
ActionListener.wrap(
putCheckPointResponse -> listener.onResponse(checkpoint),
createCheckpointException -> {
logger.warn(new ParameterizedMessage("[{}] failed to create checkpoint.", getJobId()),
createCheckpointException);
listener.onFailure(
new RuntimeException("Failed to create checkpoint due to " + createCheckpointException.getMessage(),
createCheckpointException));
}
)),
getCheckPointException -> {
logger.warn(new ParameterizedMessage("[{}] failed to retrieve checkpoint.", getJobId()),
getCheckPointException);
listener.onFailure(
new RuntimeException("Failed to retrieve checkpoint due to " + getCheckPointException.getMessage(),
getCheckPointException));
}
));
}
@Override
protected void sourceHasChanged(ActionListener<Boolean> hasChangedListener) {
checkpointProvider.sourceHasChanged(getLastCheckpoint(),
ActionListener.wrap(
hasChanged -> {
logger.trace("[{}] change detected [{}].", getJobId(), hasChanged);
hasChangedListener.onResponse(hasChanged);
},
e -> {
logger.warn(
new ParameterizedMessage(
"[{}] failed to detect changes for data frame transform. Skipping update till next check.",
getJobId()),
e);
auditor.warning(getJobId(),
"Failed to detect changes for data frame transform, skipping update till next check. Exception: "
+ e.getMessage());
hasChangedListener.onResponse(false);
}));
}
private boolean isIrrecoverableFailure(Exception e) {
return e instanceof IndexNotFoundException
|| e instanceof AggregationResultUtils.AggregationExtractionException
|| e instanceof TransformConfigReloadingException;
}
synchronized void handleFailure(Exception e) {
logger.warn(new ParameterizedMessage("[{}] data frame transform encountered an exception: ",
getJobId()),
e);
if (handleCircuitBreakingException(e)) {
return;
}
if (isIrrecoverableFailure(e) || failureCount.incrementAndGet() > transformTask.getNumFailureRetries()) {
String failureMessage = isIrrecoverableFailure(e) ?
"task encountered irrecoverable failure: " + e.getMessage() :
"task encountered more than " + transformTask.getNumFailureRetries() + " failures; latest failure: " + e.getMessage();
failIndexer(failureMessage);
} else {
// Since our schedule fires again very quickly after failures, it is possible to run into the same failure numerous
// times in a row. We do not want to spam the audit log with repeated failures, so only record the first one
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
auditor.warning(getJobId(),
"Data frame transform encountered an exception: " + e.getMessage() +
" Will attempt again at next scheduled trigger.");
lastAuditedExceptionMessage = e.getMessage();
}
}
}
@Override
protected void failIndexer(String failureMessage) {
logger.error("[{}] transform has failed; experienced: [{}].", getJobId(), failureMessage);
auditor.error(getJobId(), failureMessage);
transformTask.markAsFailed(failureMessage, ActionListener.wrap(
r -> {
// Successfully marked as failed, reset counter so that task can be restarted
failureCount.set(0);
}, e -> {}));
}
// Considered a recoverable indexing failure
private static class BulkIndexingException extends ElasticsearchException {
BulkIndexingException(String msg, Object... args) {
super(msg, args);
}
}
private static class TransformConfigReloadingException extends ElasticsearchException {
TransformConfigReloadingException(String msg, Throwable cause, Object... args) {
super(msg, cause, args);
}
}
}

@@ -0,0 +1,123 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.dataframe.transforms;
import org.elasticsearch.client.Client;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerPosition;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerTransformStats;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformCheckpoint;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformConfig;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformProgress;
import org.elasticsearch.xpack.core.indexing.IndexerState;
import org.elasticsearch.xpack.dataframe.checkpoint.CheckpointProvider;
import org.elasticsearch.xpack.dataframe.checkpoint.DataFrameTransformsCheckpointService;
import org.elasticsearch.xpack.dataframe.notifications.DataFrameAuditor;
import org.elasticsearch.xpack.dataframe.persistence.DataFrameTransformsConfigManager;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
class ClientDataFrameIndexerBuilder {
private Client client;
private DataFrameTransformsConfigManager transformsConfigManager;
private DataFrameTransformsCheckpointService transformsCheckpointService;
private DataFrameAuditor auditor;
private Map<String, String> fieldMappings;
private DataFrameTransformConfig transformConfig;
private DataFrameIndexerTransformStats initialStats;
private IndexerState indexerState = IndexerState.STOPPED;
private DataFrameIndexerPosition initialPosition;
private DataFrameTransformProgress progress;
private DataFrameTransformCheckpoint lastCheckpoint;
private DataFrameTransformCheckpoint nextCheckpoint;
ClientDataFrameIndexerBuilder() {
this.initialStats = new DataFrameIndexerTransformStats();
}
ClientDataFrameIndexer build(DataFrameTransformTask parentTask) {
CheckpointProvider checkpointProvider = transformsCheckpointService.getCheckpointProvider(transformConfig);
return new ClientDataFrameIndexer(this.transformsConfigManager,
checkpointProvider,
new AtomicReference<>(this.indexerState),
this.initialPosition,
this.client,
this.auditor,
this.initialStats,
this.transformConfig,
this.fieldMappings,
this.progress,
this.lastCheckpoint,
this.nextCheckpoint,
parentTask);
}
ClientDataFrameIndexerBuilder setClient(Client client) {
this.client = client;
return this;
}
ClientDataFrameIndexerBuilder setTransformsConfigManager(DataFrameTransformsConfigManager transformsConfigManager) {
this.transformsConfigManager = transformsConfigManager;
return this;
}
ClientDataFrameIndexerBuilder setTransformsCheckpointService(DataFrameTransformsCheckpointService transformsCheckpointService) {
this.transformsCheckpointService = transformsCheckpointService;
return this;
}
ClientDataFrameIndexerBuilder setAuditor(DataFrameAuditor auditor) {
this.auditor = auditor;
return this;
}
ClientDataFrameIndexerBuilder setFieldMappings(Map<String, String> fieldMappings) {
this.fieldMappings = fieldMappings;
return this;
}
ClientDataFrameIndexerBuilder setTransformConfig(DataFrameTransformConfig transformConfig) {
this.transformConfig = transformConfig;
return this;
}
DataFrameTransformConfig getTransformConfig() {
return this.transformConfig;
}
ClientDataFrameIndexerBuilder setInitialStats(DataFrameIndexerTransformStats initialStats) {
this.initialStats = initialStats;
return this;
}
ClientDataFrameIndexerBuilder setIndexerState(IndexerState indexerState) {
this.indexerState = indexerState;
return this;
}
ClientDataFrameIndexerBuilder setInitialPosition(DataFrameIndexerPosition initialPosition) {
this.initialPosition = initialPosition;
return this;
}
ClientDataFrameIndexerBuilder setProgress(DataFrameTransformProgress progress) {
this.progress = progress;
return this;
}
ClientDataFrameIndexerBuilder setLastCheckpoint(DataFrameTransformCheckpoint lastCheckpoint) {
this.lastCheckpoint = lastCheckpoint;
return this;
}
ClientDataFrameIndexerBuilder setNextCheckpoint(DataFrameTransformCheckpoint nextCheckpoint) {
this.nextCheckpoint = nextCheckpoint;
return this;
}
}

@@ -117,6 +117,11 @@ public abstract class DataFrameIndexer extends AsyncTwoPhaseIndexer<DataFrameInd
return pageSize;
}
@Override
protected String getJobId() {
return transformConfig.getId();
}
public DataFrameTransformConfig getConfig() {
return transformConfig;
}

@@ -130,8 +130,8 @@ public class DataFrameTransformPersistentTasksExecutor extends PersistentTasksEx
// We want the rest of the state to be populated in the task when it is loaded on the node so that users can force start it again
// later if they want.
final DataFrameTransformTask.ClientDataFrameIndexerBuilder indexerBuilder =
new DataFrameTransformTask.ClientDataFrameIndexerBuilder(transformId)
final ClientDataFrameIndexerBuilder indexerBuilder =
new ClientDataFrameIndexerBuilder()
.setAuditor(auditor)
.setClient(client)
.setTransformsCheckpointService(dataFrameTransformsCheckpointService)
@@ -300,7 +300,7 @@ public class DataFrameTransformPersistentTasksExecutor extends PersistentTasksEx
}
private void startTask(DataFrameTransformTask buildTask,
DataFrameTransformTask.ClientDataFrameIndexerBuilder indexerBuilder,
ClientDataFrameIndexerBuilder indexerBuilder,
Long previousCheckpoint,
ActionListener<StartDataFrameTransformTaskAction.Response> listener) {
buildTask.initializeIndexer(indexerBuilder);

@@ -12,27 +12,15 @@ import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.bulk.BulkAction;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchAction;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.logging.LoggerMessageFormat;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.persistent.AllocatedPersistentTask;
import org.elasticsearch.persistent.PersistentTasksCustomMetaData;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.tasks.TaskId;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xpack.core.ClientHelper;
import org.elasticsearch.xpack.core.dataframe.DataFrameField;
import org.elasticsearch.xpack.core.dataframe.DataFrameMessages;
import org.elasticsearch.xpack.core.dataframe.action.StartDataFrameTransformTaskAction;
@@ -40,29 +28,18 @@ import org.elasticsearch.xpack.core.dataframe.action.StartDataFrameTransformTask
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerPosition;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameIndexerTransformStats;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransform;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformCheckpoint;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformCheckpointingInfo;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformConfig;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformProgress;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformState;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformStoredDoc;
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformTaskState;
import org.elasticsearch.xpack.core.dataframe.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.indexing.IndexerState;
import org.elasticsearch.xpack.core.scheduler.SchedulerEngine;
import org.elasticsearch.xpack.core.scheduler.SchedulerEngine.Event;
import org.elasticsearch.xpack.dataframe.checkpoint.CheckpointProvider;
import org.elasticsearch.xpack.dataframe.checkpoint.DataFrameTransformsCheckpointService;
import org.elasticsearch.xpack.dataframe.notifications.DataFrameAuditor;
import org.elasticsearch.xpack.dataframe.persistence.DataFrameTransformsConfigManager;
import org.elasticsearch.xpack.dataframe.persistence.SeqNoPrimaryTermAndIndex;
import org.elasticsearch.xpack.dataframe.transforms.pivot.AggregationResultUtils;
import java.time.Instant;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
@@ -94,7 +71,6 @@ public class DataFrameTransformTask extends AllocatedPersistentTask implements S
private final DataFrameAuditor auditor;
private final DataFrameIndexerPosition initialPosition;
private final IndexerState initialIndexerState;
private volatile Instant changesLastDetectedAt;
private final SetOnce<ClientDataFrameIndexer> indexer = new SetOnce<>();
@@ -190,6 +166,10 @@ public class DataFrameTransformTask extends AllocatedPersistentTask implements S
return currentCheckpoint.get();
}
long incrementCheckpoint() {
return currentCheckpoint.getAndIncrement();
}
public void getCheckpointingInfo(DataFrameTransformsCheckpointService transformsCheckpointService,
ActionListener<DataFrameTransformCheckpointingInfo> listener) {
ClientDataFrameIndexer indexer = getIndexer();
@@ -209,10 +189,10 @@ public class DataFrameTransformTask extends AllocatedPersistentTask implements S
indexer.getProgress(),
ActionListener.wrap(
info -> {
if (changesLastDetectedAt == null) {
if (indexer.getChangesLastDetectedAt() == null) {
listener.onResponse(info);
} else {
listener.onResponse(info.setChangesLastDetectedAt(changesLastDetectedAt));
listener.onResponse(info.setChangesLastDetectedAt(indexer.getChangesLastDetectedAt()));
}
},
listener::onFailure
@@ -469,12 +449,12 @@ public class DataFrameTransformTask extends AllocatedPersistentTask implements S
}
}
public DataFrameTransformTask setNumFailureRetries(int numFailureRetries) {
DataFrameTransformTask setNumFailureRetries(int numFailureRetries) {
this.numFailureRetries = numFailureRetries;
return this;
}
public int getNumFailureRetries() {
int getNumFailureRetries() {
return numFailureRetries;
}
@@ -517,656 +497,19 @@ public class DataFrameTransformTask extends AllocatedPersistentTask implements S
return seqNoPrimaryTermAndIndex.get();
}
static class ClientDataFrameIndexerBuilder {
private Client client;
private DataFrameTransformsConfigManager transformsConfigManager;
private DataFrameTransformsCheckpointService transformsCheckpointService;
private String transformId;
private DataFrameAuditor auditor;
private Map<String, String> fieldMappings;
private DataFrameTransformConfig transformConfig;
private DataFrameIndexerTransformStats initialStats;
private IndexerState indexerState = IndexerState.STOPPED;
private DataFrameIndexerPosition initialPosition;
private DataFrameTransformProgress progress;
private DataFrameTransformCheckpoint lastCheckpoint;
private DataFrameTransformCheckpoint nextCheckpoint;
ClientDataFrameIndexerBuilder(String transformId) {
this.transformId = transformId;
this.initialStats = new DataFrameIndexerTransformStats();
}
ClientDataFrameIndexer build(DataFrameTransformTask parentTask) {
CheckpointProvider checkpointProvider = transformsCheckpointService.getCheckpointProvider(transformConfig);
return new ClientDataFrameIndexer(this.transformId,
this.transformsConfigManager,
checkpointProvider,
new AtomicReference<>(this.indexerState),
this.initialPosition,
this.client,
this.auditor,
this.initialStats,
this.transformConfig,
this.fieldMappings,
this.progress,
this.lastCheckpoint,
this.nextCheckpoint,
parentTask);
}
ClientDataFrameIndexerBuilder setClient(Client client) {
this.client = client;
return this;
}
ClientDataFrameIndexerBuilder setTransformsConfigManager(DataFrameTransformsConfigManager transformsConfigManager) {
this.transformsConfigManager = transformsConfigManager;
return this;
}
ClientDataFrameIndexerBuilder setTransformsCheckpointService(DataFrameTransformsCheckpointService transformsCheckpointService) {
this.transformsCheckpointService = transformsCheckpointService;
return this;
}
ClientDataFrameIndexerBuilder setTransformId(String transformId) {
this.transformId = transformId;
return this;
}
ClientDataFrameIndexerBuilder setAuditor(DataFrameAuditor auditor) {
this.auditor = auditor;
return this;
}
ClientDataFrameIndexerBuilder setFieldMappings(Map<String, String> fieldMappings) {
this.fieldMappings = fieldMappings;
return this;
}
ClientDataFrameIndexerBuilder setTransformConfig(DataFrameTransformConfig transformConfig) {
this.transformConfig = transformConfig;
return this;
}
DataFrameTransformConfig getTransformConfig() {
return this.transformConfig;
}
ClientDataFrameIndexerBuilder setInitialStats(DataFrameIndexerTransformStats initialStats) {
this.initialStats = initialStats;
return this;
}
ClientDataFrameIndexerBuilder setIndexerState(IndexerState indexerState) {
this.indexerState = indexerState;
return this;
}
ClientDataFrameIndexerBuilder setInitialPosition(DataFrameIndexerPosition initialPosition) {
this.initialPosition = initialPosition;
return this;
}
ClientDataFrameIndexerBuilder setProgress(DataFrameTransformProgress progress) {
this.progress = progress;
return this;
}
ClientDataFrameIndexerBuilder setLastCheckpoint(DataFrameTransformCheckpoint lastCheckpoint) {
this.lastCheckpoint = lastCheckpoint;
return this;
}
ClientDataFrameIndexerBuilder setNextCheckpoint(DataFrameTransformCheckpoint nextCheckpoint) {
this.nextCheckpoint = nextCheckpoint;
return this;
}
ThreadPool getThreadPool() {
return threadPool;
}
static class ClientDataFrameIndexer extends DataFrameIndexer {
private long logEvery = 1;
private long logCount = 0;
private final Client client;
private final DataFrameTransformsConfigManager transformsConfigManager;
private final CheckpointProvider checkpointProvider;
private final String transformId;
private final DataFrameTransformTask transformTask;
private final AtomicInteger failureCount;
private volatile boolean auditBulkFailures = true;
// Indicates that the source has changed for the current run
private volatile boolean hasSourceChanged = true;
// Keeps track of the last exception that was written to our audit, keeps us from spamming the audit index
private volatile String lastAuditedExceptionMessage = null;
private final AtomicBoolean oldStatsCleanedUp = new AtomicBoolean(false);
ClientDataFrameIndexer(String transformId,
DataFrameTransformsConfigManager transformsConfigManager,
CheckpointProvider checkpointProvider,
AtomicReference<IndexerState> initialState,
DataFrameIndexerPosition initialPosition,
Client client,
DataFrameAuditor auditor,
DataFrameIndexerTransformStats initialStats,
DataFrameTransformConfig transformConfig,
Map<String, String> fieldMappings,
DataFrameTransformProgress transformProgress,
DataFrameTransformCheckpoint lastCheckpoint,
DataFrameTransformCheckpoint nextCheckpoint,
DataFrameTransformTask parentTask) {
super(ExceptionsHelper.requireNonNull(parentTask, "parentTask")
.threadPool
.executor(ThreadPool.Names.GENERIC),
ExceptionsHelper.requireNonNull(auditor, "auditor"),
transformConfig,
fieldMappings,
ExceptionsHelper.requireNonNull(initialState, "initialState"),
initialPosition,
initialStats == null ? new DataFrameIndexerTransformStats() : initialStats,
transformProgress,
lastCheckpoint,
nextCheckpoint);
this.transformId = ExceptionsHelper.requireNonNull(transformId, "transformId");
this.transformsConfigManager = ExceptionsHelper.requireNonNull(transformsConfigManager, "transformsConfigManager");
this.checkpointProvider = ExceptionsHelper.requireNonNull(checkpointProvider, "checkpointProvider");
this.client = ExceptionsHelper.requireNonNull(client, "client");
this.transformTask = parentTask;
this.failureCount = new AtomicInteger(0);
}
@Override
protected void onStart(long now, ActionListener<Boolean> listener) {
if (transformTask.taskState.get() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to start while failed.", transformId);
listener.onFailure(new ElasticsearchException("Attempted to start a failed transform [{}].", transformId));
return;
}
// On each run, we need to get the total number of docs and reset the count of processed docs
// Since multiple checkpoints can be executed in the task while it is running on the same node, we need to gather
// the progress here, and not in the executor.
ActionListener<Void> updateConfigListener = ActionListener.wrap(
updateConfigResponse -> {
if (initialRun()) {
createCheckpoint(ActionListener.wrap(cp -> {
nextCheckpoint = cp;
// If nextCheckpoint > 1, this means that we are now on the checkpoint AFTER the batch checkpoint
// Consequently, the idea of percent complete no longer makes sense.
if (nextCheckpoint.getCheckpoint() > 1) {
progress = new DataFrameTransformProgress(null, 0L, 0L);
super.onStart(now, listener);
return;
}
TransformProgressGatherer.getInitialProgress(this.client, buildFilterQuery(), getConfig(), ActionListener.wrap(
newProgress -> {
logger.trace("[{}] reset the progress from [{}] to [{}].", transformId, progress, newProgress);
progress = newProgress;
super.onStart(now, listener);
},
failure -> {
progress = null;
logger.warn(new ParameterizedMessage("[{}] unable to load progress information for task.",
transformId),
failure);
super.onStart(now, listener);
}
));
}, listener::onFailure));
} else {
super.onStart(now, listener);
}
},
listener::onFailure
);
// If we are continuous, we will want to verify we have the latest stored configuration
ActionListener<Void> changedSourceListener = ActionListener.wrap(
r -> {
if (isContinuous()) {
transformsConfigManager.getTransformConfiguration(getJobId(), ActionListener.wrap(
config -> {
transformConfig = config;
logger.debug("[{}] successfully refreshed data frame transform config from index.", transformId);
updateConfigListener.onResponse(null);
},
failure -> {
String msg = DataFrameMessages.getMessage(
DataFrameMessages.FAILED_TO_RELOAD_TRANSFORM_CONFIGURATION,
getJobId());
logger.error(msg, failure);
// If the transform config index or the transform config is gone, something serious occurred
// We are in an unknown state and should fail out
if (failure instanceof ResourceNotFoundException) {
updateConfigListener.onFailure(new TransformConfigReloadingException(msg, failure));
} else {
auditor.warning(getJobId(), msg);
updateConfigListener.onResponse(null);
}
}
));
} else {
updateConfigListener.onResponse(null);
}
},
listener::onFailure
);
// If we are not on the initial batch checkpoint and it's the first pass of whatever continuous checkpoint we are on,
// we should verify if there are local changes based on the sync config. If not, do not proceed further and exit.
if (transformTask.currentCheckpoint.get() > 0 && initialRun()) {
sourceHasChanged(ActionListener.wrap(
hasChanged -> {
hasSourceChanged = hasChanged;
if (hasChanged) {
transformTask.changesLastDetectedAt = Instant.now();
logger.debug("[{}] source has changed, triggering new indexer run.", transformId);
changedSourceListener.onResponse(null);
} else {
logger.trace("[{}] source has not changed, finish indexer early.", transformId);
// No changes, stop executing
listener.onResponse(false);
}
},
failure -> {
// If we failed determining if the source changed, it's safer to assume there were changes.
// We should allow the failure path to complete as normal
hasSourceChanged = true;
listener.onFailure(failure);
}
));
} else {
hasSourceChanged = true;
changedSourceListener.onResponse(null);
}
}
@Override
protected String getJobId() {
return transformId;
}
public CheckpointProvider getCheckpointProvider() {
return checkpointProvider;
}
@Override
public synchronized boolean maybeTriggerAsyncJob(long now) {
if (transformTask.taskState.get() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] schedule was triggered for transform but task is failed. Ignoring trigger.", getJobId());
return false;
}
// ignore trigger if indexer is running, prevents log spam in A2P indexer
IndexerState indexerState = getState();
if (IndexerState.INDEXING.equals(indexerState) || IndexerState.STOPPING.equals(indexerState)) {
logger.debug("[{}] indexer for transform has state [{}]. Ignoring trigger.", getJobId(), indexerState);
return false;
}
return super.maybeTriggerAsyncJob(now);
}
@Override
protected void doNextSearch(SearchRequest request, ActionListener<SearchResponse> nextPhase) {
if (transformTask.taskState.get() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to search while failed.", transformId);
nextPhase.onFailure(new ElasticsearchException("Attempted to do a search request for failed transform [{}].",
transformId));
return;
}
ClientHelper.executeWithHeadersAsync(transformConfig.getHeaders(), ClientHelper.DATA_FRAME_ORIGIN, client,
SearchAction.INSTANCE, request, nextPhase);
}
@Override
protected void doNextBulk(BulkRequest request, ActionListener<BulkResponse> nextPhase) {
if (transformTask.taskState.get() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to bulk index while failed.", transformId);
nextPhase.onFailure(new ElasticsearchException("Attempted to do a bulk index request for failed transform [{}].",
transformId));
return;
}
ClientHelper.executeWithHeadersAsync(transformConfig.getHeaders(),
ClientHelper.DATA_FRAME_ORIGIN,
client,
BulkAction.INSTANCE,
request,
ActionListener.wrap(bulkResponse -> {
if (bulkResponse.hasFailures()) {
int failureCount = 0;
for(BulkItemResponse item : bulkResponse.getItems()) {
if (item.isFailed()) {
failureCount++;
}
// TODO gather information on irrecoverable failures and update isIrrecoverableFailure
}
if (auditBulkFailures) {
auditor.warning(transformId,
"Experienced at least [" +
failureCount +
"] bulk index failures. See the logs of the node running the transform for details. " +
bulkResponse.buildFailureMessage());
auditBulkFailures = false;
}
// This calls AsyncTwoPhaseIndexer#finishWithIndexingFailure
// It increments the indexing failure, and then calls the `onFailure` logic
nextPhase.onFailure(
new BulkIndexingException("Bulk index experienced failures. " +
"See the logs of the node running the transform for details."));
} else {
auditBulkFailures = true;
nextPhase.onResponse(bulkResponse);
}
}, nextPhase::onFailure));
}
@Override
protected void doSaveState(IndexerState indexerState, DataFrameIndexerPosition position, Runnable next) {
if (transformTask.taskState.get() == DataFrameTransformTaskState.FAILED) {
logger.debug("[{}] attempted to save state and stats while failed.", transformId);
// If we are failed, we should call next to allow failure handling to occur if necessary.
next.run();
return;
}
if (indexerState.equals(IndexerState.ABORTING)) {
// If we're aborting, just invoke `next` (which is likely an onFailure handler)
next.run();
return;
}
// This means that the indexer was triggered to discover changes, found none, and exited early.
// If the state is `STOPPED` this means that DataFrameTransformTask#stop was called while we were checking for changes.
// Allow the stop call path to continue
if (hasSourceChanged == false && indexerState.equals(IndexerState.STOPPED) == false) {
next.run();
return;
}
DataFrameTransformTaskState taskState = transformTask.taskState.get();
if (indexerState.equals(IndexerState.STARTED)
&& transformTask.currentCheckpoint.get() == 1
&& this.isContinuous() == false) {
// set both to stopped so they are persisted as such
indexerState = IndexerState.STOPPED;
auditor.info(transformConfig.getId(), "Data frame finished indexing all data, initiating stop");
logger.info("[{}] data frame transform finished indexing all data, initiating stop.", transformConfig.getId());
}
// If we are `STOPPED` on a `doSaveState` call, that indicates we transitioned to `STOPPED` from `STOPPING`
// OR we called `doSaveState` manually as the indexer was not actively running.
// Since we save the state to an index, we should make sure that our task state is in parity with the indexer state
if (indexerState.equals(IndexerState.STOPPED)) {
// We don't want to adjust the stored taskState because as soon as it is `STOPPED` a user could call
// .start again.
taskState = DataFrameTransformTaskState.STOPPED;
}
final DataFrameTransformState state = new DataFrameTransformState(
taskState,
indexerState,
position,
transformTask.currentCheckpoint.get(),
transformTask.stateReason.get(),
getProgress());
logger.debug("[{}] updating persistent state of transform to [{}].", transformConfig.getId(), state.toString());
// This could be `null` but the putOrUpdateTransformStoredDoc handles that case just fine
SeqNoPrimaryTermAndIndex seqNoPrimaryTermAndIndex = transformTask.getSeqNoPrimaryTermAndIndex();
// Persist the current state and stats in the internal index. The interval of this method being
// called is controlled by AsyncTwoPhaseIndexer#onBulkResponse which calls doSaveState every so
// often when doing bulk indexing calls or at the end of one indexing run.
transformsConfigManager.putOrUpdateTransformStoredDoc(
new DataFrameTransformStoredDoc(transformId, state, getStats()),
seqNoPrimaryTermAndIndex,
ActionListener.wrap(
r -> {
transformTask.updateSeqNoPrimaryTermAndIndex(seqNoPrimaryTermAndIndex, r);
// for auto stop shutdown the task
if (state.getTaskState().equals(DataFrameTransformTaskState.STOPPED)) {
transformTask.shutdown();
}
// Only do this clean up once, if it succeeded, no reason to do the query again.
if (oldStatsCleanedUp.compareAndSet(false, true)) {
transformsConfigManager.deleteOldTransformStoredDocuments(transformId, ActionListener.wrap(
nil -> {
logger.trace("[{}] deleted old transform stats and state document", transformId);
next.run();
},
e -> {
String msg = LoggerMessageFormat.format("[{}] failed deleting old transform configurations.",
transformId);
logger.warn(msg, e);
// If we have failed, we should attempt the clean up again later
oldStatsCleanedUp.set(false);
next.run();
}
));
} else {
next.run();
}
},
statsExc -> {
logger.error(new ParameterizedMessage("[{}] updating stats of transform failed.",
transformConfig.getId()),
statsExc);
auditor.warning(getJobId(),
"Failure updating stats of transform: " + statsExc.getMessage());
// for auto stop shutdown the task
if (state.getTaskState().equals(DataFrameTransformTaskState.STOPPED)) {
transformTask.shutdown();
}
next.run();
}
));
}
@Override
protected void onFailure(Exception exc) {
// the failure handler must not throw an exception due to internal problems
try {
handleFailure(exc);
} catch (Exception e) {
logger.error(
new ParameterizedMessage("[{}] data frame transform encountered an unexpected internal exception: ", transformId),
e);
}
}
@Override
protected void onFinish(ActionListener<Void> listener) {
try {
// This indicates an early exit since no changes were found.
// So, don't treat this like a checkpoint being completed, as no work was done.
if (hasSourceChanged == false) {
listener.onResponse(null);
return;
}
// TODO: needs cleanup super is called with a listener, but listener.onResponse is called below
// super.onFinish() fortunately ignores the listener
super.onFinish(listener);
long checkpoint = transformTask.currentCheckpoint.getAndIncrement();
lastCheckpoint = getNextCheckpoint();
nextCheckpoint = null;
// Reset our failure count as we have finished and may start again with a new checkpoint
failureCount.set(0);
transformTask.stateReason.set(null);
// With bucket_selector we could have read all the buckets and completed the transform
// but not "see" all the buckets since they were filtered out. Consequently, progress would
// show less than 100% even though we are done.
// NOTE: this method is called in the same thread as the processing thread.
// Theoretically, there should not be a race condition with updating progress here.
// NOTE 2: getPercentComplete should only NOT be null on the first (batch) checkpoint
if (progress != null && progress.getPercentComplete() != null && progress.getPercentComplete() < 100.0) {
progress.incrementDocsProcessed(progress.getTotalDocs() - progress.getDocumentsProcessed());
}
// If the last checkpoint is now greater than 1, that means that we have just processed the first
// continuous checkpoint and should start recording the exponential averages
if (lastCheckpoint != null && lastCheckpoint.getCheckpoint() > 1) {
long docsIndexed = 0;
long docsProcessed = 0;
// This should not happen as we simply create a new one when we reach continuous checkpoints
// but this is a paranoid `null` check
if (progress != null) {
docsIndexed = progress.getDocumentsIndexed();
docsProcessed = progress.getDocumentsProcessed();
}
long durationMs = System.currentTimeMillis() - lastCheckpoint.getTimestamp();
getStats().incrementCheckpointExponentialAverages(durationMs < 0 ? 0 : durationMs, docsIndexed, docsProcessed);
}
if (shouldAuditOnFinish(checkpoint)) {
auditor.info(transformTask.getTransformId(),
"Finished indexing for data frame transform checkpoint [" + checkpoint + "].");
}
logger.debug(
"[{}] finished indexing for data frame transform checkpoint [{}].", getJobId(), checkpoint);
auditBulkFailures = true;
listener.onResponse(null);
} catch (Exception e) {
listener.onFailure(e);
}
}
/**
* Indicates if an audit message should be written when onFinish is called for the given checkpoint
* We audit the first checkpoint, and then every 10 checkpoints until completedCheckpoint == 99
* Then we audit every 100, until completedCheckpoint == 999
*
* Then we always audit every 1_000 checkpoints
*
* @param completedCheckpoint The checkpoint that was just completed
* @return {@code true} if an audit message should be written
*/
protected boolean shouldAuditOnFinish(long completedCheckpoint) {
if (++logCount % logEvery != 0) {
return false;
}
if (completedCheckpoint == 0) {
return true;
}
int log10Checkpoint = (int) Math.floor(Math.log10(completedCheckpoint));
logEvery = log10Checkpoint >= 3 ? 1_000 : (int)Math.pow(10.0, log10Checkpoint);
logCount = 0;
return true;
}
@Override
protected void onStop() {
auditor.info(transformConfig.getId(), "Data frame transform has stopped.");
logger.info("[{}] data frame transform has stopped.", transformConfig.getId());
}
@Override
protected void onAbort() {
auditor.info(transformConfig.getId(), "Received abort request, stopping data frame transform.");
logger.info("[{}] data frame transform received abort request. Stopping indexer.", transformConfig.getId());
transformTask.shutdown();
}
@Override
protected void createCheckpoint(ActionListener<DataFrameTransformCheckpoint> listener) {
checkpointProvider.createNextCheckpoint(getLastCheckpoint(), ActionListener.wrap(
checkpoint -> transformsConfigManager.putTransformCheckpoint(checkpoint,
ActionListener.wrap(
putCheckPointResponse -> listener.onResponse(checkpoint),
createCheckpointException -> {
logger.warn(new ParameterizedMessage("[{}] failed to create checkpoint.", transformId),
createCheckpointException);
listener.onFailure(
new RuntimeException("Failed to create checkpoint due to " + createCheckpointException.getMessage(),
createCheckpointException));
}
)),
getCheckPointException -> {
logger.warn(new ParameterizedMessage("[{}] failed to retrieve checkpoint.", transformId),
getCheckPointException);
listener.onFailure(
new RuntimeException("Failed to retrieve checkpoint due to " + getCheckPointException.getMessage(),
getCheckPointException));
}
));
}
@Override
protected void sourceHasChanged(ActionListener<Boolean> hasChangedListener) {
checkpointProvider.sourceHasChanged(getLastCheckpoint(),
ActionListener.wrap(
hasChanged -> {
logger.trace("[{}] change detected [{}].", transformId, hasChanged);
hasChangedListener.onResponse(hasChanged);
},
e -> {
logger.warn(
new ParameterizedMessage(
"[{}] failed to detect changes for data frame transform. Skipping update till next check.",
transformId),
e);
auditor.warning(transformId,
"Failed to detect changes for data frame transform, skipping update till next check. Exception: "
+ e.getMessage());
hasChangedListener.onResponse(false);
}));
}
private boolean isIrrecoverableFailure(Exception e) {
return e instanceof IndexNotFoundException
|| e instanceof AggregationResultUtils.AggregationExtractionException
|| e instanceof TransformConfigReloadingException;
}
synchronized void handleFailure(Exception e) {
logger.warn(new ParameterizedMessage("[{}] data frame transform encountered an exception: ",
transformTask.getTransformId()),
e);
if (handleCircuitBreakingException(e)) {
return;
}
if (isIrrecoverableFailure(e) || failureCount.incrementAndGet() > transformTask.getNumFailureRetries()) {
String failureMessage = isIrrecoverableFailure(e) ?
"task encountered irrecoverable failure: " + e.getMessage() :
"task encountered more than " + transformTask.getNumFailureRetries() + " failures; latest failure: " + e.getMessage();
failIndexer(failureMessage);
} else {
// Since our schedule fires again very quickly after failures it is possible to run into the same failure numerous
// times in a row, very quickly. We do not want to spam the audit log with repeated failures, so only record the first one
if (e.getMessage().equals(lastAuditedExceptionMessage) == false) {
auditor.warning(transformTask.getTransformId(),
"Data frame transform encountered an exception: " + e.getMessage() +
" Will attempt again at next scheduled trigger.");
lastAuditedExceptionMessage = e.getMessage();
}
}
}
@Override
protected void failIndexer(String failureMessage) {
logger.error("[{}] transform has failed; experienced: [{}].", getJobId(), failureMessage);
auditor.error(transformTask.getTransformId(), failureMessage);
transformTask.markAsFailed(failureMessage, ActionListener.wrap(
r -> {
// Successfully marked as failed, reset counter so that task can be restarted
failureCount.set(0);
}, e -> {}));
}
DataFrameTransformTaskState getTaskState() {
return taskState.get();
}
void setStateReason(String reason) {
stateReason.set(reason);
}
String getStateReason() {
return stateReason.get();
}
// Considered a recoverable indexing failure
private static class BulkIndexingException extends ElasticsearchException {
BulkIndexingException(String msg, Object... args) {
super(msg, args);
}
private static class TransformConfigReloadingException extends ElasticsearchException {
TransformConfigReloadingException(String msg, Throwable cause, Object... args) {
super(msg, cause, args);
}
}

@@ -46,7 +46,7 @@ public class ClientDataFrameIndexerTests extends ESTestCase {
mock(DataFrameAuditor.class),
threadPool,
Collections.emptyMap());
DataFrameTransformTask.ClientDataFrameIndexer indexer = new DataFrameTransformTask.ClientDataFrameIndexer(randomAlphaOfLength(10),
ClientDataFrameIndexer indexer = new ClientDataFrameIndexer(
mock(DataFrameTransformsConfigManager.class),
mock(CheckpointProvider.class),
new AtomicReference<>(IndexerState.STOPPED),