From 062d72b67eccbb754fe74c7328cd06e0026fe5bf Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 27 Mar 2023 11:07:19 -0700 Subject: [PATCH] Add timeout to TaskStartTimeoutFault. (#13970) * Add timeout to TaskStartTimeoutFault. Makes the error message a bit more useful. * Update docs. --- docs/multi-stage-query/reference.md | 2 +- .../msq/indexing/MSQWorkerTaskLauncher.java | 2 +- .../indexing/error/TaskStartTimeoutFault.java | 38 +++++++++++++++---- .../apache/druid/msq/exec/MSQTasksTest.java | 2 +- .../msq/indexing/error/MSQFaultSerdeTest.java | 2 +- 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/docs/multi-stage-query/reference.md b/docs/multi-stage-query/reference.md index 71fc1b43afe..b1a25b80e42 100644 --- a/docs/multi-stage-query/reference.md +++ b/docs/multi-stage-query/reference.md @@ -751,7 +751,7 @@ The following table describes error codes you may encounter in the `multiStageQu | `InvalidNullByte` | A string column included a null byte. Null bytes in strings are not permitted. | `column`: The column that included the null byte | | `QueryNotSupported` | QueryKit could not translate the provided native query to a multi-stage query.

This can happen if the query uses features that aren't supported, like GROUPING SETS. | | | `RowTooLarge` | The query tried to process a row that was too large to write to a single frame. See the [Limits](#limits) table for specific limits on frame size. Note that the effective maximum row size is smaller than the maximum frame size due to alignment considerations during frame writing. | `maxFrameSize`: The limit on the frame size. | -| `TaskStartTimeout` | Unable to launch all the worker tasks in time.

There might be insufficient available slots to start all the worker tasks simultaneously.

Try splitting up the query into smaller chunks with lesser `maxNumTasks` number. Another option is to increase capacity. | `numTasks`: The number of tasks attempted to launch. | +| `TaskStartTimeout` | Unable to launch `numTasks` tasks within `timeout` milliseconds.

There may be insufficient available slots to start all the worker tasks simultaneously. Try splitting up your query into smaller chunks using a smaller value of [`maxNumTasks`](#context-parameters). Another option is to increase capacity. | `numTasks`: The number of tasks attempted to launch.

`timeout`: Timeout, in milliseconds, that was exceeded. | | `TooManyAttemptsForJob` | Total relaunch attempt count across all workers exceeded max relaunch attempt limit. See the [Limits](#limits) table for the specific limit. | `maxRelaunchCount`: Max number of relaunches across all the workers defined in the [Limits](#limits) section.

`currentRelaunchCount`: current relaunch counter for the job across all workers.

`taskId`: Latest task id which failed

`rootErrorMessage`: Error message of the latest failed task.| | `TooManyAttemptsForWorker` | Worker exceeded maximum relaunch attempt count as defined in the [Limits](#limits) section. |`maxPerWorkerRelaunchCount`: Max number of relaunches allowed per worker as defined in the [Limits](#limits) section.

`workerNumber`: the worker number for which the task failed

`taskId`: Latest task id which failed

`rootErrorMessage`: Error message of the latest failed task.| | `TooManyBuckets` | Exceeded the maximum number of partition buckets for a stage (5,000 partition buckets).
< br />Partition buckets are created for each [`PARTITIONED BY`](#partitioned-by) time chunk for INSERT and REPLACE queries. The most common reason for this error is that your `PARTITIONED BY` is too narrow relative to your data. | `maxBuckets`: The limit on partition buckets. | diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncher.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncher.java index b6d1665015d..d9870daf396 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncher.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncher.java @@ -494,7 +494,7 @@ public class MSQWorkerTaskLauncher } else if (tracker.didRunTimeOut(maxTaskStartDelayMillis) && !canceledWorkerTasks.contains(taskId)) { removeWorkerFromFullyStartedWorkers(tracker); - throw new MSQException(new TaskStartTimeoutFault(numTasks + 1)); + throw new MSQException(new TaskStartTimeoutFault(numTasks + 1, maxTaskStartDelayMillis)); } else if (tracker.didFail() && !canceledWorkerTasks.contains(taskId)) { removeWorkerFromFullyStartedWorkers(tracker); log.info("Task[%s] failed because %s. Trying to relaunch the worker", taskId, tracker.status.getErrorMsg()); diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TaskStartTimeoutFault.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TaskStartTimeoutFault.java index dceb730393d..43c5a802e56 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TaskStartTimeoutFault.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/indexing/error/TaskStartTimeoutFault.java @@ -25,6 +25,7 @@ import com.fasterxml.jackson.annotation.JsonTypeName; import org.apache.druid.msq.util.MultiStageQueryContext; import java.util.Objects; +import java.util.concurrent.TimeUnit; @JsonTypeName(TaskStartTimeoutFault.CODE) public class TaskStartTimeoutFault extends BaseMSQFault @@ -32,18 +33,26 @@ public class TaskStartTimeoutFault extends BaseMSQFault static final String CODE = "TaskStartTimeout"; private final int numTasks; + private final long timeout; @JsonCreator - public TaskStartTimeoutFault(@JsonProperty("numTasks") int numTasks) + public TaskStartTimeoutFault( + @JsonProperty("numTasks") int numTasks, + @JsonProperty("timeout") long timeout + ) { super( CODE, - "Unable to launch all the worker tasks in time. There might be insufficient available slots to start all the worker tasks simultaneously." - + " Try lowering '%s' in your query context to lower than [%d] tasks, or increasing capacity.", - MultiStageQueryContext.CTX_MAX_NUM_TASKS, - numTasks + "Unable to launch [%d] worker tasks within [%,d] seconds. " + + "There might be insufficient available slots to start all worker tasks simultaneously. " + + "Try lowering '%s' in your query context to a number that fits within your available task capacity, " + + "or try increasing capacity.", + numTasks, + TimeUnit.MILLISECONDS.toSeconds(timeout), + MultiStageQueryContext.CTX_MAX_NUM_TASKS ); this.numTasks = numTasks; + this.timeout = timeout; } @JsonProperty @@ -52,6 +61,12 @@ public class TaskStartTimeoutFault extends BaseMSQFault return numTasks; } + @JsonProperty + public long getTimeout() + { + return timeout; + } + @Override public boolean equals(Object o) { @@ -65,12 +80,21 @@ public class TaskStartTimeoutFault extends BaseMSQFault return false; } TaskStartTimeoutFault that = (TaskStartTimeoutFault) o; - return numTasks == that.numTasks; + return numTasks == that.numTasks && timeout == that.timeout; } @Override public int hashCode() { - return Objects.hash(super.hashCode(), numTasks); + return Objects.hash(super.hashCode(), numTasks, timeout); + } + + @Override + public String toString() + { + return "TaskStartTimeoutFault{" + + "numTasks=" + numTasks + + ", timeout=" + timeout + + '}'; } } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQTasksTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQTasksTest.java index bb1d1b1dbc5..8371705dfde 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQTasksTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/exec/MSQTasksTest.java @@ -183,7 +183,7 @@ public class MSQTasksTest } catch (Exception e) { Assert.assertEquals( - MSQFaultUtils.generateMessageWithErrorCode(new TaskStartTimeoutFault(numTasks + 1)), + MSQFaultUtils.generateMessageWithErrorCode(new TaskStartTimeoutFault(numTasks + 1, 5000)), MSQFaultUtils.generateMessageWithErrorCode(((MSQException) e.getCause()).getFault()) ); } diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java index 435c5de0658..1ad4d08b424 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/error/MSQFaultSerdeTest.java @@ -64,7 +64,7 @@ public class MSQFaultSerdeTest assertFaultSerde(new NotEnoughMemoryFault(1000, 1000, 900, 1, 2)); assertFaultSerde(QueryNotSupportedFault.INSTANCE); assertFaultSerde(new RowTooLargeFault(1000)); - assertFaultSerde(new TaskStartTimeoutFault(10)); + assertFaultSerde(new TaskStartTimeoutFault(10, 11)); assertFaultSerde(new TooManyBucketsFault(10)); assertFaultSerde(new TooManyColumnsFault(10, 8)); assertFaultSerde(new TooManyClusteredByColumnsFault(10, 8, 1));