mirror of https://github.com/apache/druid.git
Add indexer task success and failure metrics (#16829)
This PR adds indexer-level task metrics, emitted as `worker/task/failed/count` and `worker/task/success/count`. The existing `worker/task/completed/count` metric counts all completed tasks irrespective of success or failure status, so these new metrics give more visibility into the status of the completed tasks.
This commit is contained in:
parent
c84e689eb8
commit
c8323d1a7c
|
@ -299,6 +299,8 @@ If the JVM does not support CPU time measurement for the current thread, `ingest
|
|||
|`worker/taskSlot/used/count`|Number of busy task slots on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.| `category`, `workerVersion`|Varies|
|
||||
|`worker/task/assigned/count`|Number of tasks assigned to an indexer per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|
||||
|`worker/task/completed/count`|Number of tasks completed by an indexer per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|
||||
|`worker/task/failed/count`|Number of tasks that failed on an indexer during the emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|
||||
|`worker/task/success/count`|Number of tasks that succeeded on an indexer during the emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|
||||
|`worker/task/running/count`|Number of tasks running on an indexer per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|
||||
|
||||
## Shuffle metrics (Native parallel task)
|
||||
|
|
|
@ -74,8 +74,8 @@
|
|||
"worker/task/assigned/count" : { "dimensions" : ["dataSource"], "type" : "count" },
|
||||
"worker/task/running/count" : { "dimensions" : ["dataSource"], "type" : "count" },
|
||||
"worker/task/completed/count" : { "dimensions" : ["dataSource"], "type" : "count" },
|
||||
"worker/task/failed/count" : { "dimensions" : ["category", "workerVersion"], "type" : "count" },
|
||||
"worker/task/success/count" : { "dimensions" : ["category", "workerVersion"], "type" : "count" },
|
||||
"worker/task/failed/count" : { "dimensions" : ["category", "workerVersion", "dataSource"], "type" : "count" },
|
||||
"worker/task/success/count" : { "dimensions" : ["category", "workerVersion", "dataSource"], "type" : "count" },
|
||||
"worker/taskSlot/idle/count" : { "dimensions" : ["category", "workerVersion"], "type" : "gauge" },
|
||||
"worker/taskSlot/total/count" : { "dimensions" : ["category", "workerVersion"], "type" : "gauge" },
|
||||
"worker/taskSlot/used/count" : { "dimensions" : ["category", "workerVersion"], "type" : "gauge" },
|
||||
|
|
|
@ -640,6 +640,22 @@ public class WorkerTaskManager implements IndexerTaskCountStatsProvider
|
|||
return getNumTasksPerDatasource(this.getCompletedTasks().values(), TaskAnnouncement::getTaskDataSource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Long> getWorkerFailedTasks()
|
||||
{
|
||||
return getNumTasksPerDatasource(completedTasks.entrySet().stream()
|
||||
.filter(entry -> entry.getValue().getTaskStatus().isFailure())
|
||||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)).values(), TaskAnnouncement::getTaskDataSource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Long> getWorkerSuccessfulTasks()
|
||||
{
|
||||
return getNumTasksPerDatasource(completedTasks.entrySet().stream()
|
||||
.filter(entry -> entry.getValue().getTaskStatus().isSuccess())
|
||||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)).values(), TaskAnnouncement::getTaskDataSource);
|
||||
}
|
||||
|
||||
private static class TaskDetails
|
||||
{
|
||||
private final Task task;
|
||||
|
|
|
@ -455,6 +455,19 @@ public class WorkerTaskManagerTest
|
|||
return new NoopTask(id, null, dataSource, 100, 0, ImmutableMap.of(Tasks.PRIORITY_KEY, 0));
|
||||
}
|
||||
|
||||
private NoopTask createNoopFailingTask(String id, String dataSource)
|
||||
{
|
||||
return new NoopTask(id, null, dataSource, 100, 0, ImmutableMap.of(Tasks.PRIORITY_KEY, 0))
|
||||
{
|
||||
@Override
|
||||
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
|
||||
{
|
||||
Thread.sleep(getRunTime());
|
||||
return TaskStatus.failure(getId(), "Failed to complete the task");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the {@link #workerTaskManager}, submit a {@link NoopTask}, wait for it to be complete. Common preamble
|
||||
* for various tests of {@link WorkerTaskManager#doCompletedTasksCleanup()}.
|
||||
|
@ -494,7 +507,7 @@ public class WorkerTaskManagerTest
|
|||
|
||||
Task task1 = createNoopTask("task1", "wikipedia");
|
||||
Task task2 = createNoopTask("task2", "wikipedia");
|
||||
Task task3 = createNoopTask("task3", "animals");
|
||||
Task task3 = createNoopFailingTask("task3", "animals");
|
||||
|
||||
workerTaskManager.start();
|
||||
// before assigning tasks we should get no running tasks
|
||||
|
@ -517,11 +530,19 @@ public class WorkerTaskManagerTest
|
|||
Thread.sleep(10);
|
||||
} while (!runningTasks.isEmpty());
|
||||
|
||||
// When running tasks are empty all task should be reported as completed
|
||||
// When no running tasks are left, all tasks should be reported as completed:
|
||||
// the one task for the animals datasource should have failed, and the two
|
||||
// tasks for the wikipedia datasource should have succeeded
|
||||
Assert.assertEquals(workerTaskManager.getWorkerCompletedTasks(), ImmutableMap.of(
|
||||
"wikipedia", 2L,
|
||||
"animals", 1L
|
||||
));
|
||||
Assert.assertEquals(workerTaskManager.getWorkerFailedTasks(), ImmutableMap.of(
|
||||
"animals", 1L
|
||||
));
|
||||
Assert.assertEquals(workerTaskManager.getWorkerSuccessfulTasks(), ImmutableMap.of(
|
||||
"wikipedia", 2L
|
||||
));
|
||||
Assert.assertEquals(workerTaskManager.getWorkerAssignedTasks().size(), 0L);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -41,4 +41,8 @@ public interface IndexerTaskCountStatsProvider
|
|||
* Map from datasource name to the number of completed tasks by the Indexer.
|
||||
*/
|
||||
Map<String, Long> getWorkerCompletedTasks();
|
||||
|
||||
/**
 * Map from datasource name to the number of completed tasks that failed on the Indexer.
 */
Map<String, Long> getWorkerFailedTasks();
|
||||
|
||||
/**
 * Map from datasource name to the number of completed tasks that succeeded on the Indexer.
 */
Map<String, Long> getWorkerSuccessfulTasks();
|
||||
}
|
||||
|
|
|
@ -72,6 +72,8 @@ public class WorkerTaskCountStatsMonitor extends AbstractMonitor
|
|||
emit(emitter, "worker/task/running/count", indexerStatsProvider.getWorkerRunningTasks());
|
||||
emit(emitter, "worker/task/assigned/count", indexerStatsProvider.getWorkerAssignedTasks());
|
||||
emit(emitter, "worker/task/completed/count", indexerStatsProvider.getWorkerCompletedTasks());
|
||||
emit(emitter, "worker/task/failed/count", indexerStatsProvider.getWorkerFailedTasks());
|
||||
emit(emitter, "worker/task/success/count", indexerStatsProvider.getWorkerSuccessfulTasks());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -120,6 +120,24 @@ public class WorkerTaskCountStatsMonitorTest
|
|||
"metrics", 9L
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Long> getWorkerFailedTasks()
|
||||
{
|
||||
return ImmutableMap.of(
|
||||
"movies", 4L,
|
||||
"games", 6L
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Long> getWorkerSuccessfulTasks()
|
||||
{
|
||||
return ImmutableMap.of(
|
||||
"games", 23L,
|
||||
"inventory", 89L
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
nullStatsProvider = new WorkerTaskCountStatsProvider()
|
||||
|
@ -239,7 +257,7 @@ public class WorkerTaskCountStatsMonitorTest
|
|||
new WorkerTaskCountStatsMonitor(injectorForIndexer, ImmutableSet.of(NodeRole.INDEXER));
|
||||
final StubServiceEmitter emitter = new StubServiceEmitter("service", "host");
|
||||
monitor.doMonitor(emitter);
|
||||
Assert.assertEquals(6, emitter.getEvents().size());
|
||||
Assert.assertEquals(10, emitter.getEvents().size());
|
||||
emitter.verifyValue(
|
||||
"worker/task/running/count",
|
||||
ImmutableMap.of("dataSource", "wikipedia"),
|
||||
|
@ -270,6 +288,26 @@ public class WorkerTaskCountStatsMonitorTest
|
|||
ImmutableMap.of("dataSource", "metrics"),
|
||||
9L
|
||||
);
|
||||
emitter.verifyValue(
|
||||
"worker/task/failed/count",
|
||||
ImmutableMap.of("dataSource", "movies"),
|
||||
4L
|
||||
);
|
||||
emitter.verifyValue(
|
||||
"worker/task/failed/count",
|
||||
ImmutableMap.of("dataSource", "games"),
|
||||
6L
|
||||
);
|
||||
emitter.verifyValue(
|
||||
"worker/task/success/count",
|
||||
ImmutableMap.of("dataSource", "games"),
|
||||
23L
|
||||
);
|
||||
emitter.verifyValue(
|
||||
"worker/task/success/count",
|
||||
ImmutableMap.of("dataSource", "inventory"),
|
||||
89L
|
||||
);
|
||||
}
|
||||
@Test
|
||||
public void testMonitorWithNulls()
|
||||
|
|
Loading…
Reference in New Issue