From b7434be99e803cba22360a67ebe63948cb0def24 Mon Sep 17 00:00:00 2001 From: YongGang Date: Mon, 26 Jun 2023 10:26:37 -0700 Subject: [PATCH] Add ServiceStatusMonitor to monitor service health (#14443) * Add OverlordStatusMonitor and CoordinatorStatusMonitor to monitor service leader status * make the monitor more general * resolve conflict * use Supplier pattern to provide metrics * reformat code and doc * move service specific tag to dimension * minor refine * update doc * reformat code * address comments * remove declared exception * bind HeartbeatSupplier conditionally in Coordinator --- docs/configuration/index.md | 1 + docs/operations/metrics.md | 6 ++ .../server/metrics/ServiceStatusMonitor.java | 55 +++++++++++++ .../metrics/ServiceStatusMonitorTest.java | 82 +++++++++++++++++++ .../org/apache/druid/cli/CliCoordinator.java | 30 +++++++ .../org/apache/druid/cli/CliOverlord.java | 17 ++++ 6 files changed, 191 insertions(+) create mode 100644 server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java create mode 100644 server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java diff --git a/docs/configuration/index.md b/docs/configuration/index.md index 8c8ba45b106..355a1e8c892 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -399,6 +399,7 @@ Metric monitoring is an essential part of Druid operations. The following monit |`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.| |`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.| |`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types.| +|`org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports a heartbeat for the service.| For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows: diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md index f85cbec77df..5bc292dfd43 100644 --- a/docs/operations/metrics.md +++ b/docs/operations/metrics.md @@ -329,6 +329,12 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati ## General Health +### Service Health + +|Metric|Description|Dimensions|Normal Value| +|------|-----------|----------|------------| +| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. |`leader` on the Overlord and Coordinator.|1| + ### Historical |Metric|Description|Dimensions|Normal Value| diff --git a/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java new file mode 100644 index 00000000000..d56bf76ec43 --- /dev/null +++ b/server/src/main/java/org/apache/druid/server/metrics/ServiceStatusMonitor.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.metrics; + +import com.google.common.base.Supplier; +import com.google.inject.Inject; +import com.google.inject.name.Named; +import org.apache.druid.java.util.emitter.service.ServiceEmitter; +import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; +import org.apache.druid.java.util.metrics.AbstractMonitor; + +import java.util.Map; + +/** + * Reports a heartbeat for the service. + */ +public class ServiceStatusMonitor extends AbstractMonitor +{ + + @Named("heartbeat") + @Inject(optional = true) + Supplier> heartbeatTagsSupplier = null; + + @Override + public boolean doMonitor(ServiceEmitter emitter) + { + final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder(); + if (heartbeatTagsSupplier != null && heartbeatTagsSupplier.get() != null) { + heartbeatTagsSupplier.get().forEach((k, v) -> { + builder.setDimension(k, v); + }); + } + + emitter.emit(builder.build("service/heartbeat", 1)); + return true; + } +} + diff --git a/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java new file mode 100644 index 00000000000..88acb6dca26 --- /dev/null +++ b/server/src/test/java/org/apache/druid/server/metrics/ServiceStatusMonitorTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.server.metrics; + +import com.google.common.base.Supplier; +import org.apache.druid.java.util.metrics.StubServiceEmitter; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +public class ServiceStatusMonitorTest +{ + + private ServiceStatusMonitor monitor; + private Map heartbeatTags; + private Supplier> heartbeatTagsSupplier = () -> heartbeatTags; + private static String HEARTBEAT_METRIC_KEY = "service/heartbeat"; + + @Before + public void setUp() + { + monitor = new ServiceStatusMonitor(); + heartbeatTags = new HashMap<>(); + monitor.heartbeatTagsSupplier = heartbeatTagsSupplier; + } + + @Test + public void testDefaultHeartbeatReported() + { + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } + + @Test + public void testLeaderTag() + { + heartbeatTags.put("leader", 1); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader")); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } + + @Test + public void testMoreThanOneTag() + { + heartbeatTags.put("leader", 1); + heartbeatTags.put("taskRunner", "http"); + final StubServiceEmitter emitter = new StubServiceEmitter("service", "host"); + monitor.doMonitor(emitter); + Assert.assertEquals(1, emitter.getEvents().size()); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader")); + Assert.assertEquals("http", emitter.getEvents().get(0).toMap().get("taskRunner")); + Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric")); + Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value")); + } +} diff --git a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java index 2810359f490..73adc79a4f3 100644 --- a/services/src/main/java/org/apache/druid/cli/CliCoordinator.java +++ b/services/src/main/java/org/apache/druid/cli/CliCoordinator.java @@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.github.rvesse.airline.annotations.Command; import com.google.common.base.Predicates; import com.google.common.base.Strings; +import com.google.common.base.Supplier; import com.google.common.collect.ImmutableSet; import com.google.inject.Binder; import com.google.inject.Inject; @@ -31,6 +32,7 @@ import com.google.inject.Key; import com.google.inject.Module; import com.google.inject.Provider; import com.google.inject.Provides; +import com.google.inject.TypeLiteral; import com.google.inject.name.Names; import com.google.inject.util.Providers; import org.apache.curator.framework.CuratorFramework; @@ -119,8 +121,10 @@ import org.eclipse.jetty.server.Server; import org.joda.time.Duration; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -330,6 +334,10 @@ public class CliCoordinator extends ServerRunnable binder.bind(TaskStorage.class).toProvider(Providers.of(null)); binder.bind(TaskMaster.class).toProvider(Providers.of(null)); binder.bind(RowIngestionMetersFactory.class).toProvider(Providers.of(null)); + // Bind HeartbeatSupplier only when the service operates independently of Overlord. + binder.bind(new TypeLiteral>>() {}) + .annotatedWith(Names.named("heartbeat")) + .toProvider(HeartbeatSupplier.class); } binder.bind(CoordinatorCustomDutyGroups.class) @@ -459,4 +467,26 @@ public class CliCoordinator extends ServerRunnable } } } + + private static class HeartbeatSupplier implements Provider>> + { + private final DruidCoordinator coordinator; + + @Inject + public HeartbeatSupplier(DruidCoordinator coordinator) + { + this.coordinator = coordinator; + } + + @Override + public Supplier> get() + { + return () -> { + Map heartbeatTags = new HashMap<>(); + heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0); + + return heartbeatTags; + }; + } + } } diff --git a/services/src/main/java/org/apache/druid/cli/CliOverlord.java b/services/src/main/java/org/apache/druid/cli/CliOverlord.java index b67d6592e2e..4c365c63a90 100644 --- a/services/src/main/java/org/apache/druid/cli/CliOverlord.java +++ b/services/src/main/java/org/apache/druid/cli/CliOverlord.java @@ -21,6 +21,7 @@ package org.apache.druid.cli; import com.fasterxml.jackson.databind.ObjectMapper; import com.github.rvesse.airline.annotations.Command; +import com.google.common.base.Supplier; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.inject.Binder; @@ -32,6 +33,7 @@ import com.google.inject.Provides; import com.google.inject.TypeLiteral; import com.google.inject.multibindings.MapBinder; import com.google.inject.multibindings.Multibinder; +import com.google.inject.name.Named; import com.google.inject.name.Names; import com.google.inject.servlet.GuiceFilter; import com.google.inject.util.Providers; @@ -134,7 +136,9 @@ import org.eclipse.jetty.servlet.FilterHolder; import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletHolder; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.Set; @@ -354,6 +358,19 @@ public class CliOverlord extends ServerRunnable { return TaskStorageDirTracker.fromConfigs(workerConfig, taskConfig); } + + @Provides + @LazySingleton + @Named("heartbeat") + public Supplier> getHeartbeatSupplier(TaskMaster taskMaster) + { + return () -> { + Map heartbeatTags = new HashMap<>(); + heartbeatTags.put("leader", taskMaster.isLeader() ? 1 : 0); + + return heartbeatTags; + }; + } }; }