Add ServiceStatusMonitor to monitor service health (#14443)

* Add OverlordStatusMonitor and CoordinatorStatusMonitor to monitor service leader status

* make the monitor more general

* resolve conflict

* use Supplier pattern to provide metrics

* reformat code and doc

* move service specific tag to dimension

* minor refine

* update doc

* reformat code

* address comments

* remove declared exception

* bind HeartbeatSupplier conditionally in Coordinator
This commit is contained in:
YongGang 2023-06-26 10:26:37 -07:00 committed by GitHub
parent 114380749d
commit b7434be99e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 191 additions and 0 deletions

View File

@ -399,6 +399,7 @@ Metric monitoring is an essential part of Druid operations. The following monit
|`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.|
|`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.|
|`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by middleManager node types.|
|`org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports a heartbeat for the service.|
For example, you might configure monitors on all processes for system and JVM information within `common.runtime.properties` as follows:

View File

@ -329,6 +329,12 @@ If `emitBalancingStats` is set to `true` in the Coordinator [dynamic configurati
## General Health
### Service Health
|Metric|Description|Dimensions|Normal Value|
|------|-----------|----------|------------|
| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. |`leader` on the Overlord and Coordinator.|1|
### Historical
|Metric|Description|Dimensions|Normal Value|

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.server.metrics;
import com.google.common.base.Supplier;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
import org.apache.druid.java.util.metrics.AbstractMonitor;
import java.util.Map;
/**
* Reports a heartbeat for the service.
*/
public class ServiceStatusMonitor extends AbstractMonitor
{
@Named("heartbeat")
@Inject(optional = true)
Supplier<Map<String, Object>> heartbeatTagsSupplier = null;
@Override
public boolean doMonitor(ServiceEmitter emitter)
{
final ServiceMetricEvent.Builder builder = new ServiceMetricEvent.Builder();
if (heartbeatTagsSupplier != null && heartbeatTagsSupplier.get() != null) {
heartbeatTagsSupplier.get().forEach((k, v) -> {
builder.setDimension(k, v);
});
}
emitter.emit(builder.build("service/heartbeat", 1));
return true;
}
}

View File

@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.server.metrics;
import com.google.common.base.Supplier;
import org.apache.druid.java.util.metrics.StubServiceEmitter;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
public class ServiceStatusMonitorTest
{
private ServiceStatusMonitor monitor;
private Map<String, Object> heartbeatTags;
private Supplier<Map<String, Object>> heartbeatTagsSupplier = () -> heartbeatTags;
private static String HEARTBEAT_METRIC_KEY = "service/heartbeat";
@Before
public void setUp()
{
monitor = new ServiceStatusMonitor();
heartbeatTags = new HashMap<>();
monitor.heartbeatTagsSupplier = heartbeatTagsSupplier;
}
@Test
public void testDefaultHeartbeatReported()
{
final StubServiceEmitter emitter = new StubServiceEmitter("service", "host");
monitor.doMonitor(emitter);
Assert.assertEquals(1, emitter.getEvents().size());
Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric"));
Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value"));
}
@Test
public void testLeaderTag()
{
heartbeatTags.put("leader", 1);
final StubServiceEmitter emitter = new StubServiceEmitter("service", "host");
monitor.doMonitor(emitter);
Assert.assertEquals(1, emitter.getEvents().size());
Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader"));
Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric"));
Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value"));
}
@Test
public void testMoreThanOneTag()
{
heartbeatTags.put("leader", 1);
heartbeatTags.put("taskRunner", "http");
final StubServiceEmitter emitter = new StubServiceEmitter("service", "host");
monitor.doMonitor(emitter);
Assert.assertEquals(1, emitter.getEvents().size());
Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("leader"));
Assert.assertEquals("http", emitter.getEvents().get(0).toMap().get("taskRunner"));
Assert.assertEquals(HEARTBEAT_METRIC_KEY, emitter.getEvents().get(0).toMap().get("metric"));
Assert.assertEquals(1, emitter.getEvents().get(0).toMap().get("value"));
}
}

View File

@ -24,6 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.rvesse.airline.annotations.Command;
import com.google.common.base.Predicates;
import com.google.common.base.Strings;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Binder;
import com.google.inject.Inject;
@ -31,6 +32,7 @@ import com.google.inject.Key;
import com.google.inject.Module;
import com.google.inject.Provider;
import com.google.inject.Provides;
import com.google.inject.TypeLiteral;
import com.google.inject.name.Names;
import com.google.inject.util.Providers;
import org.apache.curator.framework.CuratorFramework;
@ -119,8 +121,10 @@ import org.eclipse.jetty.server.Server;
import org.joda.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ExecutorService;
@ -330,6 +334,10 @@ public class CliCoordinator extends ServerRunnable
binder.bind(TaskStorage.class).toProvider(Providers.of(null));
binder.bind(TaskMaster.class).toProvider(Providers.of(null));
binder.bind(RowIngestionMetersFactory.class).toProvider(Providers.of(null));
// Bind HeartbeatSupplier only when the service operates independently of Overlord.
binder.bind(new TypeLiteral<Supplier<Map<String, Object>>>() {})
.annotatedWith(Names.named("heartbeat"))
.toProvider(HeartbeatSupplier.class);
}
binder.bind(CoordinatorCustomDutyGroups.class)
@ -459,4 +467,26 @@ public class CliCoordinator extends ServerRunnable
}
}
}
private static class HeartbeatSupplier implements Provider<Supplier<Map<String, Object>>>
{
private final DruidCoordinator coordinator;
@Inject
public HeartbeatSupplier(DruidCoordinator coordinator)
{
this.coordinator = coordinator;
}
@Override
public Supplier<Map<String, Object>> get()
{
return () -> {
Map<String, Object> heartbeatTags = new HashMap<>();
heartbeatTags.put("leader", coordinator.isLeader() ? 1 : 0);
return heartbeatTags;
};
}
}
}

View File

@ -21,6 +21,7 @@ package org.apache.druid.cli;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.rvesse.airline.annotations.Command;
import com.google.common.base.Supplier;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Binder;
@ -32,6 +33,7 @@ import com.google.inject.Provides;
import com.google.inject.TypeLiteral;
import com.google.inject.multibindings.MapBinder;
import com.google.inject.multibindings.Multibinder;
import com.google.inject.name.Named;
import com.google.inject.name.Names;
import com.google.inject.servlet.GuiceFilter;
import com.google.inject.util.Providers;
@ -134,7 +136,9 @@ import org.eclipse.jetty.servlet.FilterHolder;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@ -354,6 +358,19 @@ public class CliOverlord extends ServerRunnable
{
return TaskStorageDirTracker.fromConfigs(workerConfig, taskConfig);
}
@Provides
@LazySingleton
@Named("heartbeat")
public Supplier<Map<String, Object>> getHeartbeatSupplier(TaskMaster taskMaster)
{
return () -> {
Map<String, Object> heartbeatTags = new HashMap<>();
heartbeatTags.put("leader", taskMaster.isLeader() ? 1 : 0);
return heartbeatTags;
};
}
};
}