From 417940cdd76ded0a41906ce794a79f4f09136314 Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Thu, 7 Jun 2018 21:23:55 +0200 Subject: [PATCH] SOLR-12438: Improve status reporting of metrics history API. --- solr/CHANGES.txt | 2 +- .../org/apache/solr/core/CoreContainer.java | 74 +++- .../handler/admin/MetricsHistoryHandler.java | 298 +++++++++----- .../solr/metrics/rrd/SolrRrdBackend.java | 30 +- .../metrics/rrd/SolrRrdBackendFactory.java | 76 ++-- .../cloud/MetricsHistoryIntegrationTest.java | 9 +- .../autoscaling/sim/SimCloudManager.java | 1 + .../admin/MetricsHistoryHandlerTest.java | 9 +- .../rrd/SolrRrdBackendFactoryTest.java | 16 +- solr/solr-ref-guide/src/metrics-history.adoc | 382 +++++++++++------- 10 files changed, 581 insertions(+), 316 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d81b3ce8f44..38382130c5e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -162,7 +162,7 @@ New Features * SOLR-12401: Add getValue() and setValue() Stream Evaluators (Joel Bernstein, janhoy) -* SOLR-11779: Basic long-term collection of aggregated metrics. Historical data is +* SOLR-11779, SOLR-12438: Basic long-term collection of aggregated metrics. Historical data is maintained as multi-resolution time series using round-robin databases in the '.system' collection. New /admin/metrics/history API allows retrieval of this data in numeric or graph formats. (ab) diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index d546dd29b9c..37a660fdc41 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -26,6 +26,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Date; +import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; @@ -43,6 +44,9 @@ import org.apache.http.client.CredentialsProvider; import org.apache.http.config.Lookup; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.store.Directory; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.cloud.SolrCloudManager; +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.HttpClientUtil; import org.apache.solr.client.solrj.impl.SolrHttpClientBuilder; @@ -59,6 +63,7 @@ import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica.State; +import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.IOUtils; import org.apache.solr.common.util.Utils; @@ -570,21 +575,7 @@ public class CoreContainer { containerHandlers.put(METRICS_PATH, metricsHandler); metricsHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, METRICS_PATH); - if (isZooKeeperAware()) { - PluginInfo plugin = cfg.getMetricsConfig().getHistoryHandler(); - Map initArgs; - if (plugin != null && plugin.initArgs != null) { - initArgs = plugin.initArgs.asMap(5); - initArgs.put(MetricsHistoryHandler.ENABLE_PROP, plugin.isEnabled()); - } else { - initArgs = Collections.emptyMap(); - } - metricsHistoryHandler = new MetricsHistoryHandler(getZkController().getNodeName(), metricsHandler, - new 
CloudSolrClient.Builder(Collections.singletonList(getZkController().getZkServerAddress()), Optional.empty()) - .withHttpClient(updateShardHandler.getDefaultHttpClient()).build(), getZkController().getSolrCloudManager(), initArgs); - containerHandlers.put(METRICS_HISTORY_PATH, metricsHistoryHandler); - metricsHistoryHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, METRICS_HISTORY_PATH); - } + createMetricsHistoryHandler(); autoscalingHistoryHandler = createHandler(AUTOSCALING_HISTORY_PATH, AutoscalingHistoryHandler.class.getName(), AutoscalingHistoryHandler.class); metricsCollectorHandler = createHandler(MetricsCollectorHandler.HANDLER_PATH, MetricsCollectorHandler.class.getName(), MetricsCollectorHandler.class); @@ -748,6 +739,49 @@ public class CoreContainer { status |= LOAD_COMPLETE | INITIAL_CORE_LOAD_COMPLETE; } + // MetricsHistoryHandler supports both cloud and standalone configs + private void createMetricsHistoryHandler() { + PluginInfo plugin = cfg.getMetricsConfig().getHistoryHandler(); + Map initArgs; + if (plugin != null && plugin.initArgs != null) { + initArgs = plugin.initArgs.asMap(5); + initArgs.put(MetricsHistoryHandler.ENABLE_PROP, plugin.isEnabled()); + } else { + initArgs = new HashMap<>(); + } + String name; + SolrCloudManager cloudManager; + SolrClient client; + if (isZooKeeperAware()) { + name = getZkController().getNodeName(); + cloudManager = getZkController().getSolrCloudManager(); + client = new CloudSolrClient.Builder(Collections.singletonList(getZkController().getZkServerAddress()), Optional.empty()) + .withHttpClient(updateShardHandler.getDefaultHttpClient()).build(); + } else { + name = getNodeConfig().getNodeName(); + if (name == null || name.isEmpty()) { + name = "localhost"; + } + cloudManager = null; + client = new EmbeddedSolrServer(this, CollectionAdminParams.SYSTEM_COLL) { + @Override + public void close() throws IOException { + // do nothing - we close the container ourselves + } + }; + // enable local metrics unless specifically set otherwise + if (!initArgs.containsKey(MetricsHistoryHandler.ENABLE_NODES_PROP)) { + initArgs.put(MetricsHistoryHandler.ENABLE_NODES_PROP, true); + } + if (!initArgs.containsKey(MetricsHistoryHandler.ENABLE_REPLICAS_PROP)) { + initArgs.put(MetricsHistoryHandler.ENABLE_REPLICAS_PROP, true); + } + } + metricsHistoryHandler = new MetricsHistoryHandler(name, metricsHandler, + client, cloudManager, initArgs); + containerHandlers.put(METRICS_HISTORY_PATH, metricsHistoryHandler); + metricsHistoryHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, METRICS_HISTORY_PATH); + } public void securityNodeChanged() { log.info("Security node changed, reloading security.json"); @@ -792,6 +826,12 @@ public class CoreContainer { ExecutorUtil.shutdownAndAwaitTermination(coreContainerWorkExecutor); replayUpdatesExecutor.shutdownAndAwaitTermination(); + + if (metricsHistoryHandler != null) { + IOUtils.closeQuietly(metricsHistoryHandler.getSolrClient()); + metricsHistoryHandler.close(); + } + if (metricManager != null) { metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.node)); metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm)); @@ -810,10 +850,6 @@ public class CoreContainer { } catch (Exception e) { log.warn("Error removing live node. 
Continuing to close CoreContainer", e); } - if (metricsHistoryHandler != null) { - IOUtils.closeQuietly(metricsHistoryHandler.getSolrClient()); - metricsHistoryHandler.close(); - } if (metricManager != null) { metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.cluster)); } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java index 3d486804b48..03b545f6f8b 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java @@ -19,10 +19,13 @@ package org.apache.solr.handler.admin; import javax.imageio.ImageIO; import java.awt.Color; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -45,17 +48,21 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.DoubleAdder; import java.util.function.Function; +import java.util.stream.Collectors; import java.util.stream.Stream; import com.google.common.annotations.VisibleForTesting; import org.apache.solr.api.Api; import org.apache.solr.api.ApiBag; import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.cloud.NodeStateProvider; import org.apache.solr.client.solrj.cloud.SolrCloudManager; import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo; import org.apache.solr.client.solrj.cloud.autoscaling.Suggestion; import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData; +import org.apache.solr.client.solrj.impl.HttpClientUtil; import org.apache.solr.cloud.Overseer; import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.ClusterState; @@ -68,9 +75,12 @@ import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.Base64; +import org.apache.solr.common.util.JavaBinCodec; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.TimeSource; +import org.apache.solr.common.util.Utils; import org.apache.solr.handler.RequestHandlerBase; import org.apache.solr.metrics.SolrMetricManager; import org.apache.solr.metrics.rrd.SolrRrdBackendFactory; @@ -105,30 +115,32 @@ import static org.apache.solr.common.params.CommonParams.ID; public class MetricsHistoryHandler extends RequestHandlerBase implements PermissionNameProvider, Closeable { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public static final List DEFAULT_CORE_COUNTERS = new ArrayList() {{ - add("QUERY./select.requests"); - add("UPDATE./update.requests"); - }}; - public static final List DEFAULT_CORE_GAUGES = new ArrayList() {{ - add("INDEX.sizeInBytes"); - }}; - public static final List DEFAULT_NODE_GAUGES = new ArrayList() {{ - add("CONTAINER.fs.coreRoot.usableSpace"); - }}; - public static final List DEFAULT_JVM_GAUGES 
= new ArrayList() {{ - add("memory.heap.used"); - add("os.processCpuLoad"); - add("os.systemLoadAverage"); - }}; + public static final List DEFAULT_CORE_COUNTERS = new ArrayList<>(); + public static final List DEFAULT_CORE_GAUGES = new ArrayList<>(); + public static final List DEFAULT_NODE_GAUGES = new ArrayList<>(); + public static final List DEFAULT_JVM_GAUGES = new ArrayList<>(); public static final String NUM_SHARDS_KEY = "numShards"; public static final String NUM_REPLICAS_KEY = "numReplicas"; public static final String NUM_NODES_KEY = "numNodes"; - public static final List DEFAULT_COLLECTION_GAUGES = new ArrayList() {{ - add(NUM_SHARDS_KEY); - add(NUM_REPLICAS_KEY); - }}; + public static final List DEFAULT_COLLECTION_GAUGES = new ArrayList<>(); + + static { + DEFAULT_JVM_GAUGES.add("memory.heap.used"); + DEFAULT_JVM_GAUGES.add("os.processCpuLoad"); + DEFAULT_JVM_GAUGES.add("os.systemLoadAverage"); + + DEFAULT_NODE_GAUGES.add("CONTAINER.fs.coreRoot.usableSpace"); + + DEFAULT_CORE_GAUGES.add("INDEX.sizeInBytes"); + + DEFAULT_CORE_COUNTERS.add("QUERY./select.requests"); + DEFAULT_CORE_COUNTERS.add("UPDATE./update.requests"); + + DEFAULT_COLLECTION_GAUGES.add(NUM_SHARDS_KEY); + DEFAULT_COLLECTION_GAUGES.add(NUM_REPLICAS_KEY); + } public static final String COLLECT_PERIOD_PROP = "collectPeriod"; public static final String SYNC_PERIOD_PROP = "syncPeriod"; @@ -148,6 +160,7 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss private final int collectPeriod; private final Map> counters = new HashMap<>(); private final Map> gauges = new HashMap<>(); + private final String overseerUrlScheme; private final Map knownDbs = new ConcurrentHashMap<>(); @@ -166,11 +179,17 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss if (pluginArgs != null) { args.putAll(pluginArgs); } - // override from ZK - Map props = (Map)cloudManager.getClusterStateProvider() - .getClusterProperty("metrics", Collections.emptyMap()) - .getOrDefault("history", Collections.emptyMap()); - args.putAll(props); + // override from ZK if available + if (cloudManager != null) { + Map props = (Map)cloudManager.getClusterStateProvider() + .getClusterProperty("metrics", Collections.emptyMap()) + .getOrDefault("history", Collections.emptyMap()); + args.putAll(props); + + overseerUrlScheme = cloudManager.getClusterStateProvider().getClusterProperty("urlScheme", "http"); + } else { + overseerUrlScheme = "http"; + } this.nodeName = nodeName; this.enable = Boolean.parseBoolean(String.valueOf(args.getOrDefault(ENABLE_PROP, "true"))); @@ -180,12 +199,12 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss this.collectPeriod = Integer.parseInt(String.valueOf(args.getOrDefault(COLLECT_PERIOD_PROP, DEFAULT_COLLECT_PERIOD))); int syncPeriod = Integer.parseInt(String.valueOf(args.getOrDefault(SYNC_PERIOD_PROP, SolrRrdBackendFactory.DEFAULT_SYNC_PERIOD))); - factory = new SolrRrdBackendFactory(solrClient, CollectionAdminParams.SYSTEM_COLL, - syncPeriod, cloudManager.getTimeSource()); this.solrClient = solrClient; this.metricsHandler = metricsHandler; this.cloudManager = cloudManager; - this.timeSource = cloudManager.getTimeSource(); + this.timeSource = cloudManager != null ? 
cloudManager.getTimeSource() : TimeSource.NANO_TIME; + factory = new SolrRrdBackendFactory(solrClient, CollectionAdminParams.SYSTEM_COLL, + syncPeriod, this.timeSource); counters.put(Group.core.toString(), DEFAULT_CORE_COUNTERS); counters.put(Group.node.toString(), Collections.emptyList()); @@ -217,43 +236,60 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss } } + // check that .system exists public void checkSystemCollection() { - // check that .system exists - try { - if (cloudManager.isClosed() || Thread.interrupted()) { - factory.setPersistent(false); - return; - } - ClusterState clusterState = cloudManager.getClusterStateProvider().getClusterState(); - DocCollection systemColl = clusterState.getCollectionOrNull(CollectionAdminParams.SYSTEM_COLL); - if (systemColl == null) { - if (logMissingCollection) { - log.warn("Missing " + CollectionAdminParams.SYSTEM_COLL + ", keeping metrics history in memory"); - logMissingCollection = false; - } - factory.setPersistent(false); - return; - } else { - boolean ready = false; - for (Replica r : systemColl.getReplicas()) { - if (r.isActive(clusterState.getLiveNodes())) { - ready = true; - break; - } - } - if (!ready) { - log.debug(CollectionAdminParams.SYSTEM_COLL + " not ready yet, keeping metrics history in memory"); + if (cloudManager != null) { + try { + if (cloudManager.isClosed() || Thread.interrupted()) { factory.setPersistent(false); return; } + ClusterState clusterState = cloudManager.getClusterStateProvider().getClusterState(); + DocCollection systemColl = clusterState.getCollectionOrNull(CollectionAdminParams.SYSTEM_COLL); + if (systemColl == null) { + if (logMissingCollection) { + log.warn("Missing " + CollectionAdminParams.SYSTEM_COLL + ", keeping metrics history in memory"); + logMissingCollection = false; + } + factory.setPersistent(false); + return; + } else { + boolean ready = false; + for (Replica r : systemColl.getReplicas()) { + if (r.isActive(clusterState.getLiveNodes())) { + ready = true; + break; + } + } + if (!ready) { + log.debug(CollectionAdminParams.SYSTEM_COLL + " not ready yet, keeping metrics history in memory"); + factory.setPersistent(false); + return; + } + } + } catch (Exception e) { + if (logMissingCollection) { + log.warn("Error getting cluster state, keeping metrics history in memory", e); + } + logMissingCollection = false; + factory.setPersistent(false); + return; + } + logMissingCollection = true; + factory.setPersistent(true); + } else { + try { + solrClient.query(CollectionAdminParams.SYSTEM_COLL, new SolrQuery(CommonParams.Q, "*:*", CommonParams.ROWS, "0")); + factory.setPersistent(true); + logMissingCollection = true; + } catch (Exception e) { + if (logMissingCollection) { + log.warn("Error querying .system collection, keeping metrics history in memory", e); + } + logMissingCollection = false; + factory.setPersistent(false); } - } catch (Exception e) { - log.warn("Error getting cluster state, keeping metrics history in memory", e); - factory.setPersistent(false); - return; } - logMissingCollection = true; - factory.setPersistent(true); } public SolrClient getSolrClient() { @@ -271,7 +307,11 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss return factory; } - private boolean isOverseerLeader() { + private String getOverseerLeader() { + // non-ZK node has no Overseer + if (cloudManager == null) { + return null; + } ZkNodeProps props = null; try { VersionedData data = cloudManager.getDistribStateManager().getData( @@ -281,24 +321,39 @@ 
public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss } } catch (KeeperException | IOException | NoSuchElementException e) { log.warn("Could not obtain overseer's address, skipping.", e); - return false; + return null; } catch (InterruptedException e) { Thread.currentThread().interrupt(); - return false; + return null; } if (props == null) { - return false; + return null; } String oid = props.getStr(ID); if (oid == null) { - return false; + return null; } String[] ids = oid.split("-"); if (ids.length != 3) { // unknown format log.warn("Unknown format of leader id, skipping: " + oid); - return false; + return null; + } + return ids[1]; + } + + private boolean amIOverseerLeader() { + return amIOverseerLeader(null); + } + + private boolean amIOverseerLeader(String leader) { + if (leader == null) { + leader = getOverseerLeader(); + } + if (leader == null) { + return false; + } else { + return nodeName.equals(leader); } - return nodeName.equals(ids[1]); } private void collectMetrics() { @@ -383,7 +438,7 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss } private void collectGlobalMetrics() { - if (!isOverseerLeader()) { + if (!amIOverseerLeader()) { return; } Set nodes = new HashSet<>(cloudManager.getClusterStateProvider().getLiveNodes()); @@ -640,11 +695,19 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss if (cmd == null) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "unknown 'action' param '" + actionStr + "', supported actions: " + Cmd.actions); } - Object res = null; + final SimpleOrderedMap res = new SimpleOrderedMap<>(); + rsp.add("metrics", res); switch (cmd) { case LIST: int rows = req.getParams().getInt(CommonParams.ROWS, SolrRrdBackendFactory.DEFAULT_MAX_DBS); - res = factory.list(rows); + List> lst = factory.list(rows); + lst.forEach(p -> { + SimpleOrderedMap data = new SimpleOrderedMap<>(); + // RrdDb always uses seconds - convert here for compatibility + data.add("lastModified", TimeUnit.SECONDS.convert(p.second(), TimeUnit.MILLISECONDS)); + data.add("node", nodeName); + res.add(p.first(), data); + }); break; case GET: String name = req.getParams().get(CommonParams.NAME); @@ -657,15 +720,14 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss if (format == null) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "unknown 'format' param '" + formatStr + "', supported formats: " + Format.formats); } - if (!factory.exists(name)) { - rsp.add("error", "'" + name + "' doesn't exist"); - } else { + if (factory.exists(name)) { // get a throwaway copy (safe to close and discard) RrdDb db = new RrdDb(URI_PREFIX + name, true, factory); - res = new NamedList<>(); - NamedList data = new NamedList<>(); + SimpleOrderedMap data = new SimpleOrderedMap<>(); data.add("data", getDbData(db, dsNames, format, req.getParams())); - ((NamedList)res).add(name, data); + data.add("lastModified", db.getLastUpdateTime()); + data.add("node", nodeName); + res.add(name, data); db.close(); } break; @@ -674,17 +736,14 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss if (name == null) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'name' is a required param"); } - if (!factory.exists(name)) { - rsp.add("error", "'" + name + "' doesn't exist"); - } else { + if (factory.exists(name)) { // get a throwaway copy (safe to close and discard) RrdDb db = new RrdDb(URI_PREFIX + name, true, factory); - NamedList map = new 
NamedList<>(); - NamedList status = new NamedList<>(); + SimpleOrderedMap status = new SimpleOrderedMap<>(); status.add("status", getDbStatus(db)); - map.add(name, status); + status.add("node", nodeName); + res.add(name, status); db.close(); - res = map; } break; case DELETE: @@ -700,9 +759,61 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss rsp.add("success", "ok"); break; } - if (res != null) { - rsp.add("metrics", res); + // when using in-memory DBs non-overseer node has no access to overseer DBs - in this case + // forward the request to Overseer leader if available + if (!factory.isPersistent()) { + String leader = getOverseerLeader(); + if (leader != null && !amIOverseerLeader(leader)) { + // get & merge remote response + NamedList remoteRes = handleRemoteRequest(leader, req); + mergeRemoteRes(rsp, remoteRes); + } } + SimpleOrderedMap apiState = new SimpleOrderedMap<>(); + apiState.add("enableReplicas", enableReplicas); + apiState.add("enableNodes", enableNodes); + apiState.add("mode", enable ? (factory.isPersistent() ? "index" : "memory") : "inactive"); + if (!factory.isPersistent()) { + apiState.add("message", "WARNING: metrics history is not being persisted. Create .system collection to start persisting history."); + } + rsp.add("state", apiState); + rsp.getResponseHeader().add("zkConnected", cloudManager != null); + } + + private NamedList handleRemoteRequest(String nodeName, SolrQueryRequest req) { + String baseUrl = Utils.getBaseUrlForNodeName(nodeName, overseerUrlScheme); + String url; + try { + URL u = new URL(baseUrl); + u = new URL(u.getProtocol(), u.getHost(), u.getPort(), "/api/cluster/metrics/history"); + url = u.toString(); + } catch (MalformedURLException e) { + log.warn("Invalid Overseer url '" + baseUrl + "', unable to fetch remote metrics history", e); + return null; + } + // always use javabin + ModifiableSolrParams params = new ModifiableSolrParams(req.getParams()); + params.set(CommonParams.WT, "javabin"); + url = url + "?" 
+ params.toString(); + try { + byte[] data = cloudManager.httpRequest(url, SolrRequest.METHOD.GET, null, null, HttpClientUtil.DEFAULT_CONNECT_TIMEOUT, true); + // response is always a NamedList + try (JavaBinCodec codec = new JavaBinCodec()) { + return (NamedList)codec.unmarshal(new ByteArrayInputStream(data)); + } + } catch (IOException e) { + log.warn("Exception forwarding request to Overseer at " + url, e); + return null; + } + } + + private void mergeRemoteRes(SolrQueryResponse rsp, NamedList remoteRes) { + if (remoteRes == null || remoteRes.get("metrics") == null) { + return; + } + NamedList remoteMetrics = (NamedList)remoteRes.get("metrics"); + SimpleOrderedMap localMetrics = (SimpleOrderedMap) rsp.getValues().get("metrics"); + remoteMetrics.forEach((k, v) -> localMetrics.add(k, v)); } private NamedList getDbStatus(RrdDb db) throws IOException { @@ -750,7 +861,7 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss RrdDef def = db.getRrdDef(); ArcDef[] arcDefs = def.getArcDefs(); for (ArcDef arcDef : arcDefs) { - SimpleOrderedMap map = new SimpleOrderedMap(); + SimpleOrderedMap map = new SimpleOrderedMap<>(); res.add(arcDef.dump(), map); Archive a = db.getArchive(arcDef.getConsolFun(), arcDef.getSteps()); // startTime / endTime, arcStep are in seconds @@ -761,22 +872,21 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss if (format != Format.GRAPH) { // add timestamps separately from values long[] timestamps = fd.getTimestamps(); - str.setLength(0); - for (int i = 0; i < timestamps.length; i++) { - if (format == Format.LIST) { - map.add("timestamps", timestamps[i]); - } else { + if (format == Format.LIST) { + // Arrays.asList works only on arrays of Objects + map.add("timestamps", Arrays.stream(timestamps).boxed().collect(Collectors.toList())); + } else { + str.setLength(0); + for (int i = 0; i < timestamps.length; i++) { if (i > 0) { str.append('\n'); } str.append(String.valueOf(timestamps[i])); } - } - if (format == Format.STRING) { map.add("timestamps", str.toString()); } } - SimpleOrderedMap values = new SimpleOrderedMap(); + SimpleOrderedMap values = new SimpleOrderedMap<>(); map.add("values", values); for (String name : dsNames) { double[] vals = fd.getValues(name); @@ -825,9 +935,7 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss values.add(name, str.toString()); break; case LIST: - for (int i = 0; i < vals.length; i++) { - values.add(name, vals[i]); - } + values.add(name, Arrays.stream(vals).boxed().collect(Collectors.toList())); break; } } diff --git a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackend.java b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackend.java index 956aabba44c..d0aa3e23566 100644 --- a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackend.java +++ b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackend.java @@ -19,6 +19,7 @@ package org.apache.solr.metrics.rrd; import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantLock; import org.rrd4j.core.RrdByteArrayBackend; @@ -36,14 +37,27 @@ public class SolrRrdBackend extends RrdByteArrayBackend implements Closeable { private final ReentrantLock lock = new ReentrantLock(); private volatile boolean dirty = false; private volatile boolean closed = false; + private volatile long lastModifiedTime; + + public static final class SyncData { + public byte[] data; + 
public long timestamp; + + public SyncData(byte[] data, long timestamp) { + this.data = data; + this.timestamp = timestamp; + } + } public SolrRrdBackend(String path, boolean readOnly, SolrRrdBackendFactory factory) { super(path); this.factory = factory; + this.lastModifiedTime = TimeUnit.MILLISECONDS.convert(factory.getTimeSource().getEpochTimeNs(), TimeUnit.NANOSECONDS); try { - byte[] data = factory.getData(path); - if (data != null) { - this.buffer = data; + SyncData syncData = factory.getData(path); + if (syncData != null) { + this.buffer = syncData.data; + this.lastModifiedTime = syncData.timestamp; } } catch (IOException e) { log.warn("Exception retrieving data from " + path + ", store will be readOnly", e); @@ -60,6 +74,7 @@ public class SolrRrdBackend extends RrdByteArrayBackend implements Closeable { super(other.getPath()); readOnly = true; factory = null; + this.lastModifiedTime = other.lastModifiedTime; byte[] otherBuffer = other.buffer; buffer = new byte[otherBuffer.length]; System.arraycopy(otherBuffer, 0, buffer, 0, otherBuffer.length); @@ -69,6 +84,10 @@ public class SolrRrdBackend extends RrdByteArrayBackend implements Closeable { return readOnly; } + public long getLastModifiedTime() { + return lastModifiedTime; + } + @Override protected void write(long offset, byte[] bytes) throws IOException { if (readOnly || closed) { @@ -77,13 +96,14 @@ public class SolrRrdBackend extends RrdByteArrayBackend implements Closeable { lock.lock(); try { super.write(offset, bytes); + lastModifiedTime = TimeUnit.MILLISECONDS.convert(factory.getTimeSource().getEpochTimeNs(), TimeUnit.NANOSECONDS); dirty = true; } finally { lock.unlock(); } } - public byte[] getSyncData() { + public SyncData getSyncData() { if (readOnly || closed) { return null; } @@ -95,7 +115,7 @@ public class SolrRrdBackend extends RrdByteArrayBackend implements Closeable { try { byte[] bufferCopy = new byte[buffer.length]; System.arraycopy(buffer, 0, bufferCopy, 0, buffer.length); - return bufferCopy; + return new SyncData(bufferCopy, lastModifiedTime); } finally { lock.unlock(); } diff --git a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java index 06ab5fe3888..a3c6f64461d 100644 --- a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java +++ b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java @@ -22,14 +22,12 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collections; -import java.util.Date; +import java.util.Comparator; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledThreadPoolExecutor; @@ -47,6 +45,7 @@ import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.IOUtils; +import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; import org.apache.solr.util.DefaultSolrThreadFactory; import org.rrd4j.core.RrdBackend; @@ -114,6 +113,10 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos TimeUnit.MILLISECONDS); } + public TimeSource getTimeSource() { + return timeSource; + } + 
private void ensureOpen() throws IOException { if (closed) { throw new IOException("Factory already closed"); @@ -181,7 +184,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos } } - byte[] getData(String path) throws IOException { + SolrRrdBackend.SyncData getData(String path) throws IOException { if (!persistent) { return null; } @@ -203,7 +206,8 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos return null; } if (o instanceof byte[]) { - return (byte[])o; + Long time = (Long)doc.getFieldValue("timestamp_l"); + return new SolrRrdBackend.SyncData((byte[])o, time); } else { throw new SolrServerException("Unexpected value of '" + DATA_FIELD + "' field: " + o.getClass().getName() + ": " + o); } @@ -216,34 +220,58 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos backends.remove(path); } + private static final class DbComparator implements Comparator> { + static final DbComparator INSTANCE = new DbComparator(); + + @Override + public int compare(Pair o1, Pair o2) { + return o1.first().compareTo(o2.first()); + } + } + /** * List all available databases created by this node name * @param maxLength maximum number of results to return - * @return list of database names, or empty + * @return list of database names and their last update times, or empty * @throws IOException on server errors */ - public List list(int maxLength) throws IOException { - Set names = new HashSet<>(); + public List> list(int maxLength) throws IOException { + Map> byName = new HashMap<>(); if (persistent) { try { ModifiableSolrParams params = new ModifiableSolrParams(); params.add(CommonParams.Q, "*:*"); params.add(CommonParams.FQ, CommonParams.TYPE + ":" + DOC_TYPE); - params.add(CommonParams.FL, "id"); + params.add(CommonParams.FL, "id,timestamp_l"); params.add(CommonParams.ROWS, String.valueOf(maxLength)); QueryResponse rsp = solrClient.query(collection, params); SolrDocumentList docs = rsp.getResults(); if (docs != null) { - docs.forEach(d -> names.add(((String)d.getFieldValue("id")).substring(idPrefixLength))); + docs.forEach(d -> { + Long time = (Long)d.getFieldValue("timestamp_l"); + Pair p = new Pair<>(((String)d.getFieldValue("id")).substring(idPrefixLength), time); + byName.put(p.first(), p); + }); } } catch (SolrServerException e) { log.warn("Error retrieving RRD list", e); } } - // add in-memory backends not yet stored - names.addAll(backends.keySet()); - ArrayList list = new ArrayList<>(names); - Collections.sort(list); + // add in-memory backends not yet stored, or replace with more recent versions + backends.forEach((name, db) -> { + long lastModifiedTime = db.getLastModifiedTime(); + Pair stored = byName.get(name); + Pair inMemory = new Pair(name, lastModifiedTime); + if (stored != null) { + if (stored.second() < lastModifiedTime) { + byName.put(name, inMemory); + } + } else { + byName.put(name, inMemory); + } + }); + ArrayList> list = new ArrayList<>(byName.values()); + Collections.sort(list, DbComparator.INSTANCE); return list; } @@ -301,25 +329,25 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos return; } log.debug("-- maybe sync backends: " + backends.keySet()); - Map syncData = new HashMap<>(); + Map syncDatas = new HashMap<>(); backends.forEach((path, backend) -> { - byte[] data = backend.getSyncData(); - if (data != null) { - syncData.put(backend.getPath(), data); + SolrRrdBackend.SyncData syncData = backend.getSyncData(); + if (syncData != null) { + 
syncDatas.put(backend.getPath(), syncData); } }); - if (syncData.isEmpty()) { + if (syncDatas.isEmpty()) { return; } - log.debug("-- syncing " + syncData.keySet()); + log.debug("-- syncing " + syncDatas.keySet()); // write updates try { - syncData.forEach((path, data) -> { + syncDatas.forEach((path, syncData) -> { SolrInputDocument doc = new SolrInputDocument(); doc.setField("id", ID_PREFIX + ID_SEP + path); doc.addField(CommonParams.TYPE, DOC_TYPE); - doc.addField(DATA_FIELD, data); - doc.setField("timestamp", new Date(TimeUnit.MILLISECONDS.convert(timeSource.getEpochTimeNs(), TimeUnit.NANOSECONDS))); + doc.addField(DATA_FIELD, syncData.data); + doc.setField("timestamp_l", syncData.timestamp); try { solrClient.add(collection, doc); } catch (SolrServerException | IOException e) { @@ -334,7 +362,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos } catch (SolrServerException e) { log.warn("Error committing RRD data updates", e); } - syncData.forEach((path, data) -> { + syncDatas.forEach((path, data) -> { SolrRrdBackend backend = backends.get(path); if (backend != null) { backend.markClean(); diff --git a/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java index 2012a1ac8f4..b3a1fb67f8e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java @@ -35,6 +35,7 @@ import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.Base64; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.TimeSource; import org.apache.solr.util.LogLevel; import org.junit.AfterClass; @@ -94,12 +95,12 @@ public class MetricsHistoryIntegrationTest extends SolrCloudTestCase { NamedList rsp = solrClient.request(createHistoryRequest(params(CommonParams.ACTION, "list"))); assertNotNull(rsp); // expected solr.jvm, solr.node and solr.collection..system - List lst = (List)rsp.get("metrics"); + SimpleOrderedMap lst = (SimpleOrderedMap) rsp.get("metrics"); assertNotNull(lst); assertEquals(lst.toString(), 3, lst.size()); - assertTrue(lst.toString(), lst.contains("solr.jvm")); - assertTrue(lst.toString(), lst.contains("solr.node")); - assertTrue(lst.toString(), lst.contains("solr.collection..system")); + assertNotNull(lst.toString(), lst.get("solr.jvm")); + assertNotNull(lst.toString(), lst.get("solr.node")); + assertNotNull(lst.toString(), lst.get("solr.collection..system")); } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java index 234eaea29a1..900fc76848d 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java @@ -665,6 +665,7 @@ public class SimCloudManager implements SolrCloudManager { } queryRequest.getContext().put("httpMethod", req.getMethod().toString()); SolrQueryResponse queryResponse = new SolrQueryResponse(); + queryResponse.addResponseHeader(new SimpleOrderedMap<>()); if (autoscaling) { autoScalingHandler.handleRequest(queryRequest, queryResponse); } else { diff --git a/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java 
b/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java index e1e230fbb4f..7c84c16462e 100644 --- a/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java +++ b/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java @@ -30,6 +30,7 @@ import org.apache.solr.cloud.SolrCloudTestCase; import org.apache.solr.cloud.autoscaling.sim.SimCloudManager; import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; import org.apache.solr.core.SolrInfoBean; import org.apache.solr.metrics.SolrMetricManager; @@ -57,7 +58,7 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase { @BeforeClass public static void beforeClass() throws Exception { - simulated = random().nextBoolean(); + simulated = random().nextBoolean() || true; Map args = new HashMap<>(); args.put(MetricsHistoryHandler.SYNC_PERIOD_PROP, 1); args.put(MetricsHistoryHandler.COLLECT_PERIOD_PROP, 1); @@ -111,11 +112,11 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase { @Test public void testBasic() throws Exception { timeSource.sleep(10000); - List list = handler.getFactory().list(100); + List> list = handler.getFactory().list(100); // solr.jvm, solr.node, solr.collection..system assertEquals(list.toString(), 3, list.size()); - for (String path : list) { - RrdDb db = new RrdDb(MetricsHistoryHandler.URI_PREFIX + path, true, handler.getFactory()); + for (Pair p : list) { + RrdDb db = new RrdDb(MetricsHistoryHandler.URI_PREFIX + p.first(), true, handler.getFactory()); int dsCount = db.getDsCount(); int arcCount = db.getArcCount(); assertTrue("dsCount should be > 0, was " + dsCount, dsCount > 0); diff --git a/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java b/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java index 2f5fa131318..18c72ec07da 100644 --- a/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java @@ -17,13 +17,13 @@ package org.apache.solr.metrics.rrd; -import java.util.Date; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.CollectionAdminParams; +import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; import org.apache.solr.util.MockSearchableSolrClient; import org.junit.After; @@ -78,15 +78,15 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 { @Test public void testBasic() throws Exception { RrdDb db = new RrdDb(createDef(), factory); - List list = factory.list(100); + List> list = factory.list(100); assertEquals(list.toString(), 1, list.size()); - assertEquals(list.toString(), "foo", list.get(0)); + assertEquals(list.toString(), "foo", list.get(0).first()); timeSource.sleep(2000); // there should be one sync data assertEquals(solrClient.docs.toString(), 1, solrClient.docs.size()); String id = SolrRrdBackendFactory.ID_PREFIX + SolrRrdBackendFactory.ID_SEP + "foo"; SolrInputDocument doc = solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id); - long timestamp = ((Date)doc.getFieldValue("timestamp")).getTime(); + long timestamp = (Long)doc.getFieldValue("timestamp_l"); timeSource.sleep(2000); SolrInputDocument newDoc = 
solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id); assertEquals(newDoc.toString(), newDoc, doc); @@ -104,7 +104,7 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 { timeSource.sleep(3000); newDoc = solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id); assertFalse(newDoc.toString(), newDoc.equals(doc)); - long newTimestamp = ((Date)newDoc.getFieldValue("timestamp")).getTime(); + long newTimestamp = (Long)newDoc.getFieldValue("timestamp_l"); assertNotSame(newTimestamp, timestamp); FetchRequest fr = db.createFetchRequest(ConsolFun.AVERAGE, firstTimestamp + 60, lastTimestamp - 60, 60); FetchData fd = fr.fetchData(); @@ -126,7 +126,7 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 { // should still be listed list = factory.list(100); assertEquals(list.toString(), 1, list.size()); - assertEquals(list.toString(), "foo", list.get(0)); + assertEquals(list.toString(), "foo", list.get(0).first()); // re-open read-write db = new RrdDb("solr:foo", factory); @@ -141,7 +141,7 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 { doc = newDoc; newDoc = solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id); assertFalse(newDoc.toString(), newDoc.equals(doc)); - newTimestamp = ((Date)newDoc.getFieldValue("timestamp")).getTime(); + newTimestamp = (Long)newDoc.getFieldValue("timestamp_l"); assertNotSame(newTimestamp, timestamp); fr = db.createFetchRequest(ConsolFun.AVERAGE, firstTimestamp + 60, lastTimestamp, 60); fd = fr.fetchData(); @@ -174,7 +174,7 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 { timestamp = newTimestamp; newDoc = solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id); assertTrue(newDoc.toString(), newDoc.equals(doc)); - newTimestamp = ((Date)newDoc.getFieldValue("timestamp")).getTime(); + newTimestamp = (Long)newDoc.getFieldValue("timestamp_l"); assertEquals(newTimestamp, timestamp); readOnly.close(); } diff --git a/solr/solr-ref-guide/src/metrics-history.adoc b/solr/solr-ref-guide/src/metrics-history.adoc index 5dc1c3cb18c..e39f66e7306 100644 --- a/solr/solr-ref-guide/src/metrics-history.adoc +++ b/solr/solr-ref-guide/src/metrics-history.adoc @@ -18,15 +18,16 @@ == Design === Round-robin databases -When Solr runs in "cloud" mode it collects long-term history of certain key metrics. This information -can be used for very simple monitoring and troubleshooting, but also some Solr Cloud components -(eg. autoscaling) can use this data for making informed decisions based on long-term -trends of selected metrics. +Solr collects long-term history of certain key metrics both in SolrCloud and in standalone mode. +This information can be used for very simple monitoring and troubleshooting, but also some +Solr Cloud components (eg. autoscaling) can use this data for making informed decisions based on +long-term trends of selected metrics. [IMPORTANT] ==== -Metrics history is available ONLY in SolrCloud mode, it's not supported in standalone Solr. Also, -the `.system` collection must exist if metrics history should be persisted. +The `.system` collection must exist if metrics history should be persisted. If this collection +is absent then metrics history will still be collected and kept in memory but it will be lost +on node restart. ==== This data is maintained as multi-resolution time series, with a fixed total number of data points @@ -61,14 +62,16 @@ update operations than storing each data point in a separate Solr document. 
The Metrics History API supports retrieval of detailed data from each database, including retrieval of all individual datapoints.

Databases are identified primarily by their corresponding metric registry name, so for databases that
-keep track of aggregated metrics this will be eg. `solr.jvm`, `solr.node`, `solr.collection.gettingstarted`,
-and for databases with non-aggregated metrics this will be eg. `solr.jvm.localhost:8983_solr`,
-`solr.node.localhost:7574_solr`, `solr.core.gettingstarted.shard1.replica_n1`.
+keep track of aggregated metrics this will be eg. `solr.jvm`, `solr.node`, `solr.collection.gettingstarted`.
+For databases with non-aggregated metrics the name consists of the registry name, optionally with a node name
+to identify databases with the same name coming from different nodes. For example, per-node databases are
+named like this: `solr.jvm.localhost:8983_solr`, `solr.node.localhost:7574_solr`, but per-replica names are
+already unique across the cluster so they are named like this: `solr.core.gettingstarted.shard1.replica_n1`.

=== Collected metrics
Currently the following selected metrics are tracked:

-* `solr.core` and `solr.collection` metrics:
+* Non-aggregated `solr.core` and aggregated `solr.collection` metrics:
** `QUERY./select.requests`
** `UPDATE./update.requests`
** `INDEX.sizeInBytes`
@@ -78,6 +81,7 @@ Currently the following selected metrics are tracked:
* `solr.node` metrics:
** `CONTAINER.fs.coreRoot.usableSpace`
** `numNodes` (aggregated, number of live nodes)
+
* `solr.jvm` metrics:
** `memory.heap.used`
** `os.processCpuLoad`
@@ -86,6 +90,10 @@ Currently the following selected metrics are tracked:
Separate databases are created for each of these groups, and each database keeps data for all metrics
listed in that group.

+NOTE: Currently this list is not configurable. Also, if you change this list in the code then
+all existing databases must first be removed from the `.system` collection because RRD4j doesn't allow
+adding new datasources once the database is created.
+
=== SolrRrdBackendFactory
This component is responsible for managing in-memory databases and periodically saving them
to the `.system` collection. If the `.system` collection is not available the updates to the
@@ -101,7 +109,8 @@ collecting and periodically updating the in-memory databases.
This handler also performs aggregation of metrics on per-collection level, and on a cluster level. By default
only these aggregated metrics are tracked - historic data from each node and each replica in each collection
is not collected separately. Aggregated databases are managed on the Overseer leader
-node.
+node but they are still accessible from other nodes even if they are not persisted - the handler redirects
+the call from the originating node to the current Overseer leader.

The handler assumes that a simple aggregation (sum of partial metric values from each resource) is
sufficient. This happens to make sense for the default built-in sets of metrics. Future extensions will
@@ -135,6 +144,7 @@ databases.

`collectPeriod`:: integer, in seconds, default is 60. Metrics values will be collected and respective
databases updated every `collectPeriod` seconds.
+
[IMPORTANT]
====
Value of `collectPeriod` must be at least 1, and if it's changed then all previously existing databases
@@ -142,9 +152,9 @@ with their historic data must be manually removed (new databases will be created

`syncPeriod`:: integer, in seconds, default is 60. Data from modified databases will be saved to Solr
-every `syncPeriod` seconds.
When accessing the databases via REST API the visibility of most recent
-data depends on this period, because requests accessing the data from other nodes see only the
-version of the data that is stored in the `.system` collection.
+every `syncPeriod` seconds. When accessing the databases via REST API in `index` mode the visibility of
+most recent data depends on this period, because requests accessing the data from other nodes see only
+the version of the data that is stored in the `.system` collection.

=== Example configuration
Example `/clusterprops.json` file with metrics history configuration that turns on the collection of
@@ -154,6 +164,7 @@ properties unrelated to metrics history API.
[source,json]
----
{
+...
  "metrics" : {
    "history" : {
      "enable" : true,
@@ -161,42 +172,86 @@ properties unrelated to metrics history API.
      "syncPeriod" : 300
    }
  }
+...
}
----

== Metrics History API
-Main entry point for accessing metrics history is `/admin/metrics/history` (or `/api/cluster/metrics/history` for
-v2 API).
+Main entry point for accessing metrics history is `/admin/metrics/history` (or `/api/cluster/metrics/history`
+for v2 API).

The following sections describe actions available in this API. All calls have at least one required parameter `action`.

+All responses contain a section named `state`, which reports the current internal state of the API:
+
+`enableReplicas`:: boolean, corresponds to the `enableReplicas` configuration setting.
+`enableNodes`:: boolean, corresponds to the `enableNodes` configuration setting.
+`mode`:: one of the following values:
+* `inactive` - when metrics collection is disabled (but access to existing metrics history is still available).
+* `memory` - when metrics history is kept only in memory because the `.system` collection doesn't exist. In this mode
+clients can access metrics history available on the node that received the request and on the Overseer leader.
+* `index` - when metrics history is periodically stored in the `.system` collection. Data available in memory on
+the node that accepted the request is retrieved from memory; any other data is retrieved from the
+`.system` collection (so it's at least `syncPeriod` old).
+
+Also, the response header section (`responseHeader`) contains a `zkConnected` boolean property that indicates
+whether the current node is part of a SolrCloud cluster.
+
=== List databases (`action=list`)
This call produces a list of available databases. It supports the following parameters:

-`rows`:: optional integer, default is 500. Maximum number of results to return
+`rows`:: optional integer, default is 500. Maximum number of results to return.

Example:
+In this SolrCloud example the API is in `memory` mode, and the request was made to a node that is
+not the Overseer leader. The API transparently forwarded the request to the Overseer leader.
[source,bash] ---- -curl http://localhost:8983/solr/admin/metrics/history?action=list&rows=10 +curl http://localhost:7574/solr/admin/metrics/history?action=list&rows=10 ---- [source,json] ---- { - "responseHeader": { - "status": 0, - "QTime": 16 + "responseHeader": { + "zkConnected": true, + "status": 0, + "QTime": 9 + }, + "metrics": { + "solr.collection..system": { + "lastModified": 1528360138, + "node": "127.0.0.1:8983_solr" }, - "metrics": [ - "solr.collection..system", - "solr.collection.gettingstarted", - "solr.jvm", - "solr.node" - ] + "solr.collection.gettingstarted": { + "lastModified": 1528360138, + "node": "127.0.0.1:8983_solr" + }, + "solr.jvm": { + "lastModified": 1528360138, + "node": "127.0.0.1:8983_solr" + }, + "solr.node": { + "lastModified": 1528360138, + "node": "127.0.0.1:8983_solr" + } + }, + "state": { + "enableReplicas": false, + "enableNodes": false, + "mode": "memory" + } } ---- +Note the presence of the `node` element in each section, which shows where the information is coming +from - when API is in `memory` mode this indicates which results are local and which ones are retrieved +from the Overseer leader node. When the API is in `index` mode this element always shows the node name that +received the request (because the data is retrieved from the `.system` collection anyway). + +Each section also contains a `lastModified` element, which contains the last modification time when the +database was update. All timestamps returned from this API correspond to Unix epoch time in seconds. + === Database status (`action=status`) This call provides detailed status of the selected database. @@ -207,66 +262,71 @@ The following parameters are supported: Example: [source,bash] ---- -curl http://localhost:8983/solr/admin/metrics/history?action=status&name=solr.collection.gettingstarted +curl http://localhost:7574/solr/admin/metrics/history?action=status&name=solr.collection.gettingstarted ---- [source,json] ---- { - "responseHeader": { - "status": 0, - "QTime": 38 - }, - "metrics": [ - "solr.collection.gettingstarted", - [ - "status", - { - "lastModified": 1527268438, - "step": 60, - "datasourceCount": 5, - "archiveCount": 5, - "datasourceNames": [ - "numShards", - "numReplicas", - "QUERY./select.requests", - "UPDATE./update.requests", - "INDEX.sizeInBytes" - ], - "datasources": [ - { - "datasource": "DS:numShards:GAUGE:120:U:U", - "lastValue": 2 - }, - { - "datasource": "DS:QUERY./select.requests:COUNTER:120:U:U", - "lastValue": 8786 - }, - ... - ], - "archives": [ - { - "archive": "RRA:AVERAGE:0.5:1:240", - "steps": 1, - "consolFun": "AVERAGE", - "xff": 0.5, - "startTime": 1527254040, - "endTime": 1527268380, - "rows": 240 - }, - { - "archive": "RRA:AVERAGE:0.5:10:288", - "steps": 10, - "consolFun": "AVERAGE", - "xff": 0.5, - "startTime": 1527096000, - "endTime": 1527268200, - "rows": 288 - }, - ... - ] - } + "responseHeader": { + "zkConnected": true, + "status": 0, + "QTime": 46 + }, + "metrics": { + "solr.collection.gettingstarted": { + "status": { + "lastModified": 1528318361, + "step": 60, + "datasourceCount": 5, + "archiveCount": 5, + "datasourceNames": [ + "numShards", + "numReplicas", + "QUERY./select.requests", + "UPDATE./update.requests", + "INDEX.sizeInBytes" + ], + "datasources": [ + { + "datasource": "DS:numShards:GAUGE:120:U:U", + "lastValue": 2 + }, + { + "datasource": "DS:numReplicas:GAUGE:120:U:U", + "lastValue": 4 + }, + ... 
+ ], + "archives": [ + { + "archive": "RRA:AVERAGE:0.5:1:240", + "steps": 1, + "consolFun": "AVERAGE", + "xff": 0.5, + "startTime": 1528303980, + "endTime": 1528318320, + "rows": 240 + }, + { + "archive": "RRA:AVERAGE:0.5:10:288", + "steps": 10, + "consolFun": "AVERAGE", + "xff": 0.5, + "startTime": 1528146000, + "endTime": 1528318200, + "rows": 288 + }, + ... ] - ] + }, + "node": "127.0.0.1:7574_solr" + } + }, + "state": { + "enableReplicas": false, + "enableNodes": false, + "mode": "index" + } } ---- @@ -286,7 +346,7 @@ values (because points from all datasources in a given time series share the sam * `graph` - data is returned as PNG images, Base64-encoded, containing graphs of each time series values over time. In each case the response is structured in a similar way: archive identifiers are keys in a JSON map, -and timestamps / datapoints / graphs are values. +all data is placed in a `data` element, with timestamps / datapoints / graphs as values in lists or maps. ==== Examples This is the output using the default `list` format: @@ -297,37 +357,49 @@ curl http://localhost:8983/solr/admin/metrics/history?action=get&name=solr.colle [source,json] ---- { - "responseHeader": { - "status": 0, - "QTime": 36 - }, - "metrics": [ - "solr.collection.gettingstarted", - [ - "data", - { - "RRA:AVERAGE:0.5:1:240": { - "timestamps":1527254460, - "timestamps":1527254520, - "timestamps":1527254580, - ... - "values": { - "numShards": "NaN", - "numShards": 2.0, - "numShards": 2.0, - ... - "numReplicas": "NaN", - "numReplicas": 4.0, - "numReplicas": 4.0, - ... - "QUERY./select.requests": "NaN", - "QUERY./select.requests": 123, - "QUERY./select.requests": 456, - ... - } - }, - "RRA:AVERAGE:0.5:10:288": { -... + "responseHeader": { + "zkConnected": true, + "status": 0, + "QTime": 4 + }, + "metrics": { + "solr.collection.gettingstarted": { + "data": { + "RRA:AVERAGE:0.5:1:240": { + "timestamps": [ + 1528304160, + 1528304220, + ... + ], + "values": { + "numShards": [ + "NaN", + 2.0, + ... + ], + "numReplicas": [ + "NaN", + 4.0, + ... + ], + ... + } + }, + "RRA:AVERAGE:0.5:10:288": { + "timestamps": [ + 1528145400, + 1528146000, + ... + "lastModified": 1528318606, + "node": "127.0.0.1:8983_solr" + } + }, + "state": { + "enableReplicas": false, + "enableNodes": false, + "mode": "index" + } +} ---- This is the output when using the `string` format: @@ -338,25 +410,24 @@ curl http://localhost:8983/solr/admin/metrics/history?action=get&name=solr.colle [source,json] ---- { - "responseHeader": { - "status": 0, - "QTime": 11 - }, - "metrics": [ - "solr.collection.gettingstarted", - [ - "data", - { - "RRA:AVERAGE:0.5:1:240": { - "timestamps": "1527254820\n1527254880\n1527254940\n...", - "values": { - "numShards": "NaN\n2.0\n2.0\n2.0\n2.0\n2.0\n2.0\n...", - "numReplicas": "NaN\n4.0\n4.0\n4.0\n4.0\n4.0\n4.0\n...", - "QUERY./select.requests": "NaN\n123\n456\n789\n...", - ... - } - }, - "RRA:AVERAGE:0.5:10:288": { + "responseHeader": { + "zkConnected": true, + "status": 0, + "QTime": 2 + }, + "metrics": { + "solr.collection.gettingstarted": { + "data": { + "RRA:AVERAGE:0.5:1:240": { + "timestamps": "1527254820\n1527254880\n1527254940\n...", + "values": { + "numShards": "NaN\n2.0\n2.0\n2.0\n2.0\n2.0\n2.0\n...", + "numReplicas": "NaN\n4.0\n4.0\n4.0\n4.0\n4.0\n4.0\n...", + "QUERY./select.requests": "NaN\n123\n456\n789\n...", + ... + } + }, + "RRA:AVERAGE:0.5:10:288": { ... 
---- @@ -368,29 +439,28 @@ curl http://localhost:8983/solr/admin/metrics/history?action=get&name=solr.colle [source,json] ---- { - "responseHeader": { - "status": 0, - "QTime": 2275 - }, - "metrics": [ - "solr.collection.gettingstarted", - [ - "data", - { - "RRA:AVERAGE:0.5:1:240": { - "values": { - "numShards": "iVBORw0KGgoAAAANSUhEUgAAAkQAAA...", - "numReplicas": "iVBORw0KGgoAAAANSUhEUgAAAkQA...", - "QUERY./select.requests": "iVBORw0KGgoAAAANS...", - ... - } - }, - "RRA:AVERAGE:0.5:10:288": { - "values": { - "numShards": "iVBORw0KGgoAAAANSUhEUgAAAkQAAA...", - ... - }, - ... + "responseHeader": { + "zkConnected": true, + "status": 0, + "QTime": 2 + }, + "metrics": { + "solr.collection.gettingstarted": { + "data": { + "RRA:AVERAGE:0.5:1:240": { + "values": { + "numShards": "iVBORw0KGgoAAAANSUhEUgAAAkQAAA...", + "numReplicas": "iVBORw0KGgoAAAANSUhEUgAAAkQA...", + "QUERY./select.requests": "iVBORw0KGgoAAAANS...", + ... + } + }, + "RRA:AVERAGE:0.5:10:288": { + "values": { + "numShards": "iVBORw0KGgoAAAANSUhEUgAAAkQAAA...", + ... + }, + ... ---- .Example 60 sec resolution history graph for `QUERY./select.requests` metric
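
The Base64-encoded PNG graphs returned by `format=graph` can be decoded on the client side. The following is a minimal sketch, not part of the patch above: it assumes a node at `localhost:8983`, the `solr.jvm` database, and the archive and datasource names used in the earlier examples (adjust them to whatever `action=status` reports for your databases), plus locally installed `jq` and `base64`.

[source,bash]
----
# Fetch the graph-format response, extract one Base64-encoded PNG and decode it to a file.
# The database, archive and datasource names here are only examples.
curl -s 'http://localhost:8983/solr/admin/metrics/history?action=get&name=solr.jvm&format=graph' \
  | jq -r '.metrics["solr.jvm"].data["RRA:AVERAGE:0.5:1:240"].values["memory.heap.used"]' \
  | base64 -d > heap_used.png
----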