SOLR-11126: New Node-level health check handler at /admin/info/healthcheck and /node/health paths that checks if the node is live, connected to zookeeper and not shutdown

This commit is contained in:
Shalin Shekhar Mangar 2019-01-06 12:41:49 +05:30
parent 46592e981f
commit 2bd6f246b0
10 changed files with 121 additions and 26 deletions

View File

@ -185,6 +185,9 @@ New Features
* SOLR-7896: Add a login page to Admin UI, with initial support for Basic Auth (janhoy)
* SOLR-11126: New Node-level health check handler at /admin/info/health and /node/health paths that
checks if the node is live, connected to zookeeper and not shut down. (Anshum Gupta, Amrit Sarkar, shalin)
Bug Fixes
----------------------

View File

@ -21,7 +21,6 @@ import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH;
import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH;
@ -576,7 +575,6 @@ public class CoreContainer {
createHandler(ZK_PATH, ZookeeperInfoHandler.class.getName(), ZookeeperInfoHandler.class);
createHandler(ZK_STATUS_PATH, ZookeeperStatusHandler.class.getName(), ZookeeperStatusHandler.class);
collectionsHandler = createHandler(COLLECTIONS_HANDLER_PATH, cfg.getCollectionsHandlerClass(), CollectionsHandler.class);
healthCheckHandler = createHandler(HEALTH_CHECK_HANDLER_PATH, cfg.getHealthCheckHandlerClass(), HealthCheckHandler.class);
infoHandler = createHandler(INFO_HANDLER_PATH, cfg.getInfoHandlerClass(), InfoHandler.class);
coreAdminHandler = createHandler(CORES_HANDLER_PATH, cfg.getCoreAdminHandlerClass(), CoreAdminHandler.class);
configSetsHandler = createHandler(CONFIGSETS_HANDLER_PATH, cfg.getConfigSetsHandlerClass(), ConfigSetsHandler.class);

View File

@ -18,7 +18,6 @@
package org.apache.solr.handler.admin;
import java.lang.invoke.MethodHandles;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ZkStateReader;
@ -30,6 +29,7 @@ import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.FAILURE;
import static org.apache.solr.common.params.CommonParams.OK;
import static org.apache.solr.common.params.CommonParams.STATUS;
@ -38,8 +38,9 @@ import static org.apache.solr.common.params.CommonParams.STATUS;
* Health Check Handler for reporting the health of a specific node.
*
* This checks if the node is:
* 1. Connected to zookeeper
* 2. listed in 'live_nodes'.
* 1. Core container is active.
* 2. Connected to zookeeper.
* 3. Listed in 'live_nodes' in zookeeper.
*/
public class HealthCheckHandler extends RequestHandlerBase {
@ -47,6 +48,8 @@ public class HealthCheckHandler extends RequestHandlerBase {
CoreContainer coreContainer;
public HealthCheckHandler() {}
public HealthCheckHandler(final CoreContainer coreContainer) {
super();
this.coreContainer = coreContainer;
@ -54,7 +57,6 @@ public class HealthCheckHandler extends RequestHandlerBase {
@Override
final public void init(NamedList args) {
}
public CoreContainer getCoreContainer() {
@ -67,8 +69,9 @@ public class HealthCheckHandler extends RequestHandlerBase {
log.debug("Invoked HealthCheckHandler on [{}]", coreContainer.getZkController().getNodeName());
CoreContainer cores = getCoreContainer();
if(cores == null) {
rsp.setException(new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Core container not initialized"));
// Core container must be non-null and not shut down (redundant check)
if(cores == null || cores.isShutDown()) {
rsp.setException(new SolrException(SolrException.ErrorCode.SERVER_ERROR, "CoreContainer is either not initialized or shutting down"));
return;
}
if(!cores.isZooKeeperAware()) {
@ -94,8 +97,6 @@ public class HealthCheckHandler extends RequestHandlerBase {
}
rsp.setHttpCaching(false);
return;
}
@Override
@ -107,4 +108,9 @@ public class HealthCheckHandler extends RequestHandlerBase {
public Category getCategory() {
return Category.ADMIN;
}
@Override
public Boolean registerV2() {
return Boolean.TRUE;
}
}

View File

@ -50,6 +50,8 @@ public class InfoHandler extends RequestHandlerBase {
handlers.put("properties", new PropertiesRequestHandler());
handlers.put("logging", new LoggingHandler(coreContainer));
handlers.put("system", new SystemInfoHandler(coreContainer));
handlers.put("health", new HealthCheckHandler(coreContainer));
}

View File

@ -91,6 +91,10 @@
"class": "solr.LoggingHandler",
"useParams":"_ADMIN_LOGGING"
},
"/admin/health": {
"class": "solr.HealthCheckHandler",
"useParams":"_ADMIN_HEALTH"
},
"/admin/file": {
"class": "solr.ShowFileRequestHandler",
"useParams":"_ADMIN_FILE"

View File

@ -24,13 +24,16 @@ import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
import org.apache.solr.client.solrj.request.V2Request;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.HealthCheckResponse;
import org.apache.solr.client.solrj.response.V2Response;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.BeforeClass;
import org.junit.Test;
@ -53,42 +56,78 @@ public class HealthCheckHandlerTest extends SolrCloudTestCase {
// as compared with testHealthCheckHandlerWithCloudClient
// (Not sure if that's actually a good thing -- but it's how the existing test worked)
assertEquals(CommonParams.OK,
req.process(cluster.getSolrClient()).getResponse().get(CommonParams.STATUS));
req.process(cluster.getSolrClient()).getResponse().get(CommonParams.STATUS));
// positive check that our existing "healthy" node works with direct http client
try (HttpSolrClient httpSolrClient = getHttpSolrClient(cluster.getJettySolrRunner(0).getBaseUrl().toString())) {
SolrResponse response = req.process(httpSolrClient);
assertEquals(CommonParams.OK, response.getResponse().get(CommonParams.STATUS));
}
// successfully create a dummy collection
try (HttpSolrClient httpSolrClient = getHttpSolrClient(cluster.getJettySolrRunner(0).getBaseUrl().toString())) {
CollectionAdminResponse collectionAdminResponse = CollectionAdminRequest.createCollection("test", "_default", 1, 1)
.withProperty("solr.directoryFactory", "solr.StandardDirectoryFactory")
.process(httpSolrClient);
assertEquals(0, collectionAdminResponse.getStatus());
SolrResponse response = req.process(httpSolrClient);
assertEquals(CommonParams.OK, response.getResponse().get(CommonParams.STATUS));
} finally {
cluster.deleteAllCollections();
cluster.deleteAllConfigSets();
}
// add a new node for the purpose of negative testing
JettySolrRunner newJetty = cluster.startJettySolrRunner();
try (HttpSolrClient httpSolrClient = getHttpSolrClient(newJetty.getBaseUrl().toString())) {
// positive check that our (new) "healthy" node works with direct http client
assertEquals(CommonParams.OK, req.process(httpSolrClient).getResponse().get(CommonParams.STATUS));
// now "break" our (new) node
newJetty.getCoreContainer().getZkController().getZkClient().close();
// negative check of our (new) "broken" node that we deliberately put into an unhealthy state
HttpSolrClient.RemoteSolrException e = expectThrows(HttpSolrClient.RemoteSolrException.class, () ->
{
req.process(httpSolrClient);
});
{
req.process(httpSolrClient);
});
assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
} finally {
newJetty.stop();
}
// add a new node for the purpose of negative testing
// negative check that if core container is not available at the node
newJetty = cluster.startJettySolrRunner();
try (HttpSolrClient httpSolrClient = getHttpSolrClient(newJetty.getBaseUrl().toString())) {
// positive check that our (new) "healthy" node works with direct http client
assertEquals(CommonParams.OK, req.process(httpSolrClient).getResponse().get(CommonParams.STATUS));
// shutdown the core container of new node
newJetty.getCoreContainer().shutdown();
// api should be unreachable now that the core container is shut down
SolrException thrown = expectThrows(SolrException.class, () -> {
req.process(httpSolrClient).getResponse().get(CommonParams.STATUS);
fail("API shouldn't be available, and fail at above request");
});
assertEquals("Exception code should be 404", 404, thrown.code());
assertTrue("Should have seen an exception containing the an error", thrown.getMessage().contains(
"Error processing the request. CoreContainer is either not initialized or shutting down."));
} finally {
newJetty.stop();
}
// (redundant) positive check that our (previously) existing "healthy" node (still) works
// after getting negative results from our broken node
// after getting negative results from our broken node and failed core container
try (HttpSolrClient httpSolrClient = getHttpSolrClient(cluster.getJettySolrRunner(0).getBaseUrl().toString())) {
assertEquals(CommonParams.OK, req.process(httpSolrClient).getResponse().get(CommonParams.STATUS));
}
}
@Test
@ -101,11 +140,40 @@ public class HealthCheckHandlerTest extends SolrCloudTestCase {
}
}
@Test (expected = AssertionError.class)
@Test(expected = AssertionError.class)
public void testHealthCheckHandlerWithCloudClient() throws IOException, SolrServerException {
// negative check of a HealthCheckRequest using cloud solr client
HealthCheckRequest req = new HealthCheckRequest();
req.process(cluster.getSolrClient());
}
}
@Test
public void testHealthCheckV2Api() throws Exception {
V2Response res = new V2Request.Builder("/node/health").build().process(cluster.getSolrClient());
assertEquals(0, res.getStatus());
assertEquals(CommonParams.OK, res.getResponse().get(CommonParams.STATUS));
// add a new node for the purpose of negative testing
JettySolrRunner newJetty = cluster.startJettySolrRunner();
try (HttpSolrClient httpSolrClient = getHttpSolrClient(newJetty.getBaseUrl().toString())) {
// positive check that our (new) "healthy" node works with direct http client
assertEquals(CommonParams.OK, new V2Request.Builder("/node/health").build().process(httpSolrClient).
getResponse().get(CommonParams.STATUS));
// now "break" our (new) node
newJetty.getCoreContainer().getZkController().getZkClient().close();
// negative check of our (new) "broken" node that we deliberately put into an unhealthy state
HttpSolrClient.RemoteSolrException e = expectThrows(HttpSolrClient.RemoteSolrException.class, () ->
{
new V2Request.Builder("/node/health").build().process(httpSolrClient);
});
assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
} finally {
newJetty.stop();
}
}
}

View File

@ -87,6 +87,7 @@ public class SolrCoreTest extends SolrTestCaseJ4 {
int ihCount = 0;
{
++ihCount; assertEquals(pathToClassMap.get("/admin/health"), "solr.HealthCheckHandler");
++ihCount; assertEquals(pathToClassMap.get("/admin/file"), "solr.ShowFileRequestHandler");
++ihCount; assertEquals(pathToClassMap.get("/admin/logging"), "solr.LoggingHandler");
++ihCount; assertEquals(pathToClassMap.get("/admin/luke"), "solr.LukeRequestHandler");

View File

@ -128,6 +128,18 @@ Threads:: Return info on all JVM threads.
v2: `api/node/threads` |{solr-javadocs}/solr-core/org/apache/solr/handler/admin/ThreadDumpHandler.html[ThreadDumpHandler] |`_ADMIN_THREADS`
|===
Health:: Report the health of the node (_available only in SolrCloud mode_).
+
[cols="3*.",frame=none,grid=cols,options="header"]
|===
|API Endpoints |Class & Javadocs |Paramset
|v1: `solr/admin/info/health`
v2: `api/node/health` |{solr-javadocs}/solr-core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler] |`_ADMIN_HEALTH`
|===
This endpoint can also take the collection or core name in the path (`solr/<collection>/admin/health` or `solr/<core>/admin/health`).
=== Analysis Handlers
[horizontal]

View File

@ -176,8 +176,8 @@ public interface CommonParams {
String OMIT_HEADER = "omitHeader";
String CORES_HANDLER_PATH = "/admin/cores";
String COLLECTIONS_HANDLER_PATH = "/admin/collections";
String HEALTH_CHECK_HANDLER_PATH = "/admin/health";
String INFO_HANDLER_PATH = "/admin/info";
String HEALTH_CHECK_HANDLER_PATH = INFO_HANDLER_PATH + "/health";
String CONFIGSETS_HANDLER_PATH = "/admin/configs";
String AUTHZ_PATH = "/admin/authorization";
String AUTHC_PATH = "/admin/authentication";

View File

@ -1,11 +1,12 @@
{
"description": "Provides information about system properties, threads, logging settings, and system details for a node.",
"description": "Provides information about system properties, threads, logging settings, system details and health (available in Solrcloud mode) for a node.",
"methods": ["GET"],
"url": {
"paths": [
"/node/properties",
"/node/threads",
"/node/logging",
"/node/system"]
"/node/system",
"/node/health"]
}
}