From 43dc72f1a56ccbd502c818ed24a01c6dcfc6123d Mon Sep 17 00:00:00 2001 From: Jake Landis Date: Wed, 9 Oct 2019 10:47:21 -0500 Subject: [PATCH] =?UTF-8?q?Fix=20cluster=20alert=20for=20watcher/monitorin?= =?UTF-8?q?g=20IndexOutOfBoundsExcep=E2=80=A6=20(#47756)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a cluster sending monitoring data is unhealthy and triggers an alert, then stops sending data the following exception [1] can occur. This exception stops the current Watch and the behavior is actually correct in part due to the exception. Simply fixing the exception introduces some incorrect behavior. Now that the Watch does not error in the this case, it will result in an incorrectly "resolved" alert. The fix here is two parts a) fix the exception b) fix the following incorrect behavior. a) fixing the exception is as easy as checking the size of the array before accessing it. b) fixing the following incorrect behavior is a bit more intrusive - Note - the UI depends on the success/met state for each condition to determine an "OK" or "FIRING" In this scenario, where an unhealthy cluster triggers an alert and then goes silent, it should keep "FIRING" until it hears back that the cluster is green. To keep the Watch "FIRING" either the index action or the email action needs to fire. Since the Watch is neither a "new" alert or a "resolved" alert, we do not want to keep sending an email (that would be non-passive too). Without completely changing the logic of how an alert is resolved allowing the index action to take place would result in the alert being resolved. Since we can not keep "FIRING" either the email or index action (since we don't want to resolve the alert nor re-write the logic for alert resolution), we will introduce a 3rd action. A logging action that WILL fire when the cluster is unhealthy. Specifically will fire when there is an unresolved alert and it can not find the cluster state. This logging action is logged at debug, so it should be noticed much. This logging action serves as an 'anchor' for the UI to keep the state in an a "FIRING" status until the alert is resolved. This presents a possible scenario where a cluster starts firing, then goes completely silent forever, the Watch will be "FIRING" forever. This is an edge case that already exists in some scenarios and requires manual intervention to remove that Watch. This changes changes to use a template-like method to populate the version_created for the default monitoring watches. The version is set to 7.5 since that is where this is first introduced. Fixes #43184 --- .../monitoring/exporter/ClusterAlertsUtil.java | 11 ++++++++++- .../watches/elasticsearch_cluster_status.json | 18 +++++++++++++++--- .../watches/elasticsearch_nodes.json | 2 +- .../elasticsearch_version_mismatch.json | 2 +- .../watches/kibana_version_mismatch.json | 2 +- .../watches/logstash_version_mismatch.json | 2 +- .../watches/xpack_license_expiration.json | 2 +- .../exporter/ClusterAlertsUtilTests.java | 1 + 8 files changed, 31 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtil.java b/x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtil.java index 2fe7e983a7a..0aae7194487 100644 --- a/x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtil.java +++ b/x-pack/plugin/monitoring/src/main/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtil.java @@ -49,11 +49,19 @@ public class ClusterAlertsUtil { private static final Pattern UNIQUE_WATCH_ID_PROPERTY = Pattern.compile(Pattern.quote("${monitoring.watch.unique_id}")); + /** + * Replace the ${monitoring.watch.unique_id} field in the watches. + * + * @see #createUniqueWatchId(ClusterService, String) + */ + private static final Pattern VERSION_CREATED_PROPERTY = + Pattern.compile(Pattern.quote("${monitoring.version_created}")); + /** * The last time that all watches were updated. For now, all watches have been updated in the same version and should all be replaced * together. */ - public static final int LAST_UPDATED_VERSION = Version.V_7_0_0.id; + public static final int LAST_UPDATED_VERSION = Version.V_7_5_0.id; /** * An unsorted list of Watch IDs representing resource files for Monitoring Cluster Alerts. @@ -113,6 +121,7 @@ public class ClusterAlertsUtil { source = CLUSTER_UUID_PROPERTY.matcher(source).replaceAll(clusterUuid); source = WATCH_ID_PROPERTY.matcher(source).replaceAll(watchId); source = UNIQUE_WATCH_ID_PROPERTY.matcher(source).replaceAll(uniqueWatchId); + source = VERSION_CREATED_PROPERTY.matcher(source).replaceAll(Integer.toString(LAST_UPDATED_VERSION)); return source; } catch (final IOException e) { diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_cluster_status.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_cluster_status.json index 4e250d5d743..16e52bce019 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_cluster_status.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_cluster_status.json @@ -7,7 +7,7 @@ "link": "elasticsearch/indices", "severity": 2100, "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, @@ -134,11 +134,23 @@ }, "transform": { "script": { - "source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;def state = ctx.payload.check.hits.hits[0]._source.cluster_state.status;if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;" + "source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;ctx.vars.found_state = ctx.payload.check.hits.total != 0;def state = ctx.vars.found_state ? ctx.payload.check.hits.hits[0]._source.cluster_state.status : 'unknown';if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;" } }, "actions": { + "log_state_not_found": { + "condition": { + "script": "!ctx.vars.found_state" + }, + "logging" : { + "text" : "Watch [{{ctx.metadata.xpack.watch}}] could not determine cluster state for cluster [{{ctx.metadata.xpack.cluster_uuid}}]. This likely means the cluster has not sent any monitoring data recently.", + "level" : "debug" + } + }, "add_to_alerts_index": { + "condition": { + "script": "ctx.vars.found_state" + }, "index": { "index": ".monitoring-alerts-7", "doc_id": "${monitoring.watch.unique_id}" @@ -146,7 +158,7 @@ }, "send_email_to_admin": { "condition": { - "script": "return ctx.vars.email_recipient != null && (ctx.vars.is_new || ctx.vars.is_resolved)" + "script": "return ctx.vars.email_recipient != null && ctx.vars.found_state && (ctx.vars.is_new || ctx.vars.is_resolved)" }, "email": { "to": "X-Pack Admin <{{ctx.vars.email_recipient}}>", diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_nodes.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_nodes.json index d79cb786267..4347801fa2a 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_nodes.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_nodes.json @@ -7,7 +7,7 @@ "link": "elasticsearch/nodes", "severity": 1999, "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_version_mismatch.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_version_mismatch.json index 37132a03c7b..05fa8396623 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_version_mismatch.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/elasticsearch_version_mismatch.json @@ -7,7 +7,7 @@ "link": "elasticsearch/nodes", "severity": 1000, "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/kibana_version_mismatch.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/kibana_version_mismatch.json index 3e08fd98843..b35137ad140 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/kibana_version_mismatch.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/kibana_version_mismatch.json @@ -7,7 +7,7 @@ "link": "kibana/instances", "severity": 1000, "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/logstash_version_mismatch.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/logstash_version_mismatch.json index 8bb5b5efe9d..8417ef4d069 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/logstash_version_mismatch.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/logstash_version_mismatch.json @@ -7,7 +7,7 @@ "link": "logstash/instances", "severity": 1000, "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, diff --git a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/xpack_license_expiration.json b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/xpack_license_expiration.json index 3f1f49e0240..35041919141 100644 --- a/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/xpack_license_expiration.json +++ b/x-pack/plugin/monitoring/src/main/resources/monitoring/watches/xpack_license_expiration.json @@ -8,7 +8,7 @@ "alert_index": ".monitoring-alerts-7", "cluster_uuid": "${monitoring.watch.cluster_uuid}", "type": "monitoring", - "version_created": 7000099, + "version_created": "${monitoring.version_created}", "watch": "${monitoring.watch.id}" } }, diff --git a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtilTests.java b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtilTests.java index 868cd17b3eb..7ce728e2582 100644 --- a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtilTests.java +++ b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/exporter/ClusterAlertsUtilTests.java @@ -68,6 +68,7 @@ public class ClusterAlertsUtilTests extends ESTestCase { assertThat(watch, notNullValue()); assertThat(watch, containsString(clusterUuid)); assertThat(watch, containsString(watchId)); + assertThat(watch, containsString(String.valueOf(ClusterAlertsUtil.LAST_UPDATED_VERSION))); if ("elasticsearch_nodes".equals(watchId) == false) { assertThat(watch, containsString(clusterUuid + "_" + watchId));