Fix cluster alert for watcher/monitoring IndexOutOfBoundsException (#47756)

If a cluster sending monitoring data is unhealthy and triggers an
alert, and then stops sending data, the following exception [1] can occur.

This exception stops the current Watch, and the resulting behavior is
actually correct, in part because of the exception. Simply fixing the
exception introduces some incorrect behavior: once the Watch no longer
errors in this case, it results in an incorrectly "resolved" alert.
The fix here has two parts: a) fix the exception, and b) fix the
incorrect behavior that follows from it.

a) fixing the exception is as easy as checking the size of the
array before accessing it.

b) fixing the following incorrect behavior is a bit more intrusive

- Note - the UI depends on the success/met state for each condition
to determine an "OK" or "FIRING" status.

In this scenario, where an unhealthy cluster triggers an alert and
then goes silent, it should keep "FIRING" until it hears back that
the cluster is green. To keep the Watch "FIRING", either the index
action or the email action needs to fire. Since the Watch is neither
a "new" alert nor a "resolved" alert, we do not want to keep sending
an email (that would be non-passive too). Without completely changing
the logic of how an alert is resolved, allowing the index action to
take place would result in the alert being resolved. Since we cannot
keep "FIRING" via either the email or index action (since we want
neither to resolve the alert nor to rewrite the logic for alert
resolution), we will introduce a 3rd action: a logging action that WILL
fire when the cluster is unhealthy. Specifically, it will fire when
there is an unresolved alert and it cannot find the cluster state.
This logging action is logged at debug, so it should not be noticed much.
This logging action serves as an 'anchor' for the UI to keep the state
in a "FIRING" status until the alert is resolved.

This presents a possible scenario: if a cluster starts firing and
then goes completely silent forever, the Watch will be "FIRING"
forever. This is an edge case that already exists in some scenarios
and requires manual intervention to remove that Watch.

This change also uses a template-like method to populate the
version_created for the default monitoring watches. The version is
set to 7.5 since that is where this is first introduced.

Fixes #43184
This commit is contained in:
Jake Landis 2019-10-09 10:47:21 -05:00 committed by GitHub
parent 2abd9d53b6
commit 43dc72f1a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 31 additions and 9 deletions

View File

@ -49,11 +49,19 @@ public class ClusterAlertsUtil {
private static final Pattern UNIQUE_WATCH_ID_PROPERTY =
Pattern.compile(Pattern.quote("${monitoring.watch.unique_id}"));
/**
* Replace the <code>${monitoring.version_created}</code> field in the watches.
* <p>
* The placeholder is substituted with the string form of {@link #LAST_UPDATED_VERSION}.
*
* @see #LAST_UPDATED_VERSION
*/
private static final Pattern VERSION_CREATED_PROPERTY =
Pattern.compile(Pattern.quote("${monitoring.version_created}"));
/**
* The last time that all watches were updated. For now, all watches have been updated in the same version and should all be replaced
* together.
*/
public static final int LAST_UPDATED_VERSION = Version.V_7_0_0.id;
public static final int LAST_UPDATED_VERSION = Version.V_7_5_0.id;
/**
* An unsorted list of Watch IDs representing resource files for Monitoring Cluster Alerts.
@ -113,6 +121,7 @@ public class ClusterAlertsUtil {
source = CLUSTER_UUID_PROPERTY.matcher(source).replaceAll(clusterUuid);
source = WATCH_ID_PROPERTY.matcher(source).replaceAll(watchId);
source = UNIQUE_WATCH_ID_PROPERTY.matcher(source).replaceAll(uniqueWatchId);
source = VERSION_CREATED_PROPERTY.matcher(source).replaceAll(Integer.toString(LAST_UPDATED_VERSION));
return source;
} catch (final IOException e) {

View File

@ -7,7 +7,7 @@
"link": "elasticsearch/indices",
"severity": 2100,
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},
@ -134,11 +134,23 @@
},
"transform": {
"script": {
"source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;def state = ctx.payload.check.hits.hits[0]._source.cluster_state.status;if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;"
"source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;ctx.vars.found_state = ctx.payload.check.hits.total != 0;def state = ctx.vars.found_state ? ctx.payload.check.hits.hits[0]._source.cluster_state.status : 'unknown';if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;"
}
},
"actions": {
"log_state_not_found": {
"condition": {
"script": "!ctx.vars.found_state"
},
"logging" : {
"text" : "Watch [{{ctx.metadata.xpack.watch}}] could not determine cluster state for cluster [{{ctx.metadata.xpack.cluster_uuid}}]. This likely means the cluster has not sent any monitoring data recently.",
"level" : "debug"
}
},
"add_to_alerts_index": {
"condition": {
"script": "ctx.vars.found_state"
},
"index": {
"index": ".monitoring-alerts-7",
"doc_id": "${monitoring.watch.unique_id}"
@ -146,7 +158,7 @@
},
"send_email_to_admin": {
"condition": {
"script": "return ctx.vars.email_recipient != null && (ctx.vars.is_new || ctx.vars.is_resolved)"
"script": "return ctx.vars.email_recipient != null && ctx.vars.found_state && (ctx.vars.is_new || ctx.vars.is_resolved)"
},
"email": {
"to": "X-Pack Admin <{{ctx.vars.email_recipient}}>",

View File

@ -7,7 +7,7 @@
"link": "elasticsearch/nodes",
"severity": 1999,
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},

View File

@ -7,7 +7,7 @@
"link": "elasticsearch/nodes",
"severity": 1000,
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},

View File

@ -7,7 +7,7 @@
"link": "kibana/instances",
"severity": 1000,
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},

View File

@ -7,7 +7,7 @@
"link": "logstash/instances",
"severity": 1000,
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},

View File

@ -8,7 +8,7 @@
"alert_index": ".monitoring-alerts-7",
"cluster_uuid": "${monitoring.watch.cluster_uuid}",
"type": "monitoring",
"version_created": 7000099,
"version_created": "${monitoring.version_created}",
"watch": "${monitoring.watch.id}"
}
},

View File

@ -68,6 +68,7 @@ public class ClusterAlertsUtilTests extends ESTestCase {
assertThat(watch, notNullValue());
assertThat(watch, containsString(clusterUuid));
assertThat(watch, containsString(watchId));
assertThat(watch, containsString(String.valueOf(ClusterAlertsUtil.LAST_UPDATED_VERSION)));
if ("elasticsearch_nodes".equals(watchId) == false) {
assertThat(watch, containsString(clusterUuid + "_" + watchId));