Fix cluster alert for watcher/monitoring IndexOutOfBoundsException (#47756)
If a cluster that is sending monitoring data is unhealthy and triggers an alert, and then stops sending data, the following exception [1] can occur. This exception stops the current Watch, and that behavior is, in part, correct precisely because of the exception. Simply fixing the exception introduces incorrect behavior: once the Watch no longer errors in this case, the alert is incorrectly marked "resolved".

The fix here has two parts: a) fix the exception, and b) fix the incorrect behavior that follows.

a) Fixing the exception is as easy as checking the size of the array before accessing it.

b) Fixing the follow-on behavior is a bit more intrusive. Note: the UI depends on the success/met state of each condition to determine an "OK" or "FIRING" status. In this scenario, where an unhealthy cluster triggers an alert and then goes silent, the Watch should stay "FIRING" until it hears back that the cluster is green. To keep the Watch "FIRING", either the index action or the email action needs to fire. Since the Watch is neither a "new" alert nor a "resolved" alert, we do not want to keep sending an email (that would not be passive either). And without completely changing the logic of how an alert is resolved, allowing the index action to take place would resolve the alert. Since we cannot keep the Watch "FIRING" via either the email or the index action (we don't want to resolve the alert, nor rewrite the alert-resolution logic), we introduce a third action: a logging action that WILL fire when the cluster is unhealthy. Specifically, it fires when there is an unresolved alert and the cluster state cannot be found. The logging action logs at debug level, so it should not be noticed much. It serves as an 'anchor' for the UI, keeping the state "FIRING" until the alert is resolved. This leaves one possible scenario: if a cluster starts firing and then goes completely silent forever, the Watch will stay "FIRING" forever. That edge case already exists in some scenarios and requires manual intervention to remove the Watch.

This change also switches to a template-like method to populate version_created for the default monitoring watches. The version is set to 7.5 since that is where this change is first introduced.

Fixes #43184
parent 2abd9d53b6
commit 43dc72f1a5
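Before the diff, a minimal, illustrative Java sketch of the guard described in fix (a): only read index 0 when the monitoring query actually returned a hit. The real change is the Painless found_state check in the watch's transform script shown in the diff below; the class and method names here are made up for illustration.

import java.util.List;

public class ClusterStateGuardSketch {

    // Mirrors the new ctx.vars.found_state idea: only read hits[0] when the
    // monitoring query returned a cluster_state document at all.
    static String resolveState(List<String> checkHits) {
        boolean foundState = !checkHits.isEmpty();
        return foundState ? checkHits.get(0) : "unknown";
    }

    public static void main(String[] args) {
        // Healthy flow: the cluster is still shipping monitoring data.
        System.out.println(resolveState(List.of("red")));   // prints "red"
        // Silent cluster: without the guard, get(0) here would throw IndexOutOfBoundsException.
        System.out.println(resolveState(List.of()));        // prints "unknown"
    }
}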
@@ -49,11 +49,19 @@ public class ClusterAlertsUtil {
     private static final Pattern UNIQUE_WATCH_ID_PROPERTY =
             Pattern.compile(Pattern.quote("${monitoring.watch.unique_id}"));
+
+    /**
+     * Replace the <code>${monitoring.watch.unique_id}</code> field in the watches.
+     *
+     * @see #createUniqueWatchId(ClusterService, String)
+     */
+    private static final Pattern VERSION_CREATED_PROPERTY =
+            Pattern.compile(Pattern.quote("${monitoring.version_created}"));
 
     /**
      * The last time that all watches were updated. For now, all watches have been updated in the same version and should all be replaced
      * together.
      */
-    public static final int LAST_UPDATED_VERSION = Version.V_7_0_0.id;
+    public static final int LAST_UPDATED_VERSION = Version.V_7_5_0.id;
 
     /**
      * An unsorted list of Watch IDs representing resource files for Monitoring Cluster Alerts.

@@ -113,6 +121,7 @@ public class ClusterAlertsUtil {
             source = CLUSTER_UUID_PROPERTY.matcher(source).replaceAll(clusterUuid);
             source = WATCH_ID_PROPERTY.matcher(source).replaceAll(watchId);
             source = UNIQUE_WATCH_ID_PROPERTY.matcher(source).replaceAll(uniqueWatchId);
+            source = VERSION_CREATED_PROPERTY.matcher(source).replaceAll(Integer.toString(LAST_UPDATED_VERSION));
 
             return source;
         } catch (final IOException e) {
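As context for the templating change above (and the ${monitoring.version_created} placeholders in the watch JSON below), here is a minimal standalone sketch of how this kind of quoted-literal placeholder substitution behaves. The class name is illustrative, and 7050099 is assumed to be the id of Version.V_7_5_0.

import java.util.regex.Pattern;

public class VersionCreatedSubstitutionSketch {

    // Same shape as VERSION_CREATED_PROPERTY above: Pattern.quote makes the
    // "${monitoring.version_created}" placeholder match literally rather than as regex syntax.
    private static final Pattern VERSION_CREATED_PROPERTY =
            Pattern.compile(Pattern.quote("${monitoring.version_created}"));

    public static void main(String[] args) {
        String source = "\"version_created\": \"${monitoring.version_created}\"";
        String rendered = VERSION_CREATED_PROPERTY.matcher(source)
                .replaceAll(Integer.toString(7050099)); // assumed Version.V_7_5_0.id
        System.out.println(rendered); // "version_created": "7050099"
    }
}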
@@ -7,7 +7,7 @@
       "link": "elasticsearch/indices",
       "severity": 2100,
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },

@@ -134,11 +134,23 @@
   },
   "transform": {
     "script": {
-      "source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;def state = ctx.payload.check.hits.hits[0]._source.cluster_state.status;if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;"
+      "source": "ctx.vars.email_recipient = (ctx.payload.kibana_settings.hits.total > 0 && ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack != null) ? ctx.payload.kibana_settings.hits.hits[0]._source.kibana_settings.xpack.default_admin_email : null;ctx.vars.is_new = ctx.vars.fails_check && !ctx.vars.not_resolved;ctx.vars.is_resolved = !ctx.vars.fails_check && ctx.vars.not_resolved;ctx.vars.found_state = ctx.payload.check.hits.total != 0;def state = ctx.vars.found_state ? ctx.payload.check.hits.hits[0]._source.cluster_state.status : 'unknown';if (ctx.vars.not_resolved){ctx.payload = ctx.payload.alert.hits.hits[0]._source;if (ctx.vars.fails_check == false) {ctx.payload.resolved_timestamp = ctx.execution_time;}} else {ctx.payload = ['timestamp': ctx.execution_time, 'metadata': ctx.metadata.xpack];}if (ctx.vars.fails_check) {ctx.payload.prefix = 'Elasticsearch cluster status is ' + state + '.';if (state == 'red') {ctx.payload.message = 'Allocate missing primary shards and replica shards.';ctx.payload.metadata.severity = 2100;} else {ctx.payload.message = 'Allocate missing replica shards.';ctx.payload.metadata.severity = 1100;}}ctx.vars.state = state.toUpperCase();ctx.payload.update_timestamp = ctx.execution_time;return ctx.payload;"
     }
   },
   "actions": {
+    "log_state_not_found": {
+      "condition": {
+        "script": "!ctx.vars.found_state"
+      },
+      "logging" : {
+        "text" : "Watch [{{ctx.metadata.xpack.watch}}] could not determine cluster state for cluster [{{ctx.metadata.xpack.cluster_uuid}}]. This likely means the cluster has not sent any monitoring data recently.",
+        "level" : "debug"
+      }
+    },
     "add_to_alerts_index": {
+      "condition": {
+        "script": "ctx.vars.found_state"
+      },
       "index": {
         "index": ".monitoring-alerts-7",
         "doc_id": "${monitoring.watch.unique_id}"

@@ -146,7 +158,7 @@
     },
     "send_email_to_admin": {
       "condition": {
-        "script": "return ctx.vars.email_recipient != null && (ctx.vars.is_new || ctx.vars.is_resolved)"
+        "script": "return ctx.vars.email_recipient != null && ctx.vars.found_state && (ctx.vars.is_new || ctx.vars.is_resolved)"
       },
       "email": {
         "to": "X-Pack Admin <{{ctx.vars.email_recipient}}>",
@@ -7,7 +7,7 @@
       "link": "elasticsearch/nodes",
       "severity": 1999,
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },
@@ -7,7 +7,7 @@
       "link": "elasticsearch/nodes",
       "severity": 1000,
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },
@@ -7,7 +7,7 @@
       "link": "kibana/instances",
       "severity": 1000,
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },
@@ -7,7 +7,7 @@
       "link": "logstash/instances",
       "severity": 1000,
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },
@@ -8,7 +8,7 @@
       "alert_index": ".monitoring-alerts-7",
       "cluster_uuid": "${monitoring.watch.cluster_uuid}",
       "type": "monitoring",
-      "version_created": 7000099,
+      "version_created": "${monitoring.version_created}",
       "watch": "${monitoring.watch.id}"
     }
   },
@@ -68,6 +68,7 @@ public class ClusterAlertsUtilTests extends ESTestCase {
         assertThat(watch, notNullValue());
         assertThat(watch, containsString(clusterUuid));
         assertThat(watch, containsString(watchId));
+        assertThat(watch, containsString(String.valueOf(ClusterAlertsUtil.LAST_UPDATED_VERSION)));
 
         if ("elasticsearch_nodes".equals(watchId) == false) {
             assertThat(watch, containsString(clusterUuid + "_" + watchId));