[[watching-marvel-data]]
=== Watching Marvel Data

If you use Marvel to monitor your Elasticsearch deployment, you can set up
watches to take action when something out of the ordinary occurs. For example,
you could set up watches to alert on:

- <<watching-cluster-health, Cluster health changes>>
- <<watching-memory-usage, High memory usage>>
- <<watching-cpu-usage, High CPU usage>>
- <<watching-open-file-descriptors, High file descriptor usage>>
- <<watching-fielddata, High fielddata cache usage>>
- <<watching-nodes, Nodes joining or leaving the cluster>>

NOTE: These watches query the index where your cluster's Marvel data is stored.
If you don't have Marvel installed, the queries won't return any results, the
conditions evaluate to false, and no actions are performed.
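
Before you create these watches, you can confirm that Marvel data is actually
being indexed. For example, a count request against the Marvel indices (a one-off
sanity check, not part of any watch) should return a non-zero count when Marvel is
installed and shipping data:

[source,json]
--------------------------------------------------
GET .marvel-*/_count
--------------------------------------------------
// AUTOSENSE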

[float]
[[watching-cluster-health]]
==== Watching Cluster Health

This watch checks the cluster health once a minute and takes action if the cluster state has
been red for the last 60 seconds:

- The watch schedule is set to execute the watch every minute.
- The watch input gets the most recent cluster status from the `.marvel-*` indices.
- The watch condition checks the cluster status to see if it's been red for the last 60 seconds.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/cluster_red_alert
{
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": ".marvel-*",
        "types": "cluster_stats",
        "body": {
          "query": {
            "filtered": {
              "filter": {
                "bool": {
                  "must": [
                    {
                      "range": {
                        "@timestamp": {
                          "gte": "now-2m",
                          "lte": "now"
                        }
                      }
                    }
                  ],
                  "should": [
                    {
                      "term": {
                        "status.raw": "red"
                      }
                    },
                    {
                      "term": {
                        "status.raw": "green"
                      }
                    },
                    {
                      "term": {
                        "status.raw": "yellow"
                      }
                    }
                  ]
                }
              }
            }
          },
          "fields": ["@timestamp","status"],
          "sort": [
            {
              "@timestamp": {
                "order": "desc"
              }
            }
          ],
          "size": 1,
          "aggs": {
            "minutes": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "5s"
              },
              "aggs": {
                "status": {
                  "terms": {
                    "field": "status.raw",
                    "size": 3
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "throttle_period": "30m", <1>
  "condition": {
    "script": {
      "inline": "if (ctx.payload.hits.total < 1) return false; def rows = ctx.payload.hits.hits; if (rows[0].fields.status[0] != 'red') return false; if (ctx.payload.aggregations.minutes.buckets.size() < 12) return false; def last60Seconds = ctx.payload.aggregations.minutes.buckets[-12..-1]; return last60Seconds.every { it.status.buckets.every { s -> s.key == 'red' } }"
    }
  },
  "actions": {
    "send_email": { <2>
      "email": {
        "to": "<username>@<domainname>", <3>
        "subject": "Watcher Notification - Cluster has been RED for the last 60 seconds",
        "body": "Your cluster has been red for the last 60 seconds."
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The throttle period prevents notifications from being sent more than once every 30 minutes.
You can change the throttle period to receive notifications more or less frequently.
<2> To send email notifications, you must configure at least one email account in `elasticsearch.yml`.
See <<email-services, Configuring Email Services>> for more information.
<3> Specify the email address you want to notify.
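
For reference, email accounts are defined under the `watcher.actions.email.service.account`
settings in `elasticsearch.yml`. The following is only a sketch; the account name `work` and
all SMTP values are placeholders you must replace with your own (see
<<email-services, Configuring Email Services>> for the full set of options):

[source,yaml]
--------------------------------------------------
watcher.actions.email.service.account:
  work:
    email_defaults:
      from: 'Watcher Alerts <watcher@example.com>'
    smtp:
      auth: true
      starttls.enable: true
      host: smtp.example.com
      port: 587
      user: watcher@example.com
      password: changeme
--------------------------------------------------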

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in
Elasticsearch. While this is convenient when you're experimenting with Watcher, in a
production environment we recommend disabling dynamic scripting and using file scripts.
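
As a rough sketch of the file-script approach (the script name `cluster_red_check` is just an
example): save the condition's Groovy code as `config/scripts/cluster_red_check.groovy` on each
node, then reference the script by name instead of inlining it. Check the Watcher scripting
documentation for the exact syntax supported by your version:

[source,json]
--------------------------------------------------
"condition": {
  "script": {
    "file": "cluster_red_check"
  }
}
--------------------------------------------------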

[float]
[[watching-memory-usage]]
==== Watching Memory Usage

This watch runs every minute and takes action if a node in the cluster has averaged 75% or greater
heap usage for the past 60 seconds.

- The watch schedule is set to execute the watch every minute.
- The watch input gets the average `jvm.mem.heap_used_percent` for each node from the `.marvel-*` indices.
- The watch condition checks to see if any node's average heap usage is 75% or greater.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/mem_watch
{
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": [
          ".marvel-*"
        ],
        "body": {
          "size": 0,
          "query": {
            "filtered": {
              "filter": {
                "range": {
                  "@timestamp": {
                    "gte": "now-2m",
                    "lte": "now"
                  }
                }
              }
            }
          },
          "aggs": {
            "minutes": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "minute"
              },
              "aggs": {
                "nodes": {
                  "terms": {
                    "field": "node.name.raw",
                    "size": 10,
                    "order": {
                      "memory": "desc"
                    }
                  },
                  "aggs": {
                    "memory": {
                      "avg": {
                        "field": "jvm.mem.heap_used_percent"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "throttle_period": "30m", <1>
  "condition": {
    "script": "if (ctx.payload.aggregations.minutes.buckets.size() == 0) return false; def latest = ctx.payload.aggregations.minutes.buckets[-1]; def node = latest.nodes.buckets[0]; return node && node.memory && node.memory.value >= 75;"
  },
  "actions": {
    "send_email": {
      "transform": {
        "script": "def latest = ctx.payload.aggregations.minutes.buckets[-1]; return latest.nodes.buckets.findAll { return it.memory && it.memory.value >= 75 };"
      },
      "email": { <2>
        "to": "<username>@<domainname>", <3>
        "subject": "Watcher Notification - HIGH MEMORY USAGE",
        "body": "Nodes with HIGH MEMORY Usage (above 75%):\n\n{{#ctx.payload._value}}\"{{key}}\" - Memory Usage is at {{memory.value}}%\n{{/ctx.payload._value}}"
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The throttle period prevents notifications from being sent more than once every 30 minutes.
You can change the throttle period to receive notifications more or less frequently.
<2> To send email notifications, you must configure at least one email account in `elasticsearch.yml`.
See <<email-services, Configuring Email Services>> for more information.
<3> Specify the email address you want to notify.

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in Elasticsearch.
While this is convenient when you're experimenting with Watcher, in a production
environment we recommend disabling dynamic scripting and using file scripts.
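
To spot-check the heap numbers this watch aggregates, you can also read
`jvm.mem.heap_used_percent` straight from the nodes stats API (a one-off request, not part
of the watch):

[source,json]
--------------------------------------------------
GET _nodes/stats/jvm
--------------------------------------------------
// AUTOSENSE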

[float]
[[watching-cpu-usage]]
==== Watching CPU Usage

This watch runs every minute and takes action if a node in the cluster has averaged 75% or greater
CPU usage for the past 60 seconds.

- The watch schedule is set to execute the watch every minute.
- The watch input gets the average CPU usage for each node from the `.marvel-*` indices.
- The watch condition checks to see if any node's average CPU usage is 75% or greater.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/cpu_usage
{
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": [
          ".marvel-*"
        ],
        "body": {
          "size": 0,
          "query": {
            "filtered": {
              "filter": {
                "range": {
                  "@timestamp": {
                    "gte": "now-2m",
                    "lte": "now"
                  }
                }
              }
            }
          },
          "aggs": {
            "minutes": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "minute"
              },
              "aggs": {
                "nodes": {
                  "terms": {
                    "field": "node.name.raw",
                    "size": 10,
                    "order": {
                      "cpu": "desc"
                    }
                  },
                  "aggs": {
                    "cpu": {
                      "avg": {
                        "field": "os.cpu.user"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "throttle_period": "30m", <1>
  "condition": {
    "script": "if (ctx.payload.aggregations.minutes.buckets.size() == 0) return false; def latest = ctx.payload.aggregations.minutes.buckets[-1]; def node = latest.nodes.buckets[0]; return node && node.cpu && node.cpu.value >= 75;"
  },
  "actions": {
    "send_email": { <2>
      "transform": {
        "script": "def latest = ctx.payload.aggregations.minutes.buckets[-1]; return latest.nodes.buckets.findAll { return it.cpu && it.cpu.value >= 75 };"
      },
      "email": {
        "to": "user@example.com", <3>
        "subject": "Watcher Notification - HIGH CPU USAGE",
        "body": "Nodes with HIGH CPU Usage (above 75%):\n\n{{#ctx.payload._value}}\"{{key}}\" - CPU Usage is at {{cpu.value}}%\n{{/ctx.payload._value}}"
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The throttle period prevents notifications from being sent more than once every 30 minutes.
You can change the throttle period to receive notifications more or less frequently.
<2> To send email notifications, you must configure at least one email account in `elasticsearch.yml`.
See <<email-services, Configuring Email Services>> for more information.
<3> Specify the email address you want to notify.

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in Elasticsearch.
While this is convenient when you're experimenting with Watcher, in a production
environment we recommend disabling dynamic scripting and using file scripts.
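
Like the file descriptor and fielddata watches below, you can move the hard-coded 75% threshold
into the watch `metadata` and reference it from the condition script as `ctx.metadata.threshold`,
which lets you tune the watch without editing the script. A sketch of the relevant parts:

[source,json]
--------------------------------------------------
"metadata": {
  "threshold": 75
},
"condition": {
  "script": "if (ctx.payload.aggregations.minutes.buckets.size() == 0) return false; def latest = ctx.payload.aggregations.minutes.buckets[-1]; def node = latest.nodes.buckets[0]; return node && node.cpu && node.cpu.value >= ctx.metadata.threshold;"
}
--------------------------------------------------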

[float]
[[watching-open-file-descriptors]]
==== Watching Open File Descriptors

This watch runs once a minute and takes action if there are nodes that
are using 80% or more of the available file descriptors.

- The watch schedule is set to execute the watch every minute.
- The watch input gets the average number of open file descriptors on each node from the `.marvel-*`
indices. The input search returns the top ten nodes with the highest average number of open file
descriptors.
- The watch condition checks to see if any node's average number of open file descriptors is at
80% or more of the available file descriptors.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/open_file_descriptors
{
  "metadata": {
    "system_fd": 65535,
    "threshold": 0.8
  },
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": [
          ".marvel-*"
        ],
        "types": "node_stats",
        "body": {
          "size": 0,
          "query": {
            "filtered": {
              "filter": {
                "range": {
                  "@timestamp": {
                    "gte": "now-1m",
                    "lte": "now"
                  }
                }
              }
            }
          },
          "aggs": {
            "minutes": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "5s"
              },
              "aggs": {
                "nodes": {
                  "terms": {
                    "field": "node.name.raw",
                    "size": 10,
                    "order": {
                      "fd": "desc"
                    }
                  },
                  "aggs": {
                    "fd": {
                      "avg": {
                        "field": "process.open_file_descriptors"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "throttle_period": "30m", <1>
  "condition": {
    "script": "if (ctx.payload.aggregations.minutes.buckets.size() == 0) return false; def latest = ctx.payload.aggregations.minutes.buckets[-1]; def node = latest.nodes.buckets[0]; return node && node.fd && node.fd.value >= (ctx.metadata.system_fd * ctx.metadata.threshold);"
  },
  "actions": {
    "send_email": { <2>
      "transform": {
        "script": "def latest = ctx.payload.aggregations.minutes.buckets[-1]; return latest.nodes.buckets.findAll({ return it.fd && it.fd.value >= (ctx.metadata.system_fd * ctx.metadata.threshold) }).collect({ it.fd.percent = Math.round((it.fd.value/ctx.metadata.system_fd)*100); it });"
      },
      "email": {
        "to": "<username>@<domainname>", <3>
        "subject": "Watcher Notification - NODES WITH 80% FILE DESCRIPTORS USED",
        "body": "Nodes with 80% FILE DESCRIPTORS USED (above 80%):\n\n{{#ctx.payload._value}}\"{{key}}\" - File Descriptors is at {{fd.value}} ({{fd.percent}}%)\n{{/ctx.payload._value}}"
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The throttle period prevents notifications from being sent more than once every 30 minutes.
You can change the throttle period to receive notifications more or less frequently.
<2> To send email notifications, you must configure at least one email account in
`elasticsearch.yml`. See <<email-services, Configuring Email Services>> for more
information.
<3> Specify the email address you want to notify.

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in
Elasticsearch. While this is convenient when you're experimenting with Watcher, in a
production environment we recommend disabling dynamic scripting and using file scripts.
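
The `system_fd` value in the watch metadata is an assumption about your nodes' file descriptor
limit; set it to the limit actually configured on your systems. You can check the limit
Elasticsearch sees with the nodes info API, which reports `max_file_descriptors` for each node:

[source,json]
--------------------------------------------------
GET _nodes/process
--------------------------------------------------
// AUTOSENSE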

[float]
[[watching-fielddata]]
==== Watching Field Data Utilization

This watch runs once a minute and takes action if there are nodes that
are using 80% or more of their field data cache.

- The watch schedule is set to execute the watch every minute.
- The watch input gets the average field data memory usage on each node from the `.marvel-*` indices.
The input search returns the top ten nodes with the highest average field data usage.
- The watch condition checks to see if any node's average field data usage is 80% or more of the
field data cache size.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/fielddata_utilization
{
  "metadata": {
    "fielddata_cache_size": 100000, <1>
    "threshold": 0.8
  },
  "trigger": {
    "schedule": {
      "interval": "1m"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": [
          ".marvel-*"
        ],
        "types": "node_stats",
        "body": {
          "size": 0,
          "query": {
            "filtered": {
              "filter": {
                "range": {
                  "@timestamp": {
                    "gte": "now-1m",
                    "lte": "now"
                  }
                }
              }
            }
          },
          "aggs": {
            "minutes": {
              "date_histogram": {
                "field": "@timestamp",
                "interval": "5s"
              },
              "aggs": {
                "nodes": {
                  "terms": {
                    "field": "node.name.raw",
                    "size": 10,
                    "order": {
                      "fielddata": "desc"
                    }
                  },
                  "aggs": {
                    "fielddata": {
                      "avg": {
                        "field": "indices.fielddata.memory_size_in_bytes"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  "throttle_period": "30m", <2>
  "condition": {
    "script": "if (ctx.payload.aggregations.minutes.buckets.size() == 0) return false; def latest = ctx.payload.aggregations.minutes.buckets[-1]; def node = latest.nodes.buckets[0]; return node && node.fielddata && node.fielddata.value >= (ctx.metadata.fielddata_cache_size * ctx.metadata.threshold);"
  },
  "actions": {
    "send_email": { <3>
      "transform": {
        "script": "def latest = ctx.payload.aggregations.minutes.buckets[-1]; return latest.nodes.buckets.findAll({ return it.fielddata && it.fielddata.value >= (ctx.metadata.fielddata_cache_size * ctx.metadata.threshold) }).collect({ it.fielddata.percent = Math.round((it.fielddata.value/ctx.metadata.fielddata_cache_size)*100); it });"
      },
      "email": {
        "to": "<username>@<domainname>", <4>
        "subject": "Watcher Notification - NODES WITH 80% FIELDDATA UTILIZATION",
        "body": "Nodes with 80% FIELDDATA UTILIZATION (above 80%):\n\n{{#ctx.payload._value}}\"{{key}}\" - Fielddata utilization is at {{fielddata.value}} bytes ({{fielddata.percent}}%)\n{{/ctx.payload._value}}"
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The size of the field data cache in bytes. Set this to the actual cache size configured for your nodes.
<2> The throttle period prevents notifications from being sent more than once every 30 minutes.
You can change the throttle period to receive notifications more or less frequently.
<3> To send email notifications, you must configure at least one email account in
`elasticsearch.yml`. See <<email-services, Configuring Email Services>> for more
information.
<4> Specify the email address you want to notify.

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in
Elasticsearch. While this is convenient when you're experimenting with Watcher, in a
production environment we recommend disabling dynamic scripting and using file scripts.
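
To see how much field data memory each node is currently using (useful when picking the
`fielddata_cache_size` value), the cat fielddata API gives a quick per-node summary, if your
Elasticsearch version includes it:

[source,json]
--------------------------------------------------
GET _cat/fielddata?v
--------------------------------------------------
// AUTOSENSE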

[[watching-nodes]]
[float]
==== Watching for Nodes Joining or Leaving a Cluster

This watch checks every minute to see if a node has joined or left the cluster:

- The watch schedule is set to execute the watch every minute.
- The watch input searches for `node_left` and `node_joined` events in the past 60 seconds.
- The watch condition checks to see if there are any search results in the payload. If so,
the watch actions are performed.
- The watch action is to send an email. (You could also call a `webhook` or store the event.)

[source,json]
--------------------------------------------------
PUT _watcher/watch/node_event
{
  "trigger": {
    "schedule": {
      "interval": "60s"
    }
  },
  "input": {
    "search": {
      "request": {
        "indices": [
          ".marvel-*"
        ],
        "search_type": "query_then_fetch",
        "body": {
          "query": {
            "filtered": {
              "query": {
                "bool": {
                  "should": [
                    {
                      "match": {
                        "event": "node_left"
                      }
                    },
                    {
                      "match": {
                        "event": "node_joined"
                      }
                    }
                  ]
                }
              },
              "filter": {
                "range": {
                  "@timestamp": {
                    "from": "{{ctx.trigger.scheduled_time}}||-60s",
                    "to": "{{ctx.trigger.triggered_time}}"
                  }
                }
              }
            }
          },
          "fields": [
            "event",
            "message",
            "cluster_name"
          ],
          "sort": [
            {
              "@timestamp": {
                "order": "desc"
              }
            }
          ]
        }
      }
    }
  },
  "throttle_period": "60s", <1>
  "condition": {
    "script": {
      "inline": "ctx.payload.hits.total > 0"
    }
  },
  "actions": {
    "send_email": { <2>
      "email": {
        "to": "<username>@<domainname>", <3>
        "subject": "{{ctx.payload.hits.hits.0.fields.event}} the cluster",
        "body": "{{ctx.payload.hits.hits.0.fields.message}} the cluster {{ctx.payload.hits.hits.0.fields.cluster_name}}"
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE

<1> The throttle period prevents notifications from being sent more than once a minute.
You can change the throttle period to receive notifications more or less frequently.
<2> To send email notifications, you must configure at least one email account in
`elasticsearch.yml`. See <<email-services, Configuring Email Services>> for more
information.
<3> Specify the email address you want to notify.

NOTE: This example uses an inline script, which requires you to enable dynamic scripting in
Elasticsearch. While this is convenient when you're experimenting with Watcher, in a
production environment we recommend disabling dynamic scripting and using file scripts.
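
Because this condition is just a numeric comparison, you may not need a script at all:
Watcher's `compare` condition can express the same check. A sketch (verify that your Watcher
version supports the `compare` condition):

[source,json]
--------------------------------------------------
"condition": {
  "compare": {
    "ctx.payload.hits.total": {
      "gt": 0
    }
  }
}
--------------------------------------------------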