From 3767bdc98d974282e4b759fe9a072b0d188fe4f2 Mon Sep 17 00:00:00 2001 From: David Kyle Date: Wed, 6 Jun 2018 13:21:16 +0100 Subject: [PATCH] [ML][DOCS] Add example of top N derivative aggregation (#31109) Add example of top N derivative aggregation to the ML datafeed docs --- x-pack/docs/build.gradle | 53 ++++++++++- x-pack/docs/en/ml/aggregations.asciidoc | 112 ++++++++++++++++++------ 2 files changed, 136 insertions(+), 29 deletions(-) diff --git a/x-pack/docs/build.gradle b/x-pack/docs/build.gradle index 17e0f2b70fd..3d799c8d0b5 100644 --- a/x-pack/docs/build.gradle +++ b/x-pack/docs/build.gradle @@ -16,7 +16,6 @@ buildRestTests.expectedUnconvertedCandidates = [ 'en/ml/functions/rare.asciidoc', 'en/ml/functions/sum.asciidoc', 'en/ml/functions/time.asciidoc', - 'en/ml/aggregations.asciidoc', 'en/ml/customurl.asciidoc', 'en/monitoring/indices.asciidoc', 'en/rest-api/security/ssl.asciidoc', @@ -281,6 +280,58 @@ setups['library'] = ''' {"name": "The Moon is a Harsh Mistress", "author": "Robert A. 
Heinlein", "release_date": "1966-04-01", "page_count": 288} ''' +setups['farequote_index'] = ''' + - do: + indices.create: + index: farequote + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + mappings: + metric: + properties: + time: + type: date + responsetime: + type: float + airline: + type: keyword +''' +setups['farequote_data'] = setups['farequote_index'] + ''' + - do: + bulk: + index: farequote + type: metric + refresh: true + body: | + {"index": {"_id":"1"}} + {"airline":"JZA","responsetime":990.4628,"time":"2016-02-07T00:00:00+0000"} + {"index": {"_id":"2"}} + {"airline":"JBU","responsetime":877.5927,"time":"2016-02-07T00:00:00+0000"} + {"index": {"_id":"3"}} + {"airline":"KLM","responsetime":1355.4812,"time":"2016-02-07T00:00:00+0000"} +''' +setups['farequote_job'] = setups['farequote_data'] + ''' + - do: + xpack.ml.put_job: + job_id: "farequote" + body: > + { + "analysis_config": { + "bucket_span": "60m", + "detectors": [{ + "function": "mean", + "field_name": "responsetime", + "by_field_name": "airline" + }], + "summary_count_field_name": "doc_count" + }, + "data_description": { + "time_field": "time" + } + } +''' setups['server_metrics_index'] = ''' - do: indices.create: diff --git a/x-pack/docs/en/ml/aggregations.asciidoc b/x-pack/docs/en/ml/aggregations.asciidoc index cc98a45d11e..f3b8e6b3e34 100644 --- a/x-pack/docs/en/ml/aggregations.asciidoc +++ b/x-pack/docs/en/ml/aggregations.asciidoc @@ -11,11 +11,12 @@ aggregated data into {xpackml} instead of raw results, which reduces the volume of data that must be considered while detecting anomalies. There are some limitations to using aggregations in {dfeeds}, however. -Your aggregation must include a buckets aggregation, which in turn must contain -a date histogram aggregation. This requirement ensures that the aggregated -data is a time series. 
If you use a terms aggregation and the cardinality of a -term is high, then the aggregation might not be effective and you might want -to just use the default search and scroll behavior. +Your aggregation must include a `date_histogram` aggregation, which in turn must +contain a `max` aggregation on the time field. This requirement ensures that the +aggregated data is a time series and the timestamp of each bucket is the time +of the last record in the bucket. If you use a terms aggregation and the +cardinality of a term is high, then the aggregation might not be effective and +you might want to just use the default search and scroll behavior. When you create or update a job, you can include the names of aggregations, for example: @@ -27,9 +28,9 @@ PUT _xpack/ml/anomaly_detectors/farequote "analysis_config": { "bucket_span": "60m", "detectors": [{ - "function":"mean", - "field_name":"responsetime", - "by_field_name":"airline" + "function": "mean", + "field_name": "responsetime", + "by_field_name": "airline" }], "summary_count_field_name": "doc_count" }, @@ -38,6 +39,8 @@ PUT _xpack/ml/anomaly_detectors/farequote } } ---------------------------------- +// CONSOLE +// TEST[setup:farequote_data] In this example, the `airline`, `responsetime`, and `time` fields are aggregations. @@ -85,7 +88,8 @@ PUT _xpack/ml/datafeeds/datafeed-farequote } } ---------------------------------- - +// CONSOLE +// TEST[setup:farequote_job] In this example, the aggregations have names that match the fields that they operate on. That is to say, the `max` aggregation is named `time` and its @@ -100,35 +104,86 @@ For all other aggregations, if the aggregation name doesn't match the field name there are limitations in the drill-down functionality within the {ml} page in {kib}. +{dfeeds} support complex nested aggregations. This example uses the `derivative` +pipeline aggregation to find the first-order derivative of the counter +`system.network.out.bytes` for each value of the field `beat.name`. 
+ +[source,js] +---------------------------------- +"aggregations": { + "beat.name": { + "terms": { + "field": "beat.name" + }, + "aggregations": { + "buckets": { + "date_histogram": { + "field": "@timestamp", + "interval": "5m" + }, + "aggregations": { + "@timestamp": { + "max": { + "field": "@timestamp" + } + }, + "bytes_out_average": { + "avg": { + "field": "system.network.out.bytes" + } + }, + "bytes_out_derivative": { + "derivative": { + "buckets_path": "bytes_out_average" + } + } + } + } + } + } +} +---------------------------------- +// NOTCONSOLE + When you define an aggregation in a {dfeed}, it must have the following form: [source,js] ---------------------------------- -"aggregations" : { - "buckets" : { - "date_histogram" : { - "time_zone": "UTC", ... +"aggregations": { + ["bucketing_aggregation": { + "bucket_agg": { + ... }, - "aggregations": { - "<time_field>": { - "max": { - "field":"<time_field>" + "aggregations": {] + "date_histogram_aggregation": { + "date_histogram": { + "field": "time" + }, + "aggregations": { + "timestamp": { + "max": { + "field": "time" + } + } + [,"<first_term>": { + "terms":{... + } + [,"aggregations" : { + [<sub_aggregation>]+ + } ] + }] } } - [,"<first_term>": { - "terms":{... - } - [,"aggregations" : { - [<sub_aggregation>]+ - } ] - }] - } - } + } + } } ---------------------------------- +// NOTCONSOLE -You must specify `buckets` as the aggregation name and `date_histogram` as the -aggregation type. For more information, see +The top level aggregation must be either a {ref}/search-aggregations-bucket.html[Bucket Aggregation] +containing a single sub-aggregation that is a `date_histogram`, or the top level aggregation +must itself be the required `date_histogram`. There must be exactly one `date_histogram` aggregation. +For more information, see {ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date Histogram Aggregation]. 
NOTE: The `time_zone` parameter in the date histogram aggregation must be set to `UTC`, @@ -163,6 +218,7 @@ GET .../_search { } } -------------------------------------------------- +// NOTCONSOLE By default, {es} limits the maximum number of terms returned to 10000. For high cardinality fields, the query might not run. It might return errors related to