[ML][DOCS] Add example of top N derivative aggregation (#31109)

Add example of top N derivative aggregation to the ML datafeed docs
2018-06-06 13:21:16 +01:00 · 2018-06-06 13:21:16 +01:00 · 3767bdc98d
parent 0c8c619181
commit 3767bdc98d
2 changed files with 136 additions and 29 deletions
--- a/x-pack/docs/build.gradle
+++ b/x-pack/docs/build.gradle
@ -16,7 +16,6 @@ buildRestTests.expectedUnconvertedCandidates = [
        'en/ml/functions/rare.asciidoc',
        'en/ml/functions/sum.asciidoc',
        'en/ml/functions/time.asciidoc',
-        'en/ml/aggregations.asciidoc',
        'en/ml/customurl.asciidoc',
        'en/monitoring/indices.asciidoc',
        'en/rest-api/security/ssl.asciidoc',
@ -281,6 +280,58 @@ setups['library'] = '''
            {"name": "The Moon is a Harsh Mistress", "author": "Robert A. Heinlein", "release_date": "1966-04-01", "page_count": 288}

 '''
+setups['farequote_index'] = '''
+  - do:
+        indices.create:
+          index: farequote
+          body:
+            settings:
+              number_of_shards: 1
+              number_of_replicas: 0
+            mappings:
+              metric:
+                properties:
+                  time:
+                    type: date
+                  responsetime:
+                    type: float
+                  airline:
+                     type: keyword   
+'''
+setups['farequote_data'] = setups['farequote_index'] + '''
+  - do:
+        bulk:
+          index: farequote
+          type: metric
+          refresh: true
+          body: |
+            {"index": {"_id":"1"}}
+            {"airline":"JZA","responsetime":990.4628,"time":"2016-02-07T00:00:00+0000"}
+            {"index": {"_id":"2"}}
+            {"airline":"JBU","responsetime":877.5927,"time":"2016-02-07T00:00:00+0000"}
+            {"index": {"_id":"3"}}            
+            {"airline":"KLM","responsetime":1355.4812,"time":"2016-02-07T00:00:00+0000"}
+'''
+setups['farequote_job'] = setups['farequote_data'] + '''
+  - do:
+      xpack.ml.put_job:
+        job_id: "farequote"
+        body:  >
+          {
+            "analysis_config": {
+              "bucket_span": "60m",
+              "detectors": [{
+                "function": "mean",
+                "field_name": "responsetime",
+                "by_field_name": "airline"
+              }],
+            "summary_count_field_name": "doc_count"
+            },
+            "data_description": {
+              "time_field": "time"
+            }
+          }
+'''
 setups['server_metrics_index'] = '''
  - do:
        indices.create:
--- a/x-pack/docs/en/ml/aggregations.asciidoc
+++ b/x-pack/docs/en/ml/aggregations.asciidoc
@ -11,11 +11,12 @@ aggregated data into {xpackml} instead of raw results, which
 reduces the volume of data that must be considered while detecting anomalies.

 There are some limitations to using aggregations in {dfeeds}, however.
-Your aggregation must include a buckets aggregation, which in turn must contain
-a date histogram aggregation. This requirement ensures that the aggregated
-data is a time series. If you use a terms aggregation and the cardinality of a
-term is high, then the aggregation might not be effective and you might want
-to just use the default search and scroll behavior.
+Your aggregation must include a `date_histogram` aggregation, which in turn must
+contain a `max` aggregation on the time field. This requirement ensures that the
+aggregated data is a time series and the timestamp of each bucket is the time
+of the last record in the bucket. If you use a terms aggregation and the
+cardinality of a term is high, then the aggregation might not be effective and
+you might want to just use the default search and scroll behavior.

 When you create or update a job, you can include the names of aggregations, for
 example:
@ -27,9 +28,9 @@ PUT _xpack/ml/anomaly_detectors/farequote
  "analysis_config": {
    "bucket_span": "60m",
    "detectors": [{
-      "function":"mean",
-      "field_name":"responsetime",
-      "by_field_name":"airline"
+      "function": "mean",
+      "field_name": "responsetime",
+      "by_field_name": "airline"
    }],
    "summary_count_field_name": "doc_count"
  },
@ -38,6 +39,8 @@ PUT _xpack/ml/anomaly_detectors/farequote
  }
 }
 ----------------------------------
+// CONSOLE
+// TEST[setup:farequote_data]

 In this example, the `airline`, `responsetime`, and `time` fields are
 aggregations.
@ -85,7 +88,8 @@ PUT _xpack/ml/datafeeds/datafeed-farequote
  }
 }
 ----------------------------------
-
+// CONSOLE
+// TEST[setup:farequote_job]

 In this example, the aggregations have names that match the fields that they
 operate on. That is to say, the `max` aggregation is named `time` and its
@ -100,35 +104,86 @@ For all other aggregations, if the aggregation name doesn't match the field name
 there are limitations in the drill-down functionality within the {ml} page in
 {kib}.

+{dfeeds} support complex nested aggregations. This example uses the `derivative`
+pipeline aggregation to find the first-order derivative of the counter
+`system.network.out.bytes` for each value of the field `beat.name`.
+
+[source,js]
+----------------------------------
+"aggregations": {
+  "beat.name": {
+    "terms": {
+      "field": "beat.name"
+    },
+    "aggregations": {
+      "buckets": {
+        "date_histogram": {
+          "field": "@timestamp",
+          "interval": "5m"
+        },
+        "aggregations": {
+          "@timestamp": {
+            "max": {
+              "field": "@timestamp"
+            }
+          },
+          "bytes_out_average": {
+            "avg": {
+              "field": "system.network.out.bytes"
+            }
+          },
+          "bytes_out_derivative": {
+            "derivative": {
+              "buckets_path": "bytes_out_average"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+----------------------------------
+// NOTCONSOLE
+
 When you define an aggregation in a {dfeed}, it must have the following form:

 [source,js]
 ----------------------------------
-"aggregations" : {
-  "buckets" : {
-    "date_histogram" : {
-      "time_zone": "UTC", ...
+"aggregations": {
+  ["bucketing_aggregation": {
+    "bucket_agg": {
+      ...
    },
-    "aggregations": {
-      "<time_field>": {
-        "max": {
-          "field":"<time_field>"
+    "aggregations": {]
+      "date_histogram_aggregation": {
+        "date_histogram": {
+          "field": "time",
+        },
+        "aggregations": {
+          "timestamp": {
+            "max": {
+              "field": "time"
+            }
+          },
+          [,"<first_term>": {
+            "terms":{...
+            }
+            [,"aggregations" : {
+              [<sub_aggregation>]+
+            } ]
+          }]
        }
      }
-      [,"<first_term>": {
-        "terms":{...
-        }
-        [,"aggregations" : {
-          [<sub_aggregation>]+
-        } ]
-      }]
-   }
- }
+    }
+  }
 }
 ----------------------------------
+// NOTCONSOLE

-You must specify `buckets` as the aggregation name and `date_histogram` as the
-aggregation type. For more information, see
+The top level aggregation must be either a {ref}/search-aggregations-bucket.html[Bucket Aggregation]
+containing a single sub-aggregation that is a `date_histogram`, or the required
+`date_histogram` itself. There must be exactly one `date_histogram` aggregation.
+For more information, see
 {ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date Histogram Aggregation].

 NOTE: The `time_zone` parameter in the date histogram aggregation must be set to `UTC`,
@ -163,6 +218,7 @@ GET .../_search {
  }
 }
 --------------------------------------------------
+// NOTCONSOLE

 By default, {es} limits the maximum number of terms returned to 10000. For high
 cardinality fields, the query might not run. It might return errors related to