[ML][DOCS] Add example of top N derivative aggregation (#31109)

Add example of top N derivative aggregation to the ML datafeed docs
David Kyle 2018-06-06 13:21:16 +01:00 committed by GitHub
parent 0c8c619181
commit 3767bdc98d
2 changed files with 136 additions and 29 deletions


@@ -16,7 +16,6 @@ buildRestTests.expectedUnconvertedCandidates = [
  'en/ml/functions/rare.asciidoc',
  'en/ml/functions/sum.asciidoc',
  'en/ml/functions/time.asciidoc',
- 'en/ml/aggregations.asciidoc',
  'en/ml/customurl.asciidoc',
  'en/monitoring/indices.asciidoc',
  'en/rest-api/security/ssl.asciidoc',
@@ -281,6 +280,58 @@ setups['library'] = '''
{"name": "The Moon is a Harsh Mistress", "author": "Robert A. Heinlein", "release_date": "1966-04-01", "page_count": 288}
'''
setups['farequote_index'] = '''
  - do:
      indices.create:
        index: farequote
        body:
          settings:
            number_of_shards: 1
            number_of_replicas: 0
          mappings:
            metric:
              properties:
                time:
                  type: date
                responsetime:
                  type: float
                airline:
                  type: keyword
'''
setups['farequote_data'] = setups['farequote_index'] + '''
  - do:
      bulk:
        index: farequote
        type: metric
        refresh: true
        body: |
          {"index": {"_id":"1"}}
          {"airline":"JZA","responsetime":990.4628,"time":"2016-02-07T00:00:00+0000"}
          {"index": {"_id":"2"}}
          {"airline":"JBU","responsetime":877.5927,"time":"2016-02-07T00:00:00+0000"}
          {"index": {"_id":"3"}}
          {"airline":"KLM","responsetime":1355.4812,"time":"2016-02-07T00:00:00+0000"}
'''
setups['farequote_job'] = setups['farequote_data'] + '''
  - do:
      xpack.ml.put_job:
        job_id: "farequote"
        body: >
          {
            "analysis_config": {
              "bucket_span": "60m",
              "detectors": [{
                "function": "mean",
                "field_name": "responsetime",
                "by_field_name": "airline"
              }],
              "summary_count_field_name": "doc_count"
            },
            "data_description": {
              "time_field": "time"
            }
          }
'''
setups['server_metrics_index'] = '''
  - do:
      indices.create:


@@ -11,11 +11,12 @@ aggregated data into {xpackml} instead of raw results, which
reduces the volume of data that must be considered while detecting anomalies.

There are some limitations to using aggregations in {dfeeds}, however.
-Your aggregation must include a buckets aggregation, which in turn must contain
-a date histogram aggregation. This requirement ensures that the aggregated
-data is a time series. If you use a terms aggregation and the cardinality of a
-term is high, then the aggregation might not be effective and you might want
-to just use the default search and scroll behavior.
+Your aggregation must include a `date_histogram` aggregation, which in turn must
+contain a `max` aggregation on the time field. This requirement ensures that the
+aggregated data is a time series and the timestamp of each bucket is the time
+of the last record in the bucket. If you use a terms aggregation and the
+cardinality of a term is high, then the aggregation might not be effective and
+you might want to just use the default search and scroll behavior.
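As an illustration, a minimal {dfeed} aggregation that meets this requirement might look like the following sketch; the `time` field name, the `300s` interval, and the `buckets` aggregation name here are only example values:

[source,js]
----------------------------------
"aggregations": {
  "buckets": {
    "date_histogram": {
      "field": "time",
      "interval": "300s",
      "time_zone": "UTC"
    },
    "aggregations": {
      "time": {
        "max": {
          "field": "time"
        }
      }
    }
  }
}
----------------------------------
// NOTCONSOLE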
When you create or update a job, you can include the names of aggregations, for
example:
@@ -27,9 +28,9 @@ PUT _xpack/ml/anomaly_detectors/farequote
  "analysis_config": {
    "bucket_span": "60m",
    "detectors": [{
-      "function":"mean",
-      "field_name":"responsetime",
-      "by_field_name":"airline"
+      "function": "mean",
+      "field_name": "responsetime",
+      "by_field_name": "airline"
    }],
    "summary_count_field_name": "doc_count"
  },
@@ -38,6 +39,8 @@ PUT _xpack/ml/anomaly_detectors/farequote
  }
}
----------------------------------
// CONSOLE
// TEST[setup:farequote_data]
In this example, the `airline`, `responsetime`, and `time` fields are
aggregations.
@@ -85,7 +88,8 @@ PUT _xpack/ml/datafeeds/datafeed-farequote
  }
}
----------------------------------
// CONSOLE
// TEST[setup:farequote_job]
In this example, the aggregations have names that match the fields that they
operate on. That is to say, the `max` aggregation is named `time` and its
@@ -100,35 +104,86 @@ For all other aggregations, if the aggregation name doesn't match the field name,
there are limitations in the drill-down functionality within the {ml} page in
{kib}.
{dfeeds} support complex nested aggregations. This example uses the `derivative`
pipeline aggregation to find the first-order derivative of the counter
`system.network.out.bytes` for each value of the field `beat.name`.
[source,js]
----------------------------------
"aggregations": {
"beat.name": {
"terms": {
"field": "beat.name"
},
"aggregations": {
"buckets": {
"date_histogram": {
"field": "@timestamp",
"interval": "5m"
},
"aggregations": {
"@timestamp": {
"max": {
"field": "@timestamp"
}
},
"bytes_out_average": {
"avg": {
"field": "system.network.out.bytes"
}
},
"bytes_out_derivative": {
"derivative": {
"buckets_path": "bytes_out_average"
}
}
}
}
}
}
}
----------------------------------
// NOTCONSOLE
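A job consuming this {dfeed} could then analyze the derivative with, for
example, a `mean` detector split on `beat.name`. The configuration below is only
an illustrative sketch, not part of this commit; the bucket span and field names
are assumptions:

[source,js]
----------------------------------
"analysis_config": {
  "bucket_span": "5m",
  "detectors": [{
    "function": "mean",
    "field_name": "bytes_out_derivative",
    "by_field_name": "beat.name"
  }],
  "summary_count_field_name": "doc_count"
},
"data_description": {
  "time_field": "@timestamp"
}
----------------------------------
// NOTCONSOLE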
When you define an aggregation in a {dfeed}, it must have the following form:

[source,js]
----------------------------------
-"aggregations" : {
-  "buckets" : {
-    "date_histogram" : {
-      "time_zone": "UTC", ...
-    },
-    "aggregations": {
-      "<time_field>": {
-        "max": {
-          "field":"<time_field>"
-        }
-      }
-      [,"<first_term>": {
-        "terms":{...
-        }
-        [,"aggregations" : {
-          [<sub_aggregation>]+
-        } ]
-      }]
-    }
-  }
-}
+"aggregations": {
+  ["bucketing_aggregation": {
+    "bucket_agg": {
+      ...
+    },
+    "aggregations": {]
+      "data_histogram_aggregation": {
+        "date_histogram": {
+          "field": "time",
+        },
+        "aggregations": {
+          "timestamp": {
+            "max": {
+              "field": "time"
+            }
+          },
+          [,"<first_term>": {
+            "terms":{...
+            }
+            [,"aggregations" : {
+              [<sub_aggregation>]+
+            } ]
+          }]
+        }
+      }
+    }
+  }
+}
----------------------------------
// NOTCONSOLE
-You must specify `buckets` as the aggregation name and `date_histogram` as the
-aggregation type. For more information, see
+The top level aggregation must be either a
+{ref}/search-aggregations-bucket.html[Bucket Aggregation] containing a single
+sub-aggregation that is a `date_histogram`, or the top level aggregation must be
+the required `date_histogram`. There must be exactly one `date_histogram`
+aggregation. For more information, see
{ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date Histogram Aggregation].
NOTE: The `time_zone` parameter in the date histogram aggregation must be set to `UTC`,
@@ -163,6 +218,7 @@ GET .../_search {
  }
}
--------------------------------------------------
// NOTCONSOLE
By default, {es} limits the maximum number of terms returned to 10000. For high By default, {es} limits the maximum number of terms returned to 10000. For high
cardinality fields, the query might not run. It might return errors related to cardinality fields, the query might not run. It might return errors related to