From 386ac7345cf8bc28e57024c23dc57ee05f1c13ca Mon Sep 17 00:00:00 2001
From: Lisa Cawley
Date: Tue, 23 May 2017 14:34:21 -0700
Subject: [PATCH] [DOCS] Add details about using aggregations with machine learning (elastic/x-pack-elasticsearch#1446)

* [DOCS] Add ML aggregations configuration scenario

* [DOCS] Refine ML configuration page

* [DOCS] Add ML aggregation details

* [DOCS] Add links to aggregations in Configuring ML

* [DOCS] Address feedback about ML aggregations

Original commit: elastic/x-pack-elasticsearch@847414409371d7c73fb332aab8d58f4284a9b61e
---
 docs/build.gradle                             |   1 +
 docs/en/ml/aggregations.asciidoc              | 182 ++++++++++++++++++
 docs/en/ml/configuring.asciidoc               |  12 +-
 docs/en/ml/getting-started.asciidoc           |   6 +-
 docs/en/rest-api/ml/datafeedresource.asciidoc |  30 +--
 5 files changed, 193 insertions(+), 38 deletions(-)
 create mode 100644 docs/en/ml/aggregations.asciidoc

diff --git a/docs/build.gradle b/docs/build.gradle
index ef79274b884..3ca927512a5 100644
--- a/docs/build.gradle
+++ b/docs/build.gradle
@@ -17,6 +17,7 @@ buildRestTests.expectedUnconvertedCandidates = [
   'en/ml/functions/rare.asciidoc',
   'en/ml/functions/sum.asciidoc',
   'en/ml/functions/time.asciidoc',
+  'en/ml/aggregations.asciidoc',
   'en/rest-api/security/users.asciidoc',
   'en/rest-api/security/tokens.asciidoc',
   'en/rest-api/watcher/put-watch.asciidoc',
diff --git a/docs/en/ml/aggregations.asciidoc b/docs/en/ml/aggregations.asciidoc
new file mode 100644
index 00000000000..519dec5e593
--- /dev/null
+++ b/docs/en/ml/aggregations.asciidoc
@@ -0,0 +1,182 @@
[[ml-configuring-aggregation]]
=== Aggregating Data For Faster Performance

By default, {dfeeds} fetch data from {es} using search and scroll requests.
It can be significantly more efficient, however, to aggregate data in {es}
and to configure your jobs to analyze aggregated data.

One of the benefits of aggregating data this way is that {es} automatically
distributes these calculations across your cluster. You can then feed this
aggregated data into {xpackml} instead of raw results, which
reduces the volume of data that must be considered while detecting anomalies.
//TBD: Are "aggregated" and "summarized" equivalent terms? Are customers more
//familiar with one or the other? If so, I'll use one term throughout.

There are some limitations to using aggregations in {dfeeds}, however.
Your aggregation must include a buckets aggregation, which in turn must contain
a date histogram aggregation. This requirement ensures that the aggregated
data is a time series. If you use a terms aggregation and the cardinality of a
term is high, the aggregation might not be effective and you might want to use
the default search and scroll behavior instead.

When you create or update a job, you can include the names of aggregations, for
example:

[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/farequote
{
  "analysis_config": {
    "bucket_span": "60m",
    "detectors": [{
      "function":"mean",
      "field_name":"responsetime",
      "by_field_name":"airline"
    }],
    "summary_count_field_name": "doc_count"
  },
  "data_description": {
    "time_field":"time"
  }
}
----------------------------------

In this example, the `airline`, `responsetime`, and `time` fields are the names
of aggregations, which are defined in the {dfeed} as shown below.

NOTE: When the `summary_count_field_name` property is set to a non-null value,
the job expects to receive aggregated input. The property must be set to the
name of the field that contains the count of raw data points that have been
aggregated. It applies to all detectors in the job.

The aggregations are defined in the {dfeed} as follows:

[source,js]
----------------------------------
PUT _xpack/ml/datafeeds/datafeed-farequote
{
  "job_id":"farequote",
  "indexes": ["farequote"],
  "types": ["response"],
  "aggregations": {
    "buckets": {
      "date_histogram": {
        "field": "time",
        "interval": "360s",
        "time_zone": "UTC"
      },
      "aggregations": {
        "time": {
          "max": {"field": "time"}
        },
        "airline": {
          "terms": {
            "field": "airline",
            "size": 100
          },
          "aggregations": {
            "responsetime": {
              "avg": {
                "field": "responsetime"
              }
            }
          }
        }
      }
    }
  }
}
----------------------------------

In this example, the aggregations have names that match the fields that they
operate on. That is to say, the `max` aggregation is named `time` and its
field is also `time`. The same is true for the aggregations with the names
`airline` and `responsetime`. Since you must create the job before you can
create the {dfeed}, synchronizing your aggregation and field names can simplify
these configuration steps.
//TBD: Describe how this would be accomplished in Kibana?
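
Before you start the {dfeed}, it can be useful to verify that the aggregations
produce the fields that the job expects. One way to do this is with the {dfeed}
preview API, which returns a sample of the data that the {dfeed} would send to
the job. The following request is a minimal sketch, assuming the
`datafeed-farequote` {dfeed} above has been created:

[source,js]
----------------------------------
GET _xpack/ml/datafeeds/datafeed-farequote/_preview
----------------------------------

Each document in the preview should contain `time`, `airline`, `responsetime`,
and `doc_count` values; if one of these fields is missing, revisit the
aggregation names in your {dfeed} configuration.
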
When you define an aggregation in a {dfeed}, it must have the following form:

[source,js]
----------------------------------
"aggregations" : {
  "buckets" : {
    "date_histogram" : {
      "time_zone": "UTC", ...
    },
    "aggregations": {
      "<time_field>": {
        "max": {
          "field":"<time_field>"
        }
      }
      [,"<term_field>": {
        "terms":{...
        }
        [,"aggregations" : {
          [<sub_aggregation>]+
        } ]
      }]
    }
  }
}
----------------------------------

You must specify `buckets` as the aggregation name and `date_histogram` as the
aggregation type. For more information, see
{ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date Histogram Aggregation].

NOTE: The `time_zone` parameter in the date histogram aggregation must be set
to `UTC`, which is the default value.

Each histogram bucket has a key, which is the bucket start time. This key
cannot be used as the time of the analyzed records, however, because the
{dfeed} needs to know the time of the latest record within each bucket.
Otherwise, when you restart the {dfeed}, it continues from the start time of
the histogram bucket and might fetch the same data twice. The `max` aggregation
on the time field is therefore required to provide the time of the latest
record within each bucket.

You can optionally specify a terms aggregation, which creates buckets for
different values of a field.

IMPORTANT: If you use a terms aggregation, by default it returns buckets for
the top ten terms. Thus, if the cardinality of the term is greater than ten,
not all terms are analyzed.

You can change this behavior by setting the `size` parameter. To
determine the cardinality of your data, you can run searches such as:

[source,js]
--------------------------------------------------
GET .../_search
{
  "aggs": {
    "service_cardinality": {
      "cardinality": {
        "field": "service"
      }
    }
  }
}
--------------------------------------------------

By default, {es} limits the maximum number of terms returned to 10000. For
high-cardinality fields, the query might not run; it might return errors
related to circuit breaking exceptions, which indicate that the data is too
large. In such cases, do not use aggregations in your {dfeed}. For more
information, see
{ref}/search-aggregations-bucket-terms-aggregation.html[Terms Aggregation].

You can also optionally specify multiple sub-aggregations, which are computed
for the buckets that their parent aggregation creates. For more information,
see {ref}/search-aggregations.html[Aggregations].
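
For example, the following sketch is a hypothetical variant of the `airline`
terms aggregation shown earlier, not part of the farequote configuration. It
adds a second metric as a sub-aggregation, so each airline bucket produces
both an average and a maximum response time. A job that analyzes the
`peak_responsetime` field would need a detector that refers to it by that
name:

[source,js]
----------------------------------
"airline": {
  "terms": {
    "field": "airline",
    "size": 100
  },
  "aggregations": {
    "responsetime": {
      "avg": {"field": "responsetime"}
    },
    "peak_responsetime": {
      "max": {"field": "responsetime"}
    }
  }
}
----------------------------------
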
TIP: If your detectors use metric or sum analytical functions, set the
`interval` of the date histogram aggregation to a tenth of the `bucket_span`
that was defined in the job. This creates finer, more granular time buckets,
which are ideal for this type of analysis. If your detectors use count or rare
functions, set `interval` to the same value as `bucket_span`, as in the sketch
below. For more information about analytical functions, see <<ml-functions>>.
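
For example, consider a hypothetical job `event-counts` that uses a count
detector, sets `summary_count_field_name` to `doc_count`, and has a
`bucket_span` of one hour. Because the detector counts events rather than
analyzing a metric, the matching {dfeed} sketch below sets `interval` to the
same value as the bucket span. The index and type names are illustrative:

[source,js]
----------------------------------
PUT _xpack/ml/datafeeds/datafeed-event-counts
{
  "job_id": "event-counts",
  "indexes": ["events"],
  "types": ["event"],
  "aggregations": {
    "buckets": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "3600s",
        "time_zone": "UTC"
      },
      "aggregations": {
        "@timestamp": {
          "max": {"field": "@timestamp"}
        }
      }
    }
  }
}
----------------------------------

In this case, the `doc_count` that the date histogram produces for each bucket
supplies the count to analyze, so no metric sub-aggregation is needed beyond
the required `max` on the time field.
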
//TBD: Add more examples from https://github.com/elastic/prelert-legacy/wiki/Configuring-aggregations-on-a-datafeed
diff --git a/docs/en/ml/configuring.asciidoc b/docs/en/ml/configuring.asciidoc
index 73cc57c6e26..cbbca119ee3 100644
--- a/docs/en/ml/configuring.asciidoc
+++ b/docs/en/ml/configuring.asciidoc
@@ -23,11 +23,11 @@ you visualize and explore the results.
 For a tutorial that walks you through these configuration steps,
 see <<ml-getting-started>>.
 
-//Though it is quite simple to analyze your data and provide quick {ml} results,
-//gaining deep insights might require some additional planning and configuration.
-//The scenarios in this section describe some best practices for generating useful
-//{ml} results and insights from your data.
+Though it is quite simple to analyze your data and provide quick {ml} results,
+gaining deep insights might require some additional planning and configuration.
+The scenarios in this section describe some best practices for generating useful
+{ml} results and insights from your data.
 
-//* <<ml-configuring-aggregation>>
+* <<ml-configuring-aggregation>>
 
-//include::aggregations.asciidoc[]
+include::aggregations.asciidoc[]
diff --git a/docs/en/ml/getting-started.asciidoc b/docs/en/ml/getting-started.asciidoc
index abb3575c134..558f57490f8 100644
--- a/docs/en/ml/getting-started.asciidoc
+++ b/docs/en/ml/getting-started.asciidoc
@@ -170,10 +170,8 @@ summarizing data this way is that {es} automatically distributes these
 calculations across your cluster. You can then feed this summarized data into
 {xpackml} instead of raw results, which reduces the volume of
 data that must be considered while detecting anomalies. For the purposes of
-this tutorial, however, these summary values are stored in {es},
-rather than created using the {ref}/search-aggregations.html[_aggregations framework_].
-
-//TBD link to working with aggregations page
+this tutorial, however, these summary values are stored in {es}. For more
+information, see <<ml-configuring-aggregation>>.
 
 Before you load the data set, you need to set up {ref}/mapping.html[_mappings_]
 for the fields. Mappings divide the documents in the index into logical groups
diff --git a/docs/en/rest-api/ml/datafeedresource.asciidoc b/docs/en/rest-api/ml/datafeedresource.asciidoc
index c3e9c01cd59..7fb68550e15 100644
--- a/docs/en/rest-api/ml/datafeedresource.asciidoc
+++ b/docs/en/rest-api/ml/datafeedresource.asciidoc
@@ -6,36 +6,10 @@ A {dfeed} resource has the following properties:
 
 `aggregations`::
   (object) If set, the {dfeed} performs aggregation searches.
-  For syntax information, see {ref}/search-aggregations.html[Aggregations].
   Support for aggregations is limited and should only be used with
-  low cardinality data. For example:
-+
---
-[source,js]
-----------------------------------
-{
-  "@timestamp": {
-    "histogram": {
-      "field": "@timestamp",
-      "interval": 30000,
-      "offset": 0,
-      "order": {"_key": "asc"},
-      "keyed": false,
-      "min_doc_count": 0
-    },
-    "aggregations": {
-      "events_per_min": {
-        "sum": {
-          "field": "events_per_min"
-        }
-      }
-    }
-  }
-}
-----------------------------------
---
+  low cardinality data. For more information,
+  see <<ml-configuring-aggregation>>.
 
-//TBD link to a Working with aggregations page
 
 `chunking_config`::
   (object) Specifies how data searches are split into time chunks.
   See <<ml-datafeed-chunking-config>>.