From 6275799a864a9b5bda6f18ce92ec608e7df3fcf0 Mon Sep 17 00:00:00 2001 From: ashwinkumar12345 Date: Thu, 6 May 2021 12:45:35 -0700 Subject: [PATCH] added aggregations --- docs/opensearch/aggregations.md | 161 ++++ docs/opensearch/bucket-agg.md | 1350 +++++++++++++++++++++++++++++++ docs/opensearch/metric-agg.md | 637 +++++++++++++++ docs/opensearch/pipeline-agg.md | 1259 ++++++++++++++++++++++++++++ docs/sql/endpoints.md | 12 +- 5 files changed, 3413 insertions(+), 6 deletions(-) create mode 100644 docs/opensearch/aggregations.md create mode 100644 docs/opensearch/bucket-agg.md create mode 100644 docs/opensearch/metric-agg.md create mode 100644 docs/opensearch/pipeline-agg.md diff --git a/docs/opensearch/aggregations.md b/docs/opensearch/aggregations.md new file mode 100644 index 00000000..160ec4d6 --- /dev/null +++ b/docs/opensearch/aggregations.md @@ -0,0 +1,161 @@ +--- +layout: default +title: Aggregations +parent: OpenSearch +nav_order: 13 +has_children: true +--- + +# Aggregations + +OpenSearch isn’t just for search. Aggregations let you tap into OpenSearch's powerful analytics engine to analyze your data and extract statistics from it. + +The use cases of aggregations vary from analyzing data in real time to take some action to using OpenSearch Dashboards to create a visualization dashboard. + +OpenSearch can perform aggregations on massive datasets in milliseconds. Compared to queries, aggregations consume more CPU cycles and memory. + +## Aggregations on text fields + +By default, OpenSearch doesn't support aggregations on a text field. +Because text fields are tokenized, an aggregation on a text field has to reverse the tokenization process back to its original string and then formulate an aggregation based on that. Such an operation consumes significant memory and degrades cluster performance. + +While you can enable aggregations on text fields by setting the `fielddata` parameter to `true` in the mapping, the aggregations are still based on the tokenized words and not on the raw text. + +We recommend keeping a raw version of the text field as a `keyword` field that you can aggregate on. +In this case, you can perform aggregations on the `title.raw` field, instead of the `title` field: + +```json +PUT movies +{ + "mappings": { + "properties": { + "title": { + "type": "text", + "fielddata": true, + "fields": { + "raw": { + "type": "keyword" + } + } + } + } + } +} +``` + +## General aggregation structure + +The structure of an aggregation query is as follows: + +```json +GET _search +{ + "size": 0, + "aggs": { + "NAME": { + "AGG_TYPE": {} + } + } +} +``` + +If you’re only interested in the aggregation result and not in the results of the query, set `size` to 0. + +In the `aggs` property (you can use `aggregations` if you want), you can define any number of aggregations. +Each aggregation is defined by its name and one of the types of aggregations that OpenSearch supports. + +The name of the aggregation helps you to distinguish between different aggregations in the response. +The `AGG_TYPE` property is where you specify the type of aggregation. + +## Sample aggregation + +This section uses the OpenSearch Dashboards sample e-commerce data and web log data. To add the sample data, log in to OpenSearch Dashboards, choose **Home** and **Try our sample data**. For **Sample eCommerce orders** and **Sample web logs**, choose **Add data**. 

### avg

To find the average value of the `taxful_total_price` field:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "avg_taxful_total_price": {
      "avg": {
        "field": "taxful_total_price"
      }
    }
  }
}
```

#### Sample response

```json
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 4675,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "avg_taxful_total_price" : {
      "value" : 75.05542864304813
    }
  }
}
```

The aggregation block in the response shows the average value for the `taxful_total_price` field.

## Types of aggregations

There are three main types of aggregations:

- Metric aggregations - Calculate metrics such as `sum`, `min`, `max`, and `avg` on numeric fields.
- Bucket aggregations - Sort query results into groups based on some criteria.
- Pipeline aggregations - Pipe the output of one aggregation as an input to another.

## Nested aggregations

Aggregations within aggregations are called nested or sub-aggregations.

Metric aggregations produce simple results and can't contain nested aggregations.

Bucket aggregations produce buckets of documents that you can nest in other aggregations. You can perform complex analysis on your data by nesting metric and bucket aggregations within bucket aggregations.

### General nested aggregation syntax

```json
{
  "aggs": {
    "name": {
      "type": {
        "data"
      },
      "aggs": {
        "nested": {
          "type": {
            "data"
          }
        }
      }
    }
  }
}
```

The inner `aggs` keyword begins a new nested aggregation. The syntax of the parent aggregation and the nested aggregation is the same. Nested aggregations run in the context of the preceding parent aggregations.

You can also pair your aggregations with search queries to narrow down the things you're trying to analyze before aggregating. If you don't add a query, OpenSearch implicitly uses the `match_all` query.
diff --git a/docs/opensearch/bucket-agg.md b/docs/opensearch/bucket-agg.md
new file mode 100644
index 00000000..3648ea4b
--- /dev/null
+++ b/docs/opensearch/bucket-agg.md
@@ -0,0 +1,1350 @@
---
layout: default
title: Bucket Aggregations
parent: Aggregations
grand_parent: OpenSearch
nav_order: 2
has_children: false
---

# Bucket Aggregations

Bucket aggregations categorize sets of documents as buckets. The type of bucket aggregation determines whether a given document falls into a bucket or not.

You can use bucket aggregations to implement faceted navigation (usually placed as a sidebar on a search result landing page) to help your users narrow down the results.

## terms

The `terms` aggregation dynamically creates a bucket for each unique term of a field.

The following example uses the `terms` aggregation to find the number of documents per response code in web log data:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "response_codes": {
      "terms": {
        "field": "response.keyword",
        "size": 10
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "response_codes" : {
    "doc_count_error_upper_bound" : 0,
    "sum_other_doc_count" : 0,
    "buckets" : [
      {
        "key" : "200",
        "doc_count" : 12832
      },
      {
        "key" : "404",
        "doc_count" : 801
      },
      {
        "key" : "503",
        "doc_count" : 441
      }
    ]
  }
 }
}
```

The values are returned with the key `key`.
`doc_count` specifies the number of documents in each bucket. By default, the buckets are sorted in descending order of `doc_count`.

The response also includes two keys named `doc_count_error_upper_bound` and `sum_other_doc_count`.

The `terms` aggregation returns the top unique terms. So, if the data has many unique terms, then some of them might not appear in the results. The `sum_other_doc_count` field is the sum of the documents that are left out of the response. In this case, the number is 0 because all the unique values appear in the response.

The `doc_count_error_upper_bound` field represents the maximum possible count for a unique value that's left out of the final results. Use this field to estimate the error margin for the count.

The count might not be accurate. The coordinating node that's responsible for the aggregation prompts each shard for its top unique terms. Imagine a scenario where the `size` parameter is 3.
The `terms` aggregation requests the top 3 unique terms from each shard. The coordinating node takes each of the results and aggregates them to compute the final result. If a shard has a term that's not part of its top 3, then it won't show up in the response.

This is especially true if `size` is set to a low number. Because the default `size` is 10, an error is unlikely to happen. If you don't need high accuracy and want to increase performance, you can reduce the `size`.

## sampler, diversified_sampler

If you're aggregating over millions of documents, you can use a `sampler` aggregation to reduce its scope to a small sample of documents for a faster response. The `sampler` aggregation selects the samples by top-scoring documents.

The results are approximate but closely represent the distribution of the real data. The `sampler` aggregation significantly improves query performance, but the estimated responses are not entirely reliable.

The basic syntax is:

```json
"aggs": {
  "SAMPLE": {
    "sampler": {
      "shard_size": 100
    },
    "aggs": {...}
  }
}
```

The `shard_size` property tells OpenSearch how many documents (at most) to collect from each shard.

The following example limits the number of documents collected on each shard to 1,000 and then buckets the documents by a `terms` aggregation:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sample": {
      "sampler": {
        "shard_size": 1000
      },
      "aggs": {
        "terms": {
          "terms": {
            "field": "agent.keyword"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "sample" : {
    "doc_count" : 1000,
    "terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1",
          "doc_count" : 368
        },
        {
          "key" : "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24",
          "doc_count" : 329
        },
        {
          "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
          "doc_count" : 303
        }
      ]
    }
  }
 }
}
```

The `diversified_sampler` aggregation lets you reduce the bias in the distribution of the sample pool.
You can use the `field` setting to control the maximum number of documents collected on any one shard which shares a common value: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "sample": { + "diversified_sampler": { + "shard_size": 1000, + "field": "response.keyword" + }, + "aggs": { + "terms": { + "terms": { + "field": "agent.keyword" + } + } + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "sample" : { + "doc_count" : 3, + "terms" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1", + "doc_count" : 2 + }, + { + "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "doc_count" : 1 + } + ] + } + } + } +} +``` + +## significant_terms, significant_text + +The `significant_terms` aggregation lets you spot unusual or interesting term occurrences in a filtered subset relative to the rest of the data in an index. + +A foreground set is the set of documents that you filter. A background set is a set of all documents in an index. +The `significant_terms` aggregation examines all documents in the foreground set and finds a score for significant occurrences in contrast to the documents in the background set. + +In the sample web log data, each document has a field containing the `user-agent` of the visitor. This example searches for all requests from an iOS operating system. A regular `terms` aggregation on this foreground set returns Firefox because it has the most number of documents within this bucket. On the other hand, a `significant_terms` aggregation returns Internet Explorer (IE) because IE has a significantly higher appearance in the foreground set as compared to the background set. + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "query": { + "terms": { + "machine.os.keyword": [ + "ios" + ] + } + }, + "aggs": { + "significant_response_codes": { + "significant_terms": { + "field": "agent.keyword" + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "significant_response_codes" : { + "doc_count" : 2737, + "bg_count" : 14074, + "buckets" : [ + { + "key" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "doc_count" : 818, + "score" : 0.01462731514608217, + "bg_count" : 4010 + }, + { + "key" : "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1", + "doc_count" : 1067, + "score" : 0.009062566630410223, + "bg_count" : 5362 + } + ] + } + } +} +``` + +If the `significant_terms` aggregation doesn't return any result, you might have not filtered the results with a query. Alternatively, the distribution of terms in the foreground set might be the same as the background set, implying that there isn't anything unusual in the foreground set. + +The `significant_text` aggregation is similar to the `significant_terms` aggregation but it's for raw text fields. +Significant text measures the change in popularity measured between the foreground and background sets using statistical analysis. For example, it might suggest Tesla when you look for its stock acronym TSLA. + +The `significant_text` aggregation re-analyzes the source text on the fly, filtering noisy data like duplicate paragraphs, boilerplate headers and footers, and so on, which might otherwise skew the results. + +Re-analyzing high-cardinality datasets can be a very CPU-intensive operation. 
We recommend using the `significant_text` aggregation inside a sampler aggregation to limit the analysis to a small selection of top-matching documents, for example 200. + +You can set the following parameters: + +- `min_doc_count` - Return results that match more than a configured number of top hits. We recommend not setting `min_doc_count` to 1 because it tends to return terms that are typos or misspellings. Finding more than one instance of a term helps reinforce that the significance is not the result of a one-off accident. The default value of 3 is used to provide a minimum weight-of-evidence. +- `shard_size` - Setting a high value increases stability (and accuracy) at the expense of computational performance. +- `shard_min_doc_count` - If your text contains many low frequency words and you're not interested in these (for example typos), then you can set the `shard_min_doc_count` parameter to filter out candidate terms at a shard level with a reasonable certainty to not reach the required `min_doc_count` even after merging the local significant text frequencies. The default value is 1, which has no impact until you explicitly set it. We recommend setting this value much lower than the `min_doc_count` value. + +Assume that you have the complete works of Shakespeare indexed in an OpenSearch cluster. You can find significant texts in relation to the word "breathe" in the `text_entry` field: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "breathe" + } + }, + "aggregations": { + "my_sample": { + "sampler": { + "shard_size": 100 + }, + "aggregations": { + "keywords": { + "significant_text": { + "field": "text_entry", + "min_doc_count": 4 + } + } + } + } + } +} +``` + +#### Sample response + +```json +"aggregations" : { + "my_sample" : { + "doc_count" : 59, + "keywords" : { + "doc_count" : 59, + "bg_count" : 111396, + "buckets" : [ + { + "key" : "breathe", + "doc_count" : 59, + "score" : 1887.0677966101694, + "bg_count" : 59 + }, + { + "key" : "air", + "doc_count" : 4, + "score" : 2.641295376716233, + "bg_count" : 189 + }, + { + "key" : "dead", + "doc_count" : 4, + "score" : 0.9665839666414213, + "bg_count" : 495 + }, + { + "key" : "life", + "doc_count" : 5, + "score" : 0.9090787433467572, + "bg_count" : 805 + } + ] + } + } + } +} +``` + +The most significant texts in relation to `breathe` are `air`, `dead`, and `life`. + +The `significant_text` aggregation has the following limitations: + +- Doesn't support child aggregations because child aggregations come at a high memory cost. As a workaround, you can add a follow-up query using a `terms` aggregation with an include clause and a child aggregation. +- Doesn't support nested objects because it works with the document JSON source. +- The counts of documents might have some (typically small) inaccuracies as it's based on summing the samples returned from each shard. You can use the `shard_size` parameter to fine-tune the trade-off between accuracy and performance. By default, the `shard_size` is set to -1 to automatically estimate the number of shards and the `size` parameter. + +For both `significant_terms` and `significant_text` aggregations, the default source of statistical information for background term frequencies is the entire index. 
You can narrow this scope with a background filter for more focus: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "breathe" + } + }, + "aggregations": { + "my_sample": { + "sampler": { + "shard_size": 100 + }, + "aggregations": { + "keywords": { + "significant_text": { + "field": "text_entry", + "background_filter": { + "term": { + "speaker": "JOHN OF GAUNT" + } + } + } + } + } + } + } +} +``` + + +## missing + +If you have documents in your index that don’t contain the aggregating field at all or the aggregating field has a value of NULL, use the `missing` parameter to specify the name of the bucket such documents should be placed in. + +The following example adds any missing values to a bucket named "N/A": + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field": "response.keyword", + "size": 10, + "missing": "N/A" + } + } + } +} +``` + +Because the default value for the `min_doc_count` parameter is 1, the `missing` parameter doesn't return any buckets in its response. Set `min_doc_count` parameter to 0 to see the "N/A" bucket in the response: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field": "response.keyword", + "size": 10, + "missing": "N/A", + "min_doc_count": 0 + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "response_codes" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "200", + "doc_count" : 12832 + }, + { + "key" : "404", + "doc_count" : 801 + }, + { + "key" : "503", + "doc_count" : 441 + }, + { + "key" : "N/A", + "doc_count" : 0 + } + ] + } + } +} +``` + +## histogram, date_histogram + +The `histogram` aggregation buckets documents based on a specified interval. + +With `histogram` aggregations, you can visualize the distributions of values in a given range of documents very easily. Now OpenSearch doesn’t give you back an actual graph of course, that’s what OpenSearch Dashboards is for. But it'll give you the JSON response that you can use to construct your own graph. + +The following example buckets the `number_of_bytes` field by 10,000 intervals: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "histogram": { + "field": "bytes", + "interval": 10000 + } + } + } +} +``` + +Sample Response + +```json +... +"aggregations" : { + "number_of_bytes" : { + "buckets" : [ + { + "key" : 0.0, + "doc_count" : 13372 + }, + { + "key" : 10000.0, + "doc_count" : 702 + } + ] + } + } +} +``` + +The `date_histogram` aggregation uses date math to generate histograms for time-series data. + +For example, you can find how many hits your website gets per month: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "logs_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "logs_per_month" : { + "buckets" : [ + { + "key_as_string" : "2020-10-01T00:00:00.000Z", + "key" : 1601510400000, + "doc_count" : 1635 + }, + { + "key_as_string" : "2020-11-01T00:00:00.000Z", + "key" : 1604188800000, + "doc_count" : 6844 + }, + { + "key_as_string" : "2020-12-01T00:00:00.000Z", + "key" : 1606780800000, + "doc_count" : 5595 + } + ] + } +} +} +``` + +The response has three months worth of logs. 
If you graph these values, you can see the peak and valleys of the request traffic to your website month over month. + +## range, date_range, ip_range + +The `range` aggregation lets you define the range for each bucket. + +For example, you can find the number of bytes between 1000 and 2000, 2000 and 3000, and 3000 and 4000. +Within the `range` parameter, you can define ranges as objects of an array. + + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes_distribution": { + "range": { + "field": "bytes", + "ranges": [ + { + "from": 1000, + "to": 2000 + }, + { + "from": 2000, + "to": 3000 + }, + { + "from": 3000, + "to": 4000 + } + ] + } + } + } +} +``` + +The response includes the `from` key values and excludes the `to` key values: + +#### Sample response + +```json +... +"aggregations" : { + "number_of_bytes_distribution" : { + "buckets" : [ + { + "key" : "1000.0-2000.0", + "from" : 1000.0, + "to" : 2000.0, + "doc_count" : 805 + }, + { + "key" : "2000.0-3000.0", + "from" : 2000.0, + "to" : 3000.0, + "doc_count" : 1369 + }, + { + "key" : "3000.0-4000.0", + "from" : 3000.0, + "to" : 4000.0, + "doc_count" : 1422 + } + ] + } + } +} +``` + +The `date_range` aggregation is conceptually the same as the `range` aggregation, except that it lets you perform date math. +For example, you can get all documents from the last 10 days. To make the date more readable, include the format with a `format` parameter: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "date_range": { + "field": "@timestamp", + "format": "MM-yyyy", + "ranges": [ + { + "from": "now-10d/d", + "to": "now" + } + ] + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "number_of_bytes" : { + "buckets" : [ + { + "key" : "03-2021-03-2021", + "from" : 1.6145568E12, + "from_as_string" : "03-2021", + "to" : 1.615451329043E12, + "to_as_string" : "03-2021", + "doc_count" : 0 + } + ] + } + } +} +``` + +The `ip_range` aggregation is for IP addresses. +It works on `ip` type fields. You can define the IP ranges and masks in the [CIDR](http://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) notation. + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "access": { + "ip_range": { + "field": "ip", + "ranges": [ + { + "from": "1.0.0.0", + "to": "126.158.155.183" + }, + { + "mask": "1.0.0.0/8" + } + ] + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "access" : { + "buckets" : [ + { + "key" : "1.0.0.0/8", + "from" : "1.0.0.0", + "to" : "2.0.0.0", + "doc_count" : 98 + }, + { + "key" : "1.0.0.0-126.158.155.183", + "from" : "1.0.0.0", + "to" : "126.158.155.183", + "doc_count" : 7184 + } + ] + } + } +} +``` + +## filter, filters + +A `filter` aggregation is a query clause, exactly like a search query — `match` or `term` or `range`. You can use the `filter` aggregation to narrow down the entire set of documents to a specific set before creating buckets. + +The following example shows the `avg` aggregation running within the context of a filter. 
The `avg` aggregation only aggregates the documents that match the `range` query:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "low_value": {
      "filter": {
        "range": {
          "taxful_total_price": {
            "lte": 50
          }
        }
      },
      "aggs": {
        "avg_amount": {
          "avg": {
            "field": "taxful_total_price"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "low_value" : {
    "doc_count" : 1633,
    "avg_amount" : {
      "value" : 38.363175998928355
    }
  }
 }
}
```

A `filters` aggregation is the same as the `filter` aggregation, except that it lets you use multiple filter aggregations.
While the `filter` aggregation results in a single bucket, the `filters` aggregation returns multiple buckets, one for each of the defined filters.

To create a bucket for all the documents that didn't match any of the filter queries, set the `other_bucket` property to `true`:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "200_os": {
      "filters": {
        "other_bucket": true,
        "filters": [
          {
            "term": {
              "response.keyword": "200"
            }
          },
          {
            "term": {
              "machine.os.keyword": "osx"
            }
          }
        ]
      },
      "aggs": {
        "avg_amount": {
          "avg": {
            "field": "bytes"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "200_os" : {
    "buckets" : [
      {
        "doc_count" : 12832,
        "avg_amount" : {
          "value" : 5897.852711970075
        }
      },
      {
        "doc_count" : 2825,
        "avg_amount" : {
          "value" : 5620.347256637168
        }
      },
      {
        "doc_count" : 1017,
        "avg_amount" : {
          "value" : 3247.0963618485744
        }
      }
    ]
  }
 }
}
```

## global

The `global` aggregation lets you break out of the aggregation context of a filter aggregation. Even if you have included a filter query that narrows down a set of documents, the `global` aggregation aggregates on all documents as if the filter query wasn't there. It ignores the `filter` aggregation and implicitly assumes the `match_all` query.

The following example returns the `avg` value of the `taxful_total_price` field from all documents in the index:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "query": {
    "range": {
      "taxful_total_price": {
        "lte": 50
      }
    }
  },
  "aggs": {
    "total_avg_amount": {
      "global": {},
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "taxful_total_price"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "total_avg_amount" : {
    "doc_count" : 4675,
    "avg_price" : {
      "value" : 75.05542864304813
    }
  }
 }
}
```

You can see that the average value for the `taxful_total_price` field is 75.05, not the 38.36 returned in the `filter` example, because the `global` aggregation ignores the `range` query.

## geo_distance, geohash_grid

The `geo_distance` aggregation groups documents into concentric circles based on distances from an origin `geo_point` field.
It's the same as the `range` aggregation, except that it works on geo locations.

For example, you can use the `geo_distance` aggregation to find all pizza places within 1 km of you. The search results are limited to the 1 km radius you specify, but you can add another bucket for results found within 2 km.

You can only use the `geo_distance` aggregation on fields mapped as `geo_point`.

A point is a single geographical coordinate, such as your current location shown by your smartphone.
A point in OpenSearch is represented as follows: + +```json +{ + "location": { + "type": "point", + "coordinates": { + "lat": 83.76, + "lon": -81.2 + } + } +} +``` + +You can also specify the latitude and longitude as an array `[-81.20, 83.76]` or as a string `"83.76, -81.20"` + +This table lists the relevant fields of a `geo_distance` aggregation: + +Field | Description | Required +:--- | :--- |:--- +`field` | Specify the geo point field that you want to work on. | Yes +`origin` | Specify the geo point that's used to compute the distances from. | Yes +`ranges` | Specify a list of ranges to collect documents based on their distance from the target point. | Yes +`unit` | Define the units used in the `ranges` array. The `unit` defaults to `m` (meters), but you can switch to other units like `km` (kilometers), `mi` (miles), `in` (inches), `yd` (yards), `cm` (centimeters), and `mm` (millimeters). | No +`distance_type` | Specify how OpenSearch calculates the distance. The default is `sloppy_arc` (faster but less accurate), but can also be set to `arc` (slower but most accurate) or `plane` (fastest but least accurate). Because of high error margins, use `plane` only for small geographic areas. | No + +The syntax is as follows: + +```json +{ + "aggs": { + "aggregation_name": { + "geo_distance": { + "field": "field_1", + "origin": "x, y", + "ranges": [ + { + "to": "value_1" + }, + { + "from": "value_2", + "to": "value_3" + }, + { + "from": "value_4" + } + ] + } + } + } +} +``` + +This example forms buckets from the following distances from a `geo-point` field: + +- Fewer than 10 km +- From 10 to 20 km +- From 20 to 50 km +- From 50 to 100 km +- Above 100 km + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "position": { + "geo_distance": { + "field": "geo.coordinates", + "origin": { + "lat": 83.76, + "lon": -81.2 + }, + "ranges": [ + { + "to": 10 + }, + { + "from": 10, + "to": 20 + }, + { + "from": 20, + "to": 50 + }, + { + "from": 50, + "to": 100 + }, + { + "from": 100 + } + ] + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "position" : { + "buckets" : [ + { + "key" : "*-10.0", + "from" : 0.0, + "to" : 10.0, + "doc_count" : 0 + }, + { + "key" : "10.0-20.0", + "from" : 10.0, + "to" : 20.0, + "doc_count" : 0 + }, + { + "key" : "20.0-50.0", + "from" : 20.0, + "to" : 50.0, + "doc_count" : 0 + }, + { + "key" : "50.0-100.0", + "from" : 50.0, + "to" : 100.0, + "doc_count" : 0 + }, + { + "key" : "100.0-*", + "from" : 100.0, + "doc_count" : 14074 + } + ] + } + } +} +``` + +The `geohash_grid` aggregation buckets documents for geographical analysis. It organizes a geographical region into a grid of smaller regions of different sizes or precisions. Lower values of precision represent larger geographical areas and higher values represent smaller, more precise geographical areas. + +The number of results returned by a query might be far too many to display each geo point individually on a map. The `geohash_grid` aggregation buckets nearby geo points together by calculating the Geohash for each point, at the level of precision that you define (between 1 to 12; the default is 5). To learn more about Geohash, see [Wikipedia](http://en.wikipedia.org/wiki/Geohash). + +The web logs example data is spread over a large geographical area, so you can use a lower precision value. 
You can zoom in on this map by increasing the precision value: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "geo_hash": { + "geohash_grid": { + "field": "geo.coordinates", + "precision": 4 + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "geo_hash" : { + "buckets" : [ + { + "key" : "c1cg", + "doc_count" : 104 + }, + { + "key" : "dr5r", + "doc_count" : 26 + }, + { + "key" : "9q5b", + "doc_count" : 20 + }, + { + "key" : "c20g", + "doc_count" : 19 + }, + { + "key" : "dr70", + "doc_count" : 18 + } + ... + ] + } + } +} +``` + +You can visualize the aggregated response on a map using OpenSearch Dashboards. + +The more accurate you want the aggregation to be, the more resources OpenSearch consumes, because of the number of buckets that the aggregation has to calculate. By default, OpenSearch does not generate more than 10,000 buckets. You can change this behavior by using the `size` attribute, but keep in mind that the performance might suffer for very wide queries consisting of thousands of buckets. + +## adjacency_matrix + +The `adjacency_matrix` aggregation lets you define filter expressions and returns a matrix of the intersecting filters where each non-empty cell in the matrix represents a bucket. You can find how many documents fall within any combination of filters. + +Use the `adjacency_matrix` aggregation to discover how concepts are related by visualizing the data as graphs. + +For example, in the sample eCommerce dataset, to analyze how the different manufacturing companies are related: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "interactions": { + "adjacency_matrix": { + "filters": { + "grpA": { + "match": { + "manufacturer.keyword": "Low Tide Media" + } + }, + "grpB": { + "match": { + "manufacturer.keyword": "Elitelligence" + } + }, + "grpC": { + "match": { + "manufacturer.keyword": "Oceanavigations" + } + } + } + } + } + } +} +``` + +#### Sample response + + ```json + { + ... + "aggregations" : { + "interactions" : { + "buckets" : [ + { + "key" : "grpA", + "doc_count" : 1553 + }, + { + "key" : "grpA&grpB", + "doc_count" : 590 + }, + { + "key" : "grpA&grpC", + "doc_count" : 329 + }, + { + "key" : "grpB", + "doc_count" : 1370 + }, + { + "key" : "grpB&grpC", + "doc_count" : 299 + }, + { + "key" : "grpC", + "doc_count" : 1218 + } + ] + } + } + } +``` + + Let’s take a closer look at the result: + + ```json + { + "key" : "grpA&grpB", + "doc_count" : 590 + } + ``` + +- `grpA`: Products manufactured by Low Tide Media. +- `grpB`: Products manufactured by Elitelligence. +- `590`: Number of products that are manufactured by both. + +You can use OpenSearch Dashboards to represent this data with a network graph. + +## nested, reverse_nested + +The `nested` aggregation lets you aggregate on fields inside a nested object. The `nested` type is a specialized version of the object data type that allows arrays of objects to be indexed in a way that they can be queried independently of each other + +With the `object` type, all the data is stored in the same document, so matches for a search can go across sub documents. 
For example, imagine a `logs` index with `pages` mapped as an `object` datatype:

```json
PUT logs/_doc/0
{
  "response": "200",
  "pages": [
    {
      "page": "landing",
      "load_time": 200
    },
    {
      "page": "blog",
      "load_time": 500
    }
  ]
}
```

OpenSearch flattens all sub-properties of the entity, so internally the relations look something like this:

```json
{
  "logs": {
    "pages": ["landing", "blog"],
    "load_time": ["200", "500"]
  }
}
```

So, if you search this index with `pages=landing` and `load_time=500`, this document matches the criteria even though the `load_time` value for the landing page is 200.

If you want to make sure such cross-object matches don't happen, map the field as a `nested` type:

```json
PUT logs
{
  "mappings": {
    "properties": {
      "pages": {
        "type": "nested",
        "properties": {
          "page": { "type": "text" },
          "load_time": { "type": "double" }
        }
      }
    }
  }
}
```

Nested documents allow you to index the same JSON document but will keep your pages in separate Lucene documents, making only searches like `pages=landing` and `load_time=200` return the expected result. Internally, nested objects index each object in the array as a separate hidden document, meaning that each nested object can be queried independently of the others.

You have to specify a nested path relative to the parent that contains the nested documents:


```json
GET logs/_search
{
  "query": {
    "match": { "response": "200" }
  },
  "aggs": {
    "pages": {
      "nested": {
        "path": "pages"
      },
      "aggs": {
        "min_load_time": { "min": { "field": "pages.load_time" } }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "pages" : {
    "doc_count" : 2,
    "min_load_time" : {
      "value" : 200.0
    }
  }
 }
}
```

You can also aggregate values from nested documents to their parent; this aggregation is called `reverse_nested`.
You can use `reverse_nested` to aggregate a field from the parent document after grouping by the field from the nested object. The `reverse_nested` aggregation "joins back" the root page and gets the `load_time` for each of your variations.

The `reverse_nested` aggregation is a sub-aggregation inside a nested aggregation. It accepts a single option named `path`. This option defines how many steps backwards in the document hierarchy OpenSearch takes to calculate the aggregations.

```json
GET logs/_search
{
  "query": {
    "match": { "response": "200" }
  },
  "aggs": {
    "pages": {
      "nested": {
        "path": "pages"
      },
      "aggs": {
        "top_pages_per_load_time": {
          "terms": {
            "field": "pages.load_time"
          },
          "aggs": {
            "comment_to_logs": {
              "reverse_nested": {},
              "aggs": {
                "min_load_time": {
                  "min": {
                    "field": "pages.load_time"
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "pages" : {
    "doc_count" : 2,
    "top_pages_per_load_time" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : 200.0,
          "doc_count" : 1,
          "comment_to_logs" : {
            "doc_count" : 1,
            "min_load_time" : {
              "value" : null
            }
          }
        },
        {
          "key" : 500.0,
          "doc_count" : 1,
          "comment_to_logs" : {
            "doc_count" : 1,
            "min_load_time" : {
              "value" : null
            }
          }
        }
      ]
    }
  }
 }
}
```

The response shows the logs index has one page with a `load_time` of 200 and one with a `load_time` of 500.
diff --git a/docs/opensearch/metric-agg.md b/docs/opensearch/metric-agg.md
new file mode 100644
index 00000000..380df4d6
--- /dev/null
+++ b/docs/opensearch/metric-agg.md
@@ -0,0 +1,637 @@
---
layout: default
title: Metric Aggregations
parent: Aggregations
grand_parent: OpenSearch
nav_order: 1
has_children: false
---

# Metric Aggregations

Metric aggregations let you perform simple calculations such as finding the minimum, maximum, and average values of a field.

## Types of metric aggregations

Metric aggregations are of two types: single-value metric aggregations and multi-value metric aggregations.

### Single-value metric aggregations

Single-value metric aggregations return a single metric. For example, `sum`, `min`, `max`, `avg`, `cardinality`, and `value_count`.

### Multi-value metric aggregations

Multi-value metric aggregations return more than one metric. For example, `stats`, `extended_stats`, `matrix_stats`, `percentiles`, `percentile_ranks`, `geo_bounds`, `top_hits`, and `scripted_metric`.

## sum, min, max, avg

The `sum`, `min`, `max`, and `avg` metrics are single-value metric aggregations that return the sum, minimum, maximum, and average values of a field, respectively.

The following example calculates the total sum of the `taxful_total_price` field:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "sum_taxful_total_price": {
      "sum": {
        "field": "taxful_total_price"
      }
    }
  }
}
```

#### Sample response

```json
...
 "aggregations" : {
    "sum_taxful_total_price" : {
      "value" : 350884.12890625
    }
  }
}
```

In a similar fashion, you can find the minimum, maximum, and average values of a field.

## cardinality

The `cardinality` metric is a single-value metric aggregation that counts the number of unique or distinct values of a field.

The following example finds the number of unique products in an eCommerce store:

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "unique_products": {
      "cardinality": {
        "field": "products.product_id"
      }
    }
  }
}
```

#### Sample response

```json
...
 "aggregations" : {
    "unique_products" : {
      "value" : 7033
    }
  }
}
```

The cardinality count is approximate.
If you have tens of thousands of products in your store, an accurate cardinality calculation requires loading all the values into a hash set and returning its size. This approach doesn't scale well because it requires more memory and causes high latency.

You can control the trade-off between memory and accuracy with the `precision_threshold` setting. This setting defines the threshold below which counts are expected to be close to accurate. Above this value, counts might become a bit less accurate. The default value of `precision_threshold` is 3,000. The maximum supported value is 40,000.

```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
  "size": 0,
  "aggs": {
    "unique_products": {
      "cardinality": {
        "field": "products.product_id",
        "precision_threshold": 10000
      }
    }
  }
}
```

## value_count

The `value_count` metric is a single-value metric aggregation that calculates the number of values that an aggregation is based on.

For example, you can use the `value_count` metric with the `avg` metric to find how many numbers the aggregation uses to calculate an average value.
+ +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "number_of_values": { + "value_count": { + "field": "taxful_total_price" + } + } + } +} +``` + +#### Sample response + +```json +... + "aggregations" : { + "number_of_values" : { + "value" : 4675 + } + } +} +``` + +## stats, extended_stats, matrix_stats + +The `stats` metric is a multi-value metric aggregation that returns all basic metrics such as `min`, `max`, `sum`, `avg`, and `value_count` in one aggregation query. + +The following example returns the basic stats for the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "stats_taxful_total_price": { + "stats": { + "field": "taxful_total_price" + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "stats_taxful_total_price" : { + "count" : 4675, + "min" : 6.98828125, + "max" : 2250.0, + "avg" : 75.05542864304813, + "sum" : 350884.12890625 + } + } +} +``` + +The `extended_stats` aggregation is an extended version of the `stats` aggregation. Apart from including basic stats, `extended_stats` also returns stats such as `sum_of_squares`, `variance`, and `std_deviation`. + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "extended_stats_taxful_total_price": { + "extended_stats": { + "field": "taxful_total_price" + } + } + } +} +``` + +#### Sample Response + +```json +... +"aggregations" : { + "extended_stats_taxful_total_price" : { + "count" : 4675, + "min" : 6.98828125, + "max" : 2250.0, + "avg" : 75.05542864304813, + "sum" : 350884.12890625, + "sum_of_squares" : 3.9367749294174194E7, + "variance" : 2787.59157113862, + "variance_population" : 2787.59157113862, + "variance_sampling" : 2788.187974983536, + "std_deviation" : 52.79764740155209, + "std_deviation_population" : 52.79764740155209, + "std_deviation_sampling" : 52.80329511482722, + "std_deviation_bounds" : { + "upper" : 180.6507234461523, + "lower" : -30.53986616005605, + "upper_population" : 180.6507234461523, + "lower_population" : -30.53986616005605, + "upper_sampling" : 180.66201887270256, + "lower_sampling" : -30.551161586606312 + } + } + } +} +``` + +The `std_deviation_bounds` object provides a visual variance of the data with an interval of plus/minus two standard deviations from the mean. +To set the standard deviation to a different value, say 3, set `sigma` to 3: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "extended_stats_taxful_total_price": { + "extended_stats": { + "field": "taxful_total_price", + "sigma": 3 + } + } + } +} +``` + +The `matrix_stats` aggregation generates advanced stats for multiple fields in a matrix form. +The following example returns advanced stats in a matrix form for the `taxful_total_price` and `products.base_price` fields: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "matrix_stats_taxful_total_price": { + "matrix_stats": { + "fields": ["taxful_total_price", "products.base_price"] + } + } + } +} +``` + +#### Sample response + +```json +... 
+"aggregations" : { + "matrix_stats_taxful_total_price" : { + "doc_count" : 4675, + "fields" : [ + { + "name" : "products.base_price", + "count" : 4675, + "mean" : 34.994239430147196, + "variance" : 360.5035285833703, + "skewness" : 5.530161335032702, + "kurtosis" : 131.16306324042148, + "covariance" : { + "products.base_price" : 360.5035285833703, + "taxful_total_price" : 846.6489362233166 + }, + "correlation" : { + "products.base_price" : 1.0, + "taxful_total_price" : 0.8444765264325268 + } + }, + { + "name" : "taxful_total_price", + "count" : 4675, + "mean" : 75.05542864304839, + "variance" : 2788.1879749835402, + "skewness" : 15.812149139924037, + "kurtosis" : 619.1235507385902, + "covariance" : { + "products.base_price" : 846.6489362233166, + "taxful_total_price" : 2788.1879749835402 + }, + "correlation" : { + "products.base_price" : 0.8444765264325268, + "taxful_total_price" : 1.0 + } + } + ] + } + } +} +``` + +Statistic | Description +:--- | :--- +`count` | The number of samples measured. +`mean` | The average value of the field measured from the sample. +`variance` | How far the values of the field measured are spread out from its mean value. The larger the variance, the more it's spread from its mean value. +`skewness` | An asymmetric measure of the distribution of the field's values around the mean. +`kurtosis` | A measure of the tail heaviness of a distribution. As the tail becomes lighter, kurtosis decreases. As the tail becomes heavier, kurtosis increases. To learn about kurtosis, see [Wikipedia](https://en.wikipedia.org/wiki/Kurtosis). +`covariance` | A measure of the joint variability between two fields. A positive value means their values move in the same direction and vice versa. +`correlation` | A measure of the strength of the relationship between two fields. The valid values are between [-1, 1]. A value of -1 means that the value is negatively correlated and a value of 1 means that it's positively correlated. A value of 0 means that there's no identifiable relationship between them. + +## percentile, percentile_ranks + +Percentile is the percentage of the data that's at or below a certain threshold value. + +The `percentile` metric is a multi-value metric aggregation that lets you find outliers in your data or figure out the distribution of your data. + +Like the `cardinality` metric, the `percentile` metric is also approximate. + +The following example calculates the percentile in relation to the `taxful_total_price` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "percentile_taxful_total_price": { + "percentiles": { + "field": "taxful_total_price" + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "percentile_taxful_total_price" : { + "values" : { + "1.0" : 21.984375, + "5.0" : 27.984375, + "25.0" : 44.96875, + "50.0" : 64.22061688311689, + "75.0" : 93.0, + "95.0" : 156.0, + "99.0" : 222.0 + } + } + } +} +``` + +Percentile rank is the percentile of values at or below a threshold grouped by a specified value. For example, if a value is greater than or equal to 80% of the values, it has a percentile rank of 80. + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "percentile_rank_taxful_total_price": { + "percentile_ranks": { + "field": "taxful_total_price", + "values": [ + 10, + 15 + ] + } + } + } +} +``` + +#### Sample response + +```json +... 
+"aggregations" : { + "percentile_rank_taxful_total_price" : { + "values" : { + "10.0" : 0.055096056411283456, + "15.0" : 0.0830092961834656 + } + } + } +} +``` + +## geo_bound + +The `geo_bound` metric is a multi-value metric aggregation that calculates the bounding box in terms of latitude and longitude around a `geo_point` field. + +The following example returns the `geo_bound` metrics for the `geoip.location` field: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "geo": { + "geo_bounds": { + "field": "geoip.location" + } + } + } +} +``` + +#### Sample response + +```json +"aggregations" : { + "geo" : { + "bounds" : { + "top_left" : { + "lat" : 52.49999997206032, + "lon" : -118.20000001229346 + }, + "bottom_right" : { + "lat" : 4.599999985657632, + "lon" : 55.299999956041574 + } + } + } + } +} +``` + +## top_hits + +The `top_hits` metric is a multi-value metric aggregation that ranks the matching documents based on a relevance score for the field that's being aggregated. + +You can specify the following options: + +- `from`: The starting position of the hit. +- `size`: The maximum size of hits to return. The default value is 3. +- `sort`: How the matching hits are sorted. By default, the hits are sorted by the relevance score of the aggregation query. + +The following example returns the top 5 products in your eCommerce data: + +```json +GET opensearch_dashboards_sample_data_ecommerce/_search +{ + "size": 0, + "aggs": { + "top_hits_products": { + "top_hits": { + "size": 5 + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "top_hits_products" : { + "hits" : { + "total" : { + "value" : 4675, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "opensearch_dashboards_sample_data_ecommerce", + "_type" : "_doc", + "_id" : "glMlwXcBQVLeQPrkHPtI", + "_score" : 1.0, + "_source" : { + "category" : [ + "Women's Accessories", + "Women's Clothing" + ], + "currency" : "EUR", + "customer_first_name" : "rania", + "customer_full_name" : "rania Evans", + "customer_gender" : "FEMALE", + "customer_id" : 24, + "customer_last_name" : "Evans", + "customer_phone" : "", + "day_of_week" : "Sunday", + "day_of_week_i" : 6, + "email" : "rania@evans-family.zzz", + "manufacturer" : [ + "Tigress Enterprises" + ], + "order_date" : "2021-02-28T14:16:48+00:00", + "order_id" : 583581, + "products" : [ + { + "base_price" : 10.99, + "discount_percentage" : 0, + "quantity" : 1, + "manufacturer" : "Tigress Enterprises", + "tax_amount" : 0, + "product_id" : 19024, + "category" : "Women's Accessories", + "sku" : "ZO0082400824", + "taxless_price" : 10.99, + "unit_discount_amount" : 0, + "min_price" : 5.17, + "_id" : "sold_product_583581_19024", + "discount_amount" : 0, + "created_on" : "2016-12-25T14:16:48+00:00", + "product_name" : "Snood - white/grey/peach", + "price" : 10.99, + "taxful_price" : 10.99, + "base_unit_price" : 10.99 + }, + { + "base_price" : 32.99, + "discount_percentage" : 0, + "quantity" : 1, + "manufacturer" : "Tigress Enterprises", + "tax_amount" : 0, + "product_id" : 19260, + "category" : "Women's Clothing", + "sku" : "ZO0071900719", + "taxless_price" : 32.99, + "unit_discount_amount" : 0, + "min_price" : 17.15, + "_id" : "sold_product_583581_19260", + "discount_amount" : 0, + "created_on" : "2016-12-25T14:16:48+00:00", + "product_name" : "Cardigan - grey", + "price" : 32.99, + "taxful_price" : 32.99, + "base_unit_price" : 32.99 + } + ], + "sku" : [ + "ZO0082400824", + "ZO0071900719" + ], + 
"taxful_total_price" : 43.98, + "taxless_total_price" : 43.98, + "total_quantity" : 2, + "total_unique_products" : 2, + "type" : "order", + "user" : "rani", + "geoip" : { + "country_iso_code" : "EG", + "location" : { + "lon" : 31.3, + "lat" : 30.1 + }, + "region_name" : "Cairo Governorate", + "continent_name" : "Africa", + "city_name" : "Cairo" + }, + "event" : { + "dataset" : "sample_ecommerce" + } + } + ... + } + ] + } + } + } +} +``` + +## scripted_metric + +The `scripted_metric` metric is a multi-value metric aggregation that returns metrics calculated from a specified script. + +A script has four stages: the initial stage, the map stage, the combine stage, and the reduce stage. + +* `init_script`: (OPTIONAL) Sets the initial state and executes before any collection of documents. +* `map_script`: Checks the value of the `type` field and executes the aggregation on the collected documents. +* `combine_script`: Aggregates the state returned from every shard. The aggregated value is returned to the coordinating node. +* `reduce_script`: Provides access to the variable states; this variable combines the results from the `combine_script` on each shard into an array. + +The following example aggregates the different HTTP response types in web log data: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggregations": { + "responses.counts": { + "scripted_metric": { + "init_script": "state.responses = ['error':0L,'success':0L,'other':0L]", + "map_script": """ + def code = doc['response.keyword'].value; + if (code.startsWith('5') || code.startsWith('4')) { + state.responses.error += 1 ; + } else if(code.startsWith('2')) { + state.responses.success += 1; + } else { + state.responses.other += 1; + } + """, + "combine_script": "state.responses", + "reduce_script": """ + def counts = ['error': 0L, 'success': 0L, 'other': 0L]; + for (responses in states) { + counts.error += responses['error']; + counts.success += responses['success']; + counts.other += responses['other']; + } + return counts; + """ + } + } + } +} +``` + +#### Sample Response + +```json +... +"aggregations" : { + "responses.counts" : { + "value" : { + "other" : 0, + "success" : 12832, + "error" : 1242 + } + } + } +} +``` diff --git a/docs/opensearch/pipeline-agg.md b/docs/opensearch/pipeline-agg.md new file mode 100644 index 00000000..2aefd32f --- /dev/null +++ b/docs/opensearch/pipeline-agg.md @@ -0,0 +1,1259 @@ +--- +layout: default +title: Pipeline Aggregations +parent: Aggregations +grand_parent: OpenSearch +nav_order: 4 +has_children: false +--- + +# Pipeline Aggregations + +With pipeline aggregations, you can chain aggregations by piping the results of one aggregation as an input to another for a more nuanced output. + +You can use pipeline aggregations to compute complex statistical and mathematical measures like derivatives, moving averages, cumulative sums, and so on. + +## Pipeline aggregation syntax + +A pipeline aggregation uses the the `buckets_path` property to access the results of other aggregations. +The `buckets_path` property has a specific syntax: + +``` +buckets_path = [,]*[, ]; +``` + +where: + +- `AGG_NAME` is the name of the aggregation. +- `AGG_SEPARATOR` separates aggregations. It's represented as `>`. +- `METRIC_SEPARATOR` separates aggregations from its metrics. It's represented as `.`. +- `METRIC` is the name of the metric, in case of multi-value metric aggregations. + +For example, `my_sum.sum` selects the `sum` metric of an aggregation called `my_sum`. 
`popular_tags>my_sum.sum` nests `my_sum.sum` into the `popular_tags` aggregation. + +You can also specify the following additional parameters: + +- `gap_policy`: Real-world data can contain gaps or null values. You can specify the policy to deal with such missing data with the `gap_policy` property. You can either set the `gap_policy` property to `skip` to skip the missing data and continue from the next available value, or `insert_zeros` to replace the missing values with zero and continue running. +- `format`: The type of format for the output value. For example, `yyyy-MM-dd` for a date value. + +## Quick example + +To sum all the buckets returned by the `sum_total_memory` aggregation: + +```json +GET opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "number_of_bytes": { + "histogram": { + "field": "bytes", + "interval": 10000 + }, + "aggs": { + "sum_total_memory": { + "sum": { + "field": "phpmemory" + } + } + } + }, + "sum_copies": { + "sum_bucket": { + "buckets_path": "number_of_bytes>sum_total_memory" + } + } + } +} +``` + +#### Sample response + +```json +... +"aggregations" : { + "number_of_bytes" : { + "buckets" : [ + { + "key" : 0.0, + "doc_count" : 13372, + "sum_total_memory" : { + "value" : 9.12664E7 + } + }, + { + "key" : 10000.0, + "doc_count" : 702, + "sum_total_memory" : { + "value" : 0.0 + } + } + ] + }, + "sum_copies" : { + "value" : 9.12664E7 + } + } +} +``` + +## Types of pipeline aggregations + +Pipeline aggregations are of two types: + +### Sibling aggregations + +Sibling aggregations take the output of a nested aggregation and produce new buckets or new aggregations at the same level as the nested buckets. + +Sibling aggregations must be a multi-bucket aggregation (have multiple grouped values for a certain field) and the metric must be a numeric value. + +`min_bucket`, `max_bucket`, `sum_bucket`, and `avg_bucket` are common sibling aggregations. + +### Parent aggregations + +Parent aggregations take the output of an outer aggregation and produce new buckets or new aggregations at the same level as the existing buckets. + +Parent aggregations must have `min_doc_count` set to 0 (default for `histogram` aggregations) and the specified metric must be a numeric value. If `min_doc_count` is greater than `0`, some buckets are omitted, which might lead to incorrect results. + +`derivatives` and `cumulative_sum` are common parent aggregations. + +## avg_bucket, sum_bucket, min_bucket, max_bucket + +The `avg_bucket`, `sum_bucket`, `min_bucket`, and `max_bucket` aggregations are sibling aggregations that calculate the average, sum, minimum, and maximum values of a metric in each bucket of a previous aggregation. + +The following example creates a date histogram with a one-month interval. The `sum` sub-aggregation calculates the sum of all bytes for each month. Finally, the `avg_bucket` aggregation uses this sum to calculate the average number of bytes per month: + +```json +POST opensearch_dashboards_sample_data_logs/_search +{ + "size": 0, + "aggs": { + "visits_per_month": { + "date_histogram": { + "field": "@timestamp", + "interval": "month" + }, + "aggs": { + "sum_of_bytes": { + "sum": { + "field": "bytes" + } + } + } + }, + "avg_monthly_bytes": { + "avg_bucket": { + "buckets_path": "visits_per_month>sum_of_bytes" + } + } + } +} +``` + +#### Sample response + +```json +... 
#### Sample response

```json
...
"stats_monthly_visits" : {
  "count" : 3,
  "min" : 9400200.0,
  "max" : 3.8880434E7,
  "avg" : 2.6575229666666668E7,
  "sum" : 7.9725689E7,
  "sum_of_squares" : 2.588843392021381E15,
  "variance" : 1.5670496550438025E14,
  "variance_population" : 1.5670496550438025E14,
  "variance_sampling" : 2.3505744825657038E14,
  "std_deviation" : 1.251818539183616E7,
  "std_deviation_population" : 1.251818539183616E7,
  "std_deviation_sampling" : 1.5331583357780447E7,
  "std_deviation_bounds" : {
    "upper" : 5.161160045033899E7,
    "lower" : 1538858.8829943463,
    "upper_population" : 5.161160045033899E7,
    "lower_population" : 1538858.8829943463,
    "upper_sampling" : 5.723839638222756E7,
    "lower_sampling" : -4087937.0488942266
  }
}
}
}
```

## bucket_script, bucket_selector

The `bucket_script` aggregation is a parent aggregation that executes a script to perform per-bucket calculations of a previous aggregation. Make sure the metrics are of numeric type and the returned values are also numeric.

Use the `script` parameter to add your script. The script can be inline or stored. Inline scripts are enabled by default in OpenSearch.

For `bucket_script`, the `buckets_path` property consists of multiple entries. Each entry is a key and a value: the key is the name of the variable that you can reference in your script, and the value is the path to the metric that the variable reads.

The basic syntax is:

```json
{
  "bucket_script": {
    "buckets_path": {
      "my_var1": "the_sum",
      "my_var2": "the_value_count"
    },
    "script": "params.my_var1 / params.my_var2"
  }
}
```

The following example runs two `sum` aggregations inside each bucket of a histogram on the `bytes` field: one over all documents in the bucket (`total_ram`) and one over only the documents with a `zip` extension (`ext-type>total_ram`). The `bucket_script` aggregation then divides the two sums to get, for each 10,000-byte interval, the fraction of the total RAM that comes from `zip` files:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "histogram": {
        "field": "bytes",
        "interval": 10000
      },
      "aggs": {
        "total_ram": {
          "sum": {
            "field": "machine.ram"
          }
        },
        "ext-type": {
          "filter": {
            "term": {
              "extension.keyword": "zip"
            }
          },
          "aggs": {
            "total_ram": {
              "sum": {
                "field": "machine.ram"
              }
            }
          }
        },
        "ram-percentage": {
          "bucket_script": {
            "buckets_path": {
              "machineRam": "ext-type>total_ram",
              "totalRam": "total_ram"
            },
            "script": "params.machineRam / params.totalRam"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
"aggregations" : {
  "sales_per_month" : {
    "buckets" : [
      {
        "key" : 0.0,
        "doc_count" : 13372,
        "ext-type" : {
          "doc_count" : 1558,
          "total_ram" : {
            "value" : 2.0090783268864E13
          }
        },
        "total_ram" : {
          "value" : 1.7214228922368E14
        },
        "ram-percentage" : {
          "value" : 0.11671032934131736
        }
      },
      {
        "key" : 10000.0,
        "doc_count" : 702,
        "ext-type" : {
          "doc_count" : 116,
          "total_ram" : {
            "value" : 1.622423896064E12
          }
        },
        "total_ram" : {
          "value" : 9.015136354304E12
        },
        "ram-percentage" : {
          "value" : 0.17996665078608862
        }
      }
    ]
  }
}
}
```

The RAM percentage is calculated and appended at the end of each bucket.
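If you prefer to report the value on a 0-100 scale, a small variation of the `ram-percentage` aggregation above (a sketch, not part of the original example) scales the result in the script and adds the optional `format` parameter with a standard numeric pattern:

```json
"ram-percentage": {
  "bucket_script": {
    "buckets_path": {
      "machineRam": "ext-type>total_ram",
      "totalRam": "total_ram"
    },
    "script": "100 * params.machineRam / params.totalRam",
    "format": "0.00"
  }
}
```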
The `bucket_selector` aggregation is a script-based aggregation that selects the buckets returned by a `histogram` (or `date_histogram`) aggregation. Use it in scenarios where you don't want certain buckets in the output based on conditions you supply.

The `bucket_selector` aggregation executes a script to decide if a bucket stays in the parent multi-bucket aggregation. The script must return a Boolean value.

The basic syntax is:

```json
{
  "bucket_selector": {
    "buckets_path": {
      "my_var1": "the_sum",
      "my_var2": "the_value_count"
    },
    "script": "params.my_var1 > params.my_var2"
  }
}
```

The following example calculates the sum of bytes for each month and then evaluates whether this sum is greater than 20,000. If true, the bucket is retained in the bucket list. Otherwise, it's deleted from the final output:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "bytes_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_bucket_filter": {
          "bucket_selector": {
            "buckets_path": {
              "totalBytes": "total_bytes"
            },
            "script": "params.totalBytes > 20000"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
"aggregations" : {
  "bytes_per_month" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "total_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "total_bytes" : {
          "value" : 3.8880434E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "total_bytes" : {
          "value" : 3.1445055E7
        }
      }
    ]
  }
}
}
```

## bucket_sort

The `bucket_sort` aggregation is a parent aggregation that sorts buckets of a previous aggregation.

You can specify several sort fields together with the corresponding sort order. Additionally, you can sort each bucket based on its key, count, or its sub-aggregations. You can also truncate the buckets by setting the `from` and `size` parameters.

The basic syntax is:

```json
{
  "bucket_sort": {
    "sort": [
      {"sort_field_1": {"order": "asc"}},
      {"sort_field_2": {"order": "desc"}},
      "sort_field_3"
    ],
    "from": 1,
    "size": 3
  }
}
```

The following example sorts the buckets of a `date_histogram` aggregation based on the computed `total_bytes` values. We sort the buckets in descending order so that the buckets with the highest number of bytes are returned first:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_bucket_sort": {
          "bucket_sort": {
            "sort": [
              { "total_bytes": { "order": "desc" } }
            ],
            "size": 3
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
"aggregations" : {
  "sales_per_month" : {
    "buckets" : [
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "total_bytes" : {
          "value" : 3.8880434E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "total_bytes" : {
          "value" : 3.1445055E7
        }
      },
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "total_bytes" : {
          "value" : 9400200.0
        }
      }
    ]
  }
}
}
```

You can also use this aggregation to truncate the resulting buckets without sorting. To do so, just use the `from` and/or `size` parameters without `sort`, as shown in the following sketch.
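For example, the following hypothetical request skips the first monthly bucket and returns at most two of the remaining buckets, without changing their order (a sketch, assuming the same sample web log index):

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_bucket_truncate": {
          "bucket_sort": {
            "from": 1,
            "size": 2
          }
        }
      }
    }
  }
}
```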
## cumulative_sum

The `cumulative_sum` aggregation is a parent aggregation that calculates the cumulative sum of each bucket of a previous aggregation.

A cumulative sum is a sequence of partial sums of a given sequence. For example, the cumulative sums of the sequence `{a,b,c,…}` are `a`, `a+b`, `a+b+c`, and so on. You can use the cumulative sum to visualize the rate of change of a field over time.
The following example calculates the cumulative number of bytes on a monthly basis:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "no-of-bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "cumulative_bytes": {
          "cumulative_sum": {
            "buckets_path": "no-of-bytes"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "sales_per_month" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "no-of-bytes" : {
          "value" : 9400200.0
        },
        "cumulative_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "no-of-bytes" : {
          "value" : 3.8880434E7
        },
        "cumulative_bytes" : {
          "value" : 4.8280634E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "no-of-bytes" : {
          "value" : 3.1445055E7
        },
        "cumulative_bytes" : {
          "value" : 7.9725689E7
        }
      }
    ]
  }
}
}
```

## derivative

The `derivative` aggregation is a parent aggregation that calculates 1st order and 2nd order derivatives of each bucket of a previous aggregation.

In mathematics, the derivative of a function measures its sensitivity to change. In other words, a derivative evaluates the rate of change in some function with respect to some variable. To learn more about derivatives, see [Wikipedia](https://en.wikipedia.org/wiki/Derivative).

You can use derivatives to calculate the rate of change of numeric values compared to previous time periods.

The 1st order derivative indicates whether a metric is increasing or decreasing, and by how much it's increasing or decreasing.

The following example calculates the 1st order derivative for the sum of bytes per month. The 1st order derivative is the difference between the number of bytes in the current month and the previous month:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "number_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_deriv": {
          "derivative": {
            "buckets_path": "number_of_bytes"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "sales_per_month" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "number_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "number_of_bytes" : {
          "value" : 3.8880434E7
        },
        "bytes_deriv" : {
          "value" : 2.9480234E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "number_of_bytes" : {
          "value" : 3.1445055E7
        },
        "bytes_deriv" : {
          "value" : -7435379.0
        }
      }
    ]
  }
}
}
```

The 2nd order derivative is a double derivative or a derivative of the derivative.
It indicates how the rate of change of a quantity is itself changing. It's the difference between the 1st order derivatives of adjacent buckets.

To calculate a 2nd order derivative, chain one derivative aggregation to another:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "number_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_deriv": {
          "derivative": {
            "buckets_path": "number_of_bytes"
          }
        },
        "bytes_2nd_deriv": {
          "derivative": {
            "buckets_path": "bytes_deriv"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "sales_per_month" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "number_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "number_of_bytes" : {
          "value" : 3.8880434E7
        },
        "bytes_deriv" : {
          "value" : 2.9480234E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "number_of_bytes" : {
          "value" : 3.1445055E7
        },
        "bytes_deriv" : {
          "value" : -7435379.0
        },
        "bytes_2nd_deriv" : {
          "value" : -3.6915613E7
        }
      }
    ]
  }
}
}
```

The first bucket doesn't have a 1st order derivative because a derivative needs at least two points for comparison. The first and second buckets don't have a 2nd order derivative because a 2nd order derivative needs at least two data points from the 1st order derivative.

The 1st order derivative for the "2020-11-01" bucket is 2.9480234E7 and for the "2020-12-01" bucket is -7435379. So, the 2nd order derivative of the "2020-12-01" bucket is -3.6915613E7 (-7435379 - 2.9480234E7).

Theoretically, you could continue chaining derivative aggregations to calculate the third, the fourth, and even higher-order derivatives. That would, however, provide little to no value for most datasets.
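The `derivative` aggregation also accepts a `unit` parameter, which normalizes the derivative to a rate per time unit and reports it in the response as `normalized_value` alongside the raw `value`. The following minimal sketch (assuming the same monthly histogram as above) reports the change in bytes per day:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "sales_per_month": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "number_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "bytes_deriv": {
          "derivative": {
            "buckets_path": "number_of_bytes",
            "unit": "day"
          }
        }
      }
    }
  }
}
```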
## moving_avg

A `moving_avg` aggregation is a parent aggregation that calculates the moving average metric.

The `moving_avg` aggregation finds the series of averages of different windows (subsets) of a dataset. A window's size represents the number of data points covered by the window on each iteration (specified by the `window` property and set to 5 by default). On each iteration, the algorithm calculates the average for all data points that fit into the window and then slides forward by excluding the first member of the previous window and including the first member from the next window.

For example, given the data `[1, 5, 8, 23, 34, 28, 7, 23, 20, 19]`, you can calculate a simple moving average with a window size of 5 as follows:

```
(1 + 5 + 8 + 23 + 34) / 5 = 14.2
(5 + 8 + 23 + 34 + 28) / 5 = 19.6
(8 + 23 + 34 + 28 + 7) / 5 = 20
and so on...
```

For more information, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average).

You can use the `moving_avg` aggregation either to smooth out short-term fluctuations or to highlight longer-term trends or cycles in your time-series data.

Specify a small window size (for example, `window`: 10) that closely follows the data to smooth out small-scale fluctuations.
Alternatively, specify a larger window size (for example, `window`: 100) that lags behind the actual data by a substantial amount to smooth out all higher-frequency fluctuations or random noise, making lower-frequency trends more visible.
The following example nests a `moving_avg` aggregation into a `date_histogram` aggregation:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "sum_of_bytes": {
          "sum": { "field": "bytes" }
        },
        "moving_avg_of_sum_of_bytes": {
          "moving_avg": { "buckets_path": "sum_of_bytes" }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "my_date_histogram" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "sum_of_bytes" : {
          "value" : 3.8880434E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "sum_of_bytes" : {
          "value" : 3.1445055E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.4140317E7
        }
      }
    ]
  }
}
}
```

You can also use the `moving_avg` aggregation to predict future buckets.
To predict buckets, add the `predict` property and set it to the number of predictions that you want to see.

The following example adds five predictions to the preceding query:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "sum_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "moving_avg_of_sum_of_bytes": {
          "moving_avg": {
            "buckets_path": "sum_of_bytes",
            "predict": 5
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
"aggregations" : {
  "my_date_histogram" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "sum_of_bytes" : {
          "value" : 3.8880434E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "sum_of_bytes" : {
          "value" : 3.1445055E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.4140317E7
        }
      },
      {
        "key_as_string" : "2021-01-01T00:00:00.000Z",
        "key" : 1609459200000,
        "doc_count" : 0,
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.6575229666666668E7
        }
      },
      {
        "key_as_string" : "2021-02-01T00:00:00.000Z",
        "key" : 1612137600000,
        "doc_count" : 0,
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.6575229666666668E7
        }
      },
      {
        "key_as_string" : "2021-03-01T00:00:00.000Z",
        "key" : 1614556800000,
        "doc_count" : 0,
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.6575229666666668E7
        }
      },
      {
        "key_as_string" : "2021-04-01T00:00:00.000Z",
        "key" : 1617235200000,
        "doc_count" : 0,
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.6575229666666668E7
        }
      },
      {
        "key_as_string" : "2021-05-01T00:00:00.000Z",
        "key" : 1619827200000,
        "doc_count" : 0,
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.6575229666666668E7
        }
      }
    ]
  }
}
}
```

The `moving_avg` aggregation supports five models: `simple`, `linear`, `ewma` (exponentially weighted), `holt`, and `holt_winters`.
These models differ in how the values of the window are weighted. As data points become "older" (that is, the window slides away from them), they might be weighted differently. You can specify a model of your choice by setting the `model` property. The `model` property holds the name of the model and the `settings` object, which you can use to provide model properties. For more information on these models, see [Wikipedia](https://en.wikipedia.org/wiki/Moving_average).

A `simple` model first calculates the sum of all data points in the window, and then divides that sum by the size of the window. In other words, a `simple` model calculates a simple arithmetic mean for each window in your dataset.

The following example uses a `simple` model with a window size of 30:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "sum_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "moving_avg_of_sum_of_bytes": {
          "moving_avg": {
            "buckets_path": "sum_of_bytes",
            "window": 30,
            "model": "simple"
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "my_date_histogram" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "sum_of_bytes" : {
          "value" : 3.8880434E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "sum_of_bytes" : {
          "value" : 3.1445055E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.4140317E7
        }
      }
    ]
  }
}
}
```

The following example uses a `holt` model. The `alpha` and `beta` settings control how quickly the importance of older data points decays. The default value of `alpha` is 0.3 and the default value of `beta` is 0.1. You can specify any float value between 0 and 1, inclusive.

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "sum_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "moving_avg_of_sum_of_bytes": {
          "moving_avg": {
            "buckets_path": "sum_of_bytes",
            "model": "holt",
            "settings": {
              "alpha": 0.6,
              "beta": 0.4
            }
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "my_date_histogram" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "sum_of_bytes" : {
          "value" : 3.8880434E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "sum_of_bytes" : {
          "value" : 3.1445055E7
        },
        "moving_avg_of_sum_of_bytes" : {
          "value" : 2.70883404E7
        }
      }
    ]
  }
}
}
```
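The `ewma` (exponentially weighted moving average) model weights recent data points more heavily than older ones; its single `alpha` setting controls how quickly the weights decay. The following is a minimal sketch, assuming the same index and histogram as the preceding examples:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "sum_of_bytes": {
          "sum": {
            "field": "bytes"
          }
        },
        "moving_avg_of_sum_of_bytes": {
          "moving_avg": {
            "buckets_path": "sum_of_bytes",
            "model": "ewma",
            "settings": {
              "alpha": 0.5
            }
          }
        }
      }
    }
  }
}
```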
## serial_diff

The `serial_diff` aggregation is a parent pipeline aggregation that computes a series of value differences between the current bucket and the bucket a specified time lag earlier in a previous aggregation.

You can use the `serial_diff` aggregation to find the changes in the data between time periods instead of the whole value.

With the `lag` parameter (a positive, non-zero integer value), you can specify which previous bucket to subtract from the current one. If you don't specify the `lag` parameter, OpenSearch sets it to 1.

Let's say that the population of a city grows with time. If you use the serial differencing aggregation with a period of one day, you can see the daily growth instead of the total population. Similarly, you can compute a series of week-over-week or month-over-month changes of a metric such as a total price.

The following example computes, for each monthly bucket, the difference between the sum of bytes in that bucket and the sum in the bucket 30 positions earlier:

```json
GET opensearch_dashboards_sample_data_logs/_search
{
  "size": 0,
  "aggs": {
    "my_date_histogram": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "month"
      },
      "aggs": {
        "the_sum": {
          "sum": {
            "field": "bytes"
          }
        },
        "thirtieth_difference": {
          "serial_diff": {
            "buckets_path": "the_sum",
            "lag": 30
          }
        }
      }
    }
  }
}
```

#### Sample response

```json
...
"aggregations" : {
  "my_date_histogram" : {
    "buckets" : [
      {
        "key_as_string" : "2020-10-01T00:00:00.000Z",
        "key" : 1601510400000,
        "doc_count" : 1635,
        "the_sum" : {
          "value" : 9400200.0
        }
      },
      {
        "key_as_string" : "2020-11-01T00:00:00.000Z",
        "key" : 1604188800000,
        "doc_count" : 6844,
        "the_sum" : {
          "value" : 3.8880434E7
        }
      },
      {
        "key_as_string" : "2020-12-01T00:00:00.000Z",
        "key" : 1606780800000,
        "doc_count" : 5595,
        "the_sum" : {
          "value" : 3.1445055E7
        }
      }
    ]
  }
}
}
```

Because the sample data spans only three monthly buckets, no bucket has another bucket 30 positions behind it, so the response contains no `thirtieth_difference` values. With a `lag` of 1, each bucket after the first would show its month-over-month change.
diff --git a/docs/sql/endpoints.md b/docs/sql/endpoints.md
index 37cb33c0..2514234b 100644
--- a/docs/sql/endpoints.md
+++ b/docs/sql/endpoints.md
@@ -185,20 +185,20 @@ The `datarows` can have more than the `fetch_size` number of records in case the
       "cursor": "d:eyJhIjp7fSwicyI6IkRYRjFaWEo1UVc1a1JtVjBZMmdCQUFBQUFBQUFBQU1XZWpkdFRFRkZUMlpTZEZkeFdsWnJkRlZoYnpaeVVRPT0iLCJjIjpbeyJuYW1lIjoiZmlyc3RuYW1lIiwidHlwZSI6InRleHQifSx7Im5hbWUiOiJsYXN0bmFtZSIsInR5cGUiOiJ0ZXh0In1dLCJmIjo1LCJpIjoiYWNjb3VudHMabcde12345",
       "datarows": [
         [
-          "Abbas",
-          "Hussain"
+          "Abbey",
+          "Karen"
         ],
         [
           "Chen",
-          "Dai"
+          "Ken"
         ],
         [
-          "Anirudha",
-          "Jadhav"
+          "Ani",
+          "Jade"
         ],
         [
           "Peng",
-          "Huo"
+          "Hu"
         ],
         [
           "John",