183 lines
5.2 KiB
Plaintext
183 lines
5.2 KiB
Plaintext
[role="xpack"]
|
|
[testenv="basic"]
|
|
[[search-aggregations-pipeline-inference-bucket-aggregation]]
|
|
=== {infer-cap} Bucket Aggregation
|
|
|
|
experimental::[]
|
|
|
|
A parent pipeline aggregation which loads a pre-trained model and performs
|
|
{infer} on the collated result fields from the parent bucket aggregation.
|
|
|
|
To use the {infer} bucket aggregation, you need to have the same security
|
|
privileges that are required for using the <<get-trained-models>>.
|
|
|
|
[[inference-bucket-agg-syntax]]
|
|
==== Syntax
|
|
|
|
An `inference` aggregation looks like this in isolation:
|
|
|
|
[source,js]
|
|
--------------------------------------------------
|
|
{
|
|
"inference": {
|
|
"model_id": "a_model_for_inference", <1>
|
|
"inference_config": { <2>
|
|
"regression_config": {
|
|
"num_top_feature_importance_values": 2
|
|
}
|
|
},
|
|
"buckets_path": {
|
|
"avg_cost": "avg_agg", <3>
|
|
"max_cost": "max_agg"
|
|
}
|
|
}
|
|
}
|
|
--------------------------------------------------
|
|
// NOTCONSOLE
|
|
<1> The ID of the model to use.
|
|
<2> The optional inference config which overrides the model's default settings.
|
|
<3> Maps the value of `avg_agg` to the model's input field `avg_cost`.
|
|
|
|
|
|
[[inference-bucket-params]]
|
|
.`inference` Parameters
|
|
[options="header"]
|
|
|===
|
|
|Parameter Name |Description |Required |Default Value
|
|
| `model_id` | The ID of the model to load and infer against | Required | -
|
|
| `inference_config` | Contains the inference type and its options. There are two types: <<inference-agg-regression-opt,`regression`>> and <<inference-agg-classification-opt,`classification`>> | Optional | -
|
|
| `buckets_path` | Defines the paths to the input aggregations and maps the aggregation names to the field names expected by the model.
|
|
See <<buckets-path-syntax>> for more details | Required | -
|
|
|===
|
|
|
|
|
|
==== Configuration options for {infer} models
|
|
|
|
The `inference_config` setting is optional and usually isn't required as the
|
|
pre-trained models come equipped with sensible defaults. In the context of
|
|
aggregations, some options can be overridden for each of the two types of model.
|
|
|
|
[discrete]
|
|
[[inference-agg-regression-opt]]
|
|
===== Configuration options for {regression} models
|
|
|
|
`num_top_feature_importance_values`::
|
|
(Optional, integer)
|
|
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-regression-num-top-feature-importance-values]
|
|
|
|
[discrete]
|
|
[[inference-agg-classification-opt]]
|
|
===== Configuration options for {classification} models
|
|
|
|
`num_top_classes`::
|
|
(Optional, integer)
|
|
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-classification-num-top-classes]
|
|
|
|
`num_top_feature_importance_values`::
|
|
(Optional, integer)
|
|
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-classification-num-top-feature-importance-values]
|
|
|
|
`prediction_field_type`::
|
|
(Optional, string)
|
|
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-classification-prediction-field-type]
|
|
|
|
|
|
[[inference-bucket-agg-example]]
|
|
==== Example
|
|
|
|
The following snippet aggregates a web log by `client_ip` and extracts a number
|
|
of features via metric and bucket sub-aggregations as input to the {infer}
|
|
aggregation configured with a model trained to identify suspicious client IPs:
|
|
|
|
[source,console]
|
|
-------------------------------------------------
|
|
GET kibana_sample_data_logs/_search
|
|
{
|
|
"size": 0,
|
|
"aggs": {
|
|
"client_ip": { <1>
|
|
"composite": {
|
|
"sources": [
|
|
{
|
|
"client_ip": {
|
|
"terms": {
|
|
"field": "clientip"
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"aggs": { <2>
|
|
"url_dc": {
|
|
"cardinality": {
|
|
"field": "url.keyword"
|
|
}
|
|
},
|
|
"bytes_sum": {
|
|
"sum": {
|
|
"field": "bytes"
|
|
}
|
|
},
|
|
"geo_src_dc": {
|
|
"cardinality": {
|
|
"field": "geo.src"
|
|
}
|
|
},
|
|
"geo_dest_dc": {
|
|
"cardinality": {
|
|
"field": "geo.dest"
|
|
}
|
|
},
|
|
"responses_total": {
|
|
"value_count": {
|
|
"field": "timestamp"
|
|
}
|
|
},
|
|
"success": {
|
|
"filter": {
|
|
"term": {
|
|
"response": "200"
|
|
}
|
|
}
|
|
},
|
|
"error404": {
|
|
"filter": {
|
|
"term": {
|
|
"response": "404"
|
|
}
|
|
}
|
|
},
|
|
"error503": {
|
|
"filter": {
|
|
"term": {
|
|
"response": "503"
|
|
}
|
|
}
|
|
},
|
|
"malicious_client_ip": { <3>
|
|
"inference": {
|
|
"model_id": "malicious_clients_model",
|
|
"buckets_path": {
|
|
"response_count": "responses_total",
|
|
"url_dc": "url_dc",
|
|
"bytes_sum": "bytes_sum",
|
|
"geo_src_dc": "geo_src_dc",
|
|
"geo_dest_dc": "geo_dest_dc",
|
|
"success": "success._count",
|
|
"error404": "error404._count",
|
|
"error503": "error503._count"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-------------------------------------------------
|
|
// TEST[skip:setup kibana sample data]
|
|
|
|
<1> A composite bucket aggregation that aggregates the data by `client_ip`.
|
|
<2> A series of metrics and bucket sub-aggregations.
|
|
<3> {infer-cap} bucket aggregation that contains the model ID and maps the
|
|
aggregation names to the model's input fields.
|