diff --git a/docs/en/ml/configuring.asciidoc b/docs/en/ml/configuring.asciidoc index 41a0a19af06..342a04c98de 100644 --- a/docs/en/ml/configuring.asciidoc +++ b/docs/en/ml/configuring.asciidoc @@ -31,7 +31,9 @@ The scenarios in this section describe some best practices for generating useful * <> * <> * <> +* <> include::aggregations.asciidoc[] include::categories.asciidoc[] include::populations.asciidoc[] +include::transforms.asciidoc[] diff --git a/docs/en/ml/functions/geo.asciidoc b/docs/en/ml/functions/geo.asciidoc index 19161db5547..2c747f06107 100644 --- a/docs/en/ml/functions/geo.asciidoc +++ b/docs/en/ml/functions/geo.asciidoc @@ -74,5 +74,4 @@ format. For example, the following Painless script transforms } -------------------------------------------------- -For more information about `script_fields`, see -{ref}/ml-datafeed-resource.html[Datafeed Resources]. +For more information, see <>. diff --git a/docs/en/ml/images/ml-scriptfields.jpg b/docs/en/ml/images/ml-scriptfields.jpg new file mode 100644 index 00000000000..0c9150734c0 Binary files /dev/null and b/docs/en/ml/images/ml-scriptfields.jpg differ diff --git a/docs/en/ml/transforms.asciidoc b/docs/en/ml/transforms.asciidoc new file mode 100644 index 00000000000..9f8fcb40187 --- /dev/null +++ b/docs/en/ml/transforms.asciidoc @@ -0,0 +1,611 @@ +[[ml-configuring-transform]] +=== Transforming Data With Script Fields + +If you use {dfeeds}, you can add scripts to transform your data before +it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where +you can specify scripts that evaluate custom expressions and return script +fields. + +If your {dfeed} defines script fields, you can use those fields in your job. +For example, you can use the script fields in the analysis functions in one or +more detectors. + +* <> +* <> +* <> +* <> +* <> +* <> +* <> +* <> +* <> + +The following indices APIs create and add content to an index that is used in +subsequent examples: + +[source,js] +---------------------------------- +PUT /my_index +{ + "mappings":{ + "my_type":{ + "properties": { + "@timestamp": { + "type": "date" + }, + "aborted_count": { + "type": "long" + }, + "another_field": { + "type": "keyword" <1> + }, + "clientip": { + "type": "keyword" + }, + "coords": { + "properties": { + "lat": { + "type": "keyword" + }, + "lon": { + "type": "keyword" + } + } + }, + "error_count": { + "type": "long" + }, + "query": { + "type": "keyword" + }, + "some_field": { + "type": "keyword" + }, + "tokenstring1":{ + "type":"keyword" + }, + "tokenstring2":{ + "type":"keyword" + }, + "tokenstring3":{ + "type":"keyword" + } + } + } + } +} + +PUT /my_index/my_type/1 +{ + "@timestamp":"2017-03-23T13:00:00", + "error_count":36320, + "aborted_count":4156, + "some_field":"JOE", + "another_field":"SMITH ", + "tokenstring1":"foo-bar-baz", + "tokenstring2":"foo bar baz", + "tokenstring3":"foo-bar-19", + "query":"www.ml.elastic.co", + "clientip":"123.456.78.900", + "coords": { + "lat" : 41.44, + "lon":90.5 + } +} +---------------------------------- +// CONSOLE +// TESTSETUP +<1> In this example, string fields are mapped as `keyword` fields to support +aggregation. If you want both a full text (`text`) and a keyword (`keyword`) +version of the same field, use multi-fields. For more information, see +{ref}/multi-fields.html[fields]. + +[[ml-configuring-transform1]] +.Example 1: Adding two numerical fields +[source,js] +---------------------------------- +PUT _xpack/ml/anomaly_detectors/test1 +{ + "analysis_config":{ + "bucket_span": "10m", + "detectors":[ + { + "function":"mean", + "field_name": "total_error_count", <1> + "detector_description": "Custom script field transformation" + } + ] + }, + "data_description": { + "time_field":"@timestamp", + "time_format":"epoch_ms" + } +} + +PUT _xpack/ml/datafeeds/datafeed-test1 +{ + "job_id": "test1", + "indices": ["my_index"], + "types": ["my_type"], + "query": { + "match_all": { + "boost": 1 + } + }, + "script_fields": { + "total_error_count": { <2> + "script": { + "lang": "expression", + "inline": "doc['error_count'].value + doc['aborted_count'].value" + } + } + } +} +---------------------------------- +// CONSOLE +// TEST[continued] +<1> A script field named `total_error_count` is referenced in the detector +within the job. +<2> The script field is defined in the {dfeed}. + +This `test1` job contains a detector that uses a script field in a mean analysis +function. The `datafeed-test1` {dfeed} defines the script field. It contains a +script that adds two fields in the document to produce a "total" error count. + +The syntax for the `script_fields` property is identical to that used by {es}. +For more information, see {ref}/search-request-script-fields.html[Script Fields]. + +You can preview the contents of the {dfeed} by using the following API: + +[source,js] +---------------------------------- +GET _xpack/ml/datafeeds/datafeed-test1/_preview +---------------------------------- +// CONSOLE +// TEST[continued] + +In this example, the API returns the following results, which contain a sum of +the `error_count` and `aborted_count` values: + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "total_error_count": 40476 + } +] +---------------------------------- +// TESTRESPONSE + + +NOTE: This example demonstrates how to use script fields, but it contains +insufficient data to generate meaningful results. For a full demonstration of +how to create jobs with sample data, see <>. + +You can alternatively use {kib} to create an advanced job that uses script +fields. To add the `script_fields` property to your {dfeed}, you must use the +**Edit JSON** tab. For example: + +[role="screenshot"] +image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}] + +[[ml-configuring-transform-examples]] +==== Common Script Field Examples + +While the possibilities are limitless, there are a number of common scenarios +where you might use script fields in your {dfeeds}. + +[NOTE] +=============================== +Some of these examples use regular expressions. By default, regular +expressions are disabled because they circumvent the protection that Painless +provides against long running and memory hungry scripts. For more information, +see {ref}/modules-scripting-painless.html[Painless Scripting Language]. + +Machine learning analysis is case sensitive. For example, "John" is considered +to be different than "john". This is one reason you might consider using scripts +that convert your strings to upper or lowercase letters. +=============================== + +[[ml-configuring-transform2]] +.Example 2: Concatenating strings +[source,js] +-------------------------------------------------- +PUT _xpack/ml/anomaly_detectors/test2 +{ + "analysis_config":{ + "bucket_span": "10m", + "detectors":[ + { + "function":"low_info_content", + "field_name":"my_script_field", <1> + "detector_description": "Custom script field transformation" + } + ] + }, + "data_description": { + "time_field":"@timestamp", + "time_format":"epoch_ms" + } +} + +PUT _xpack/ml/datafeeds/datafeed-test2 +{ + "job_id": "test2", + "indices": ["my_index"], + "types": ["my_type"], + "query": { + "match_all": { + "boost": 1 + } + }, + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "doc['some_field'].value + '_' + doc['another_field'].value" <2> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> The script field has a rather generic name in this case, since it will +be used for various tests in the subsequent examples. +<2> The script field uses the plus (+) operator to concatenate strings. + +The preview {dfeed} API returns the following results, which show that "JOE" +and "SMITH " have been concatenated and an underscore was added: + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "JOE_SMITH " + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform3]] +.Example 3: Trimming strings +[source,js] +-------------------------------------------------- +POST _xpack/ml/datafeeds/datafeed-test2/_update +{ + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "doc['another_field'].value.trim()" <1> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> This script field uses the `trim()` function to trim extra white space from a +string. + +The preview {dfeed} API returns the following results, which show that "SMITH " +has been trimmed to "SMITH": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "SMITH" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform4]] +.Example 4: Converting strings to lowercase +[source,js] +-------------------------------------------------- +POST _xpack/ml/datafeeds/datafeed-test2/_update +{ + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "doc['some_field'].value.toLowerCase()" <1> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> This script field uses the `toLowerCase` function to convert a string to all +lowercase letters. Likewise, you can use the `toUpperCase{}` function to convert +a string to uppercase letters. + +The preview {dfeed} API returns the following results, which show that "JOE" +has been converted to "joe": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "joe" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform5]] +.Example 5: Converting strings to mixed case formats +[source,js] +-------------------------------------------------- +POST _xpack/ml/datafeeds/datafeed-test2/_update +{ + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> This script field is a more complicated example of case manipulation. It uses +the `subString()` function to capitalize the first letter of a string and +converts the remaining characters to lowercase. + +The preview {dfeed} API returns the following results, which show that "JOE" +has been converted to "Joe": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "Joe" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform6]] +.Example 6: Replacing tokens +[source,js] +-------------------------------------------------- +POST _xpack/ml/datafeeds/datafeed-test2/_update +{ + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> This script field uses regular expressions to replace white +space with underscores. + +The preview {dfeed} API returns the following results, which show that +"foo bar baz" has been converted to "foo_bar_baz": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "foo_bar_baz" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform7]] +.Example 7: Regular expression matching and concatenation +[source,js] +-------------------------------------------------- +POST _xpack/ml/datafeeds/datafeed-test2/_update +{ + "script_fields": { + "my_script_field": { + "script": { + "lang": "painless", + "inline": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1> + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test2/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] +<1> This script field looks for a specific regular expression pattern and emits the +matched groups as a concatenated string. If no match is found, it emits an empty +string. + +The preview {dfeed} API returns the following results, which show that +"foo-bar-19" has been converted to "foo_19": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_script_field": "foo_19" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform8]] +.Example 8: Splitting strings by domain name +[source,js] +-------------------------------------------------- +PUT _xpack/ml/anomaly_detectors/test3 +{ + "description":"DNS tunneling", + "analysis_config":{ + "bucket_span": "30m", + "influencers": ["clientip","hrd"], + "detectors":[ + { + "function":"high_info_content", + "field_name": "sub", + "over_field_name": "hrd", + "exclude_frequent":"all" + } + ] + }, + "data_description": { + "time_field":"@timestamp", + "time_format":"epoch_ms" + } +} + +PUT _xpack/ml/datafeeds/datafeed-test3 +{ + "job_id": "test3", + "indices": ["my_index"], + "types": ["my_type"], + "query": { + "match_all": { + "boost": 1 + } + }, + "script_fields":{ + "sub":{ + "script":"return domainSplit(doc['query'].value, params).get(0);" + }, + "hrd":{ + "script":"return domainSplit(doc['query'].value, params).get(1);" + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test3/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +If you have a single field that contains a well-formed DNS domain name, you can +use the `domainSplit()` function to split the string into its highest registered +domain and the sub-domain, which is everything to the left of the highest +registered domain. For example, the highest registered domain of +`www.ml.elastic.co` is `elastic.co` and the sub-domain is `www.ml`. The +`domainSplit()` function returns an array of two values: the first value is the +subdomain; the second value is the highest registered domain. + +NOTE: The `domainSplit()` function takes two arguments. The first argument is +the string you want to split. The second argument is always `params`. This is a +technical implementation detail related to how Painless operates internally. + +The preview {dfeed} API returns the following results, which show that +"www.ml.elastic.co" has been split into "elastic.co" and "www.ml": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "clientip.keyword": "123.456.78.900", + "hrd": "elastic.co", + "sub": "www.ml" + } +] +---------------------------------- +// TESTRESPONSE + +[[ml-configuring-transform9]] +.Example 9: Transforming geo_point data +[source,js] +-------------------------------------------------- +PUT _xpack/ml/anomaly_detectors/test4 +{ + "analysis_config":{ + "bucket_span": "10m", + "detectors":[ + { + "function":"lat_long", + "field_name": "my_coordinates" + } + ] + }, + "data_description": { + "time_field":"@timestamp", + "time_format":"epoch_ms" + } +} + +PUT _xpack/ml/datafeeds/datafeed-test4 +{ + "job_id": "test4", + "indices": ["my_index"], + "types": ["my_type"], + "query": { + "match_all": { + "boost": 1 + } + }, + "script_fields": { + "my_coordinates": { + "script": { + "inline": "doc['coords.lat'].value + ',' + doc['coords.lon'].value", + "lang": "painless" + } + } + } +} + +GET _xpack/ml/datafeeds/datafeed-test4/_preview +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +In {es}, location data can be stored in `geo_point` fields but this data type is +not supported natively in {xpackml} analytics. This example of a script field +transforms the data into an appropriate format. For more information, +see <>. + +The preview {dfeed} API returns the following results, which show that +`41.44` and `90.5` have been combined into "41.44,90.5": + +[source,js] +---------------------------------- +[ + { + "@timestamp": 1490274000000, + "my_coordinates": "41.44,90.5" + } +] +---------------------------------- +// TESTRESPONSE + +//// +==== Configuring Script Fields in {dfeeds-cap} + +//TO-DO: Add Kibana steps from +//https://github.com/elastic/prelert-legacy/wiki/Transforming-data-with-script_fields#transforming-geo_point-data-to-a-workable-string-format +//// diff --git a/docs/en/rest-api/ml/datafeedresource.asciidoc b/docs/en/rest-api/ml/datafeedresource.asciidoc index 65d3af78176..17c6ee7716b 100644 --- a/docs/en/rest-api/ml/datafeedresource.asciidoc +++ b/docs/en/rest-api/ml/datafeedresource.asciidoc @@ -48,24 +48,8 @@ A {dfeed} resource has the following properties: script fields to the {dfeed}. The <> in a job can contain functions that use these script fields. - For more information, see {ref}/search-request-script-fields.html[Script Fields]. - For example: -+ --- -[source,js] ----------------------------------- -{ - "script_fields": { - "total_error_count": { - "script": { - "lang": "painless", - "source": "doc['error_count'].value + doc['aborted_count'].value" - } - } - } -} ----------------------------------- --- + For more information, see + {xpack-ref}/ml-configuring-transform.html[Transforming Data With Script Fields]. `scroll_size`:: (unsigned integer) The `size` parameter that is used in {es} searches.