migrate master branch for xpack

Ryan Ernst 2018-04-20 15:29:55 -07:00
commit cf607e79d7
3887 changed files with 503108 additions and 0 deletions


@@ -0,0 +1,8 @@
# This file is used with all of the non-matrix tests in Jenkins.
# This .properties file defines the versions of Java with which to
# build and test Elasticsearch for this branch. Valid Java versions
# are 'java' or 'openjdk' followed by the major release number.
ES_BUILD_JAVA=java10
ES_RUNTIME_JAVA=java8


@@ -0,0 +1,2 @@
ES_BUILD_JAVA:
- java10


@@ -0,0 +1 @@
exclude:


@@ -0,0 +1,3 @@
ES_RUNTIME_JAVA:
- java8
- java10

88
x-pack/.dir-locals.el Normal file

@@ -0,0 +1,88 @@
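;; Directory-local Emacs settings: tune cc-mode so Java code in x-pack follows the
;; project's indentation conventions (4-space basic offset, 140-column fill rules).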
((java-mode
.
((eval
.
(progn
(defun my/point-in-defun-declaration-p ()
(let ((bod (save-excursion (c-beginning-of-defun)
(point))))
(<= bod
(point)
(save-excursion (goto-char bod)
(re-search-forward "{")
(point)))))
(defun my/is-string-concatenation-p ()
"Returns true if the previous line is a string concatenation"
(save-excursion
(let ((start (point)))
(forward-line -1)
(if (re-search-forward " \\\+$" start t) t nil))))
(defun my/inside-java-lambda-p ()
"Returns true if point is the first statement inside of a lambda"
(save-excursion
(c-beginning-of-statement-1)
(let ((start (point)))
(forward-line -1)
(if (search-forward " -> {" start t) t nil))))
(defun my/trailing-paren-p ()
"Returns true if point is a training paren and semicolon"
(save-excursion
(end-of-line)
(let ((endpoint (point)))
(beginning-of-line)
(if (re-search-forward "[ ]*);$" endpoint t) t nil))))
(defun my/prev-line-call-with-no-args-p ()
"Return true if the previous line is a function call with no arguments"
(save-excursion
(let ((start (point)))
(forward-line -1)
(if (re-search-forward ".($" start t) t nil))))
(defun my/arglist-cont-nonempty-indentation (arg)
(if (my/inside-java-lambda-p)
'+
(if (my/is-string-concatenation-p)
16
(unless (my/point-in-defun-declaration-p) '++))))
(defun my/statement-block-intro (arg)
(if (and (c-at-statement-start-p) (my/inside-java-lambda-p)) 0 '+))
(defun my/block-close (arg)
(if (my/inside-java-lambda-p) '- 0))
(defun my/arglist-close (arg) (if (my/trailing-paren-p) 0 '--))
(defun my/arglist-intro (arg)
(if (my/prev-line-call-with-no-args-p) '++ 0))
(c-set-offset 'inline-open 0)
(c-set-offset 'topmost-intro-cont '+)
(c-set-offset 'statement-block-intro 'my/statement-block-intro)
(c-set-offset 'block-close 'my/block-close)
(c-set-offset 'knr-argdecl-intro '+)
(c-set-offset 'substatement-open '+)
(c-set-offset 'substatement-label '+)
(c-set-offset 'case-label '+)
(c-set-offset 'label '+)
(c-set-offset 'statement-case-open '+)
(c-set-offset 'statement-cont '++)
(c-set-offset 'arglist-intro 'my/arglist-intro)
(c-set-offset 'arglist-cont-nonempty '(my/arglist-cont-nonempty-indentation c-lineup-arglist))
(c-set-offset 'arglist-close 'my/arglist-close)
(c-set-offset 'inexpr-class 0)
(c-set-offset 'access-label 0)
(c-set-offset 'inher-intro '++)
(c-set-offset 'inher-cont '++)
(c-set-offset 'brace-list-intro '+)
(c-set-offset 'func-decl-cont '++)
))
(c-basic-offset . 4)
(c-comment-only-line-offset . (0 . 0))
(fill-column . 140)
(fci-rule-column . 140)
(compile-command . "gradle compileTestJava"))))

6
x-pack/.github/ISSUE_TEMPLATE.md vendored Normal file

@@ -0,0 +1,6 @@
<!--
Please do not submit any issues related to security vulnerabilities that
could be exploited by an attacker. Instead, send an email to
security@elastic.co. If you have any doubts, send an email to
security@elastic.co.
-->

54
x-pack/.gitignore vendored Normal file

@@ -0,0 +1,54 @@
.idea/
.gradle/
*.iml
*.ipr
*.iws
work/
/data/
logs/
.DS_Store
build/
build-idea/
build-eclipse/
generated-resources/
target/
*-execution-hints.log
docs/html/
docs/build.log
npm-debug.log
/tmp/
backwards/
html_docs
.vagrant/
vendor/
.bundle
Gemfile.lock
## eclipse ignores (use 'mvn eclipse:eclipse' to build eclipse projects)
## All files (.project, .classpath, .settings/*) should be generated through Maven which
## will correctly set the classpath based on the declared dependencies and write settings
## files to ensure common coding style across Eclipse and IDEA.
.project
.classpath
eclipse-build
*/.project
*/.classpath
*/eclipse-build
.settings
!/.settings/org.eclipse.core.resources.prefs
!/.settings/org.eclipse.jdt.core.prefs
!/.settings/org.eclipse.jdt.ui.prefs
## netbeans ignores
nb-configuration.xml
nbactions.xml
dependency-reduced-pom.xml
github.token
## ignore attachment files
.local-*
*/.local-*
## ignore antlr temporary files used by vscode-antlr4
.antlr

8
x-pack/.projectile Normal file

@@ -0,0 +1,8 @@
-/target
-/license/target
-/marvel/target
-/qa/target
-/shield/target
-/watcher/target
-/x-dev-tools/target
-*.class

223
x-pack/LICENSE.txt Normal file

@@ -0,0 +1,223 @@
ELASTIC LICENSE AGREEMENT
PLEASE READ CAREFULLY THIS ELASTIC LICENSE AGREEMENT (THIS "AGREEMENT"), WHICH
CONSTITUTES A LEGALLY BINDING AGREEMENT AND GOVERNS ALL OF YOUR USE OF ALL OF
THE ELASTIC SOFTWARE WITH WHICH THIS AGREEMENT IS INCLUDED ("ELASTIC SOFTWARE")
THAT IS PROVIDED IN OBJECT CODE FORMAT, AND, IN ACCORDANCE WITH SECTION 2 BELOW,
CERTAIN OF THE ELASTIC SOFTWARE THAT IS PROVIDED IN SOURCE CODE FORMAT. BY
INSTALLING OR USING ANY OF THE ELASTIC SOFTWARE GOVERNED BY THIS AGREEMENT, YOU
ARE ASSENTING TO THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE
WITH SUCH TERMS AND CONDITIONS, YOU MAY NOT INSTALL OR USE THE ELASTIC SOFTWARE
GOVERNED BY THIS AGREEMENT. IF YOU ARE INSTALLING OR USING THE SOFTWARE ON
BEHALF OF A LEGAL ENTITY, YOU REPRESENT AND WARRANT THAT YOU HAVE THE ACTUAL
AUTHORITY TO AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT ON BEHALF OF
SUCH ENTITY.
Posted Date: April 20, 2018
This Agreement is entered into by and between Elasticsearch BV ("Elastic") and
You, or the legal entity on behalf of whom You are acting (as applicable,
"You").
1. OBJECT CODE END USER LICENSES, RESTRICTIONS AND THIRD PARTY OPEN SOURCE
SOFTWARE
1.1 Object Code End User License. Subject to the terms and conditions of
Section 1.2 of this Agreement, Elastic hereby grants to You, AT NO CHARGE and
for so long as you are not in breach of any provision of this Agreement, a
License to the Basic Features and Functions of the Elastic Software.
1.2 Reservation of Rights; Restrictions. As between Elastic and You, Elastic
and its licensors own all right, title and interest in and to the Elastic
Software, and except as expressly set forth in Sections 1.1, and 2.1 of this
Agreement, no other license to the Elastic Software is granted to You under
this Agreement, by implication, estoppel or otherwise. You agree not to: (i)
reverse engineer or decompile, decrypt, disassemble or otherwise reduce any
Elastic Software provided to You in Object Code, or any portion thereof, to
Source Code, except and only to the extent any such restriction is prohibited
by applicable law, (ii) except as expressly permitted in this Agreement,
prepare derivative works from, modify, copy or use the Elastic Software Object
Code or the Commercial Software Source Code in any manner; (iii) except as
expressly permitted in Section 1.1 above, transfer, sell, rent, lease,
distribute, sublicense, loan or otherwise transfer, Elastic Software Object
Code, in whole or in part, to any third party; (iv) use Elastic Software
Object Code for providing time-sharing services, any software-as-a-service,
service bureau services or as part of an application services provider or
other service offering (collectively, "SaaS Offering") where obtaining access
to the Elastic Software or the features and functions of the Elastic Software
is a primary reason or substantial motivation for users of the SaaS Offering
to access and/or use the SaaS Offering ("Prohibited SaaS Offering"); (v)
circumvent the limitations on use of Elastic Software provided to You in
Object Code format that are imposed or preserved by any License Key, or (vi)
alter or remove any Marks and Notices in the Elastic Software. If You have any
question as to whether a specific SaaS Offering constitutes a Prohibited SaaS
Offering, or are interested in obtaining Elastic's permission to engage in
commercial or non-commercial distribution of the Elastic Software, please
contact elastic_license@elastic.co.
1.3 Third Party Open Source Software. The Commercial Software may contain or
be provided with third party open source libraries, components, utilities and
other open source software (collectively, "Open Source Software"), which Open
Source Software may have applicable license terms as identified on a website
designated by Elastic. Notwithstanding anything to the contrary herein, use of
the Open Source Software shall be subject to the license terms and conditions
applicable to such Open Source Software, to the extent required by the
applicable licensor (which terms shall not restrict the license rights granted
to You hereunder, but may contain additional rights). To the extent any
condition of this Agreement conflicts with any license to the Open Source
Software, the Open Source Software license will govern with respect to such
Open Source Software only. Elastic may also separately provide you with
certain open source software that is licensed by Elastic. Your use of such
Elastic open source software will not be governed by this Agreement, but by
the applicable open source license terms.
2. COMMERCIAL SOFTWARE SOURCE CODE
2.1 Limited License. Subject to the terms and conditions of Section 2.2 of
this Agreement, Elastic hereby grants to You, AT NO CHARGE and for so long as
you are not in breach of any provision of this Agreement, a limited,
non-exclusive, non-transferable, fully paid up royalty free right and license
to the Commercial Software in Source Code format, without the right to grant
or authorize sublicenses, to prepare Derivative Works of the Commercial
Software, provided You (i) do not hack the licensing mechanism, or otherwise
circumvent the intended limitations on the use of Elastic Software to enable
features other than Basic Features and Functions or those features You are
entitled to as part of a Subscription, and (ii) use the resulting object code
only for reasonable testing purposes.
2.2 Restrictions. Nothing in Section 2.1 grants You the right to (i) use the
Commercial Software Source Code other than in accordance with Section 2.1
above, (ii) use a Derivative Work of the Commercial Software outside of a
Non-production Environment, in any production capacity, on a temporary or
permanent basis, or (iii) transfer, sell, rent, lease, distribute, sublicense,
loan or otherwise make available the Commercial Software Source Code, in whole
or in part, to any third party. Notwithstanding the foregoing, You may
maintain a copy of the repository in which the Source Code of the Commercial
Software resides and that copy may be publicly accessible, provided that you
include this Agreement with Your copy of the repository.
3. TERMINATION
3.1 Termination. This Agreement will automatically terminate, whether or not
You receive notice of such Termination from Elastic, if You breach any of its
provisions.
3.2 Post Termination. Upon any termination of this Agreement, for any reason,
You shall promptly cease the use of the Elastic Software in Object Code format
and cease use of the Commercial Software in Source Code format. For the
avoidance of doubt, termination of this Agreement will not affect Your right
to use Elastic Software, in either Object Code or Source Code formats, made
available under the Apache License Version 2.0.
3.3 Survival. Sections 1.2, 2.2. 3.3, 4 and 5 shall survive any termination or
expiration of this Agreement.
4. DISCLAIMER OF WARRANTIES AND LIMITATION OF LIABILITY
4.1 Disclaimer of Warranties. TO THE MAXIMUM EXTENT PERMITTED UNDER APPLICABLE
LAW, THE ELASTIC SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
AND ELASTIC AND ITS LICENSORS MAKE NO WARRANTIES WHETHER EXPRESSED, IMPLIED OR
STATUTORY REGARDING OR RELATING TO THE ELASTIC SOFTWARE. TO THE MAXIMUM EXTENT
PERMITTED UNDER APPLICABLE LAW, ELASTIC AND ITS LICENSORS SPECIFICALLY
DISCLAIM ALL IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NON-INFRINGEMENT WITH RESPECT TO THE ELASTIC SOFTWARE, AND WITH
RESPECT TO THE USE OF THE FOREGOING. FURTHER, ELASTIC DOES NOT WARRANT RESULTS
OF USE OR THAT THE ELASTIC SOFTWARE WILL BE ERROR FREE OR THAT THE USE OF THE
ELASTIC SOFTWARE WILL BE UNINTERRUPTED.
4.2 Limitation of Liability. IN NO EVENT SHALL ELASTIC OR ITS LICENSORS BE
LIABLE TO YOU OR ANY THIRD PARTY FOR ANY DIRECT OR INDIRECT DAMAGES,
INCLUDING, WITHOUT LIMITATION, FOR ANY LOSS OF PROFITS, LOSS OF USE, BUSINESS
INTERRUPTION, LOSS OF DATA, COST OF SUBSTITUTE GOODS OR SERVICES, OR FOR ANY
SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, IN CONNECTION WITH
OR ARISING OUT OF THE USE OR INABILITY TO USE THE ELASTIC SOFTWARE, OR THE
PERFORMANCE OF OR FAILURE TO PERFORM THIS AGREEMENT, WHETHER ALLEGED AS A
BREACH OF CONTRACT OR TORTIOUS CONDUCT, INCLUDING NEGLIGENCE, EVEN IF ELASTIC
HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
5. MISCELLANEOUS
This Agreement completely and exclusively states the entire agreement of the
parties regarding the subject matter herein, and it supersedes, and its terms
govern, all prior proposals, agreements, or other communications between the
parties, oral or written, regarding such subject matter. This Agreement may be
modified by Elastic from time to time, and any such modifications will be
effective upon the "Posted Date" set forth at the top of the modified
Agreement. If any provision hereof is held unenforceable, this Agreement will
continue without said provision and be interpreted to reflect the original
intent of the parties. This Agreement and any non-contractual obligation
arising out of or in connection with it, is governed exclusively by Dutch law.
This Agreement shall not be governed by the 1980 UN Convention on Contracts
for the International Sale of Goods. All disputes arising out of or in
connection with this Agreement, including its existence and validity, shall be
resolved by the courts with jurisdiction in Amsterdam, The Netherlands, except
where mandatory law provides for the courts at another location in The
Netherlands to have jurisdiction. The parties hereby irrevocably waive any and
all claims and defenses either might otherwise have in any such action or
proceeding in any of such courts based upon any alleged lack of personal
jurisdiction, improper venue, forum non conveniens or any similar claim or
defense. A breach or threatened breach, by You of Section 2 may cause
irreparable harm for which damages at law may not provide adequate relief, and
therefore Elastic shall be entitled to seek injunctive relief without being
required to post a bond. You may not assign this Agreement (including by
operation of law in connection with a merger or acquisition), in whole or in
part to any third party without the prior written consent of Elastic, which
may be withheld or granted by Elastic in its sole and absolute discretion.
Any assignment in violation of the preceding sentence is void. Notices to
Elastic may also be sent to legal@elastic.co.
6. DEFINITIONS
The following terms have the meanings ascribed:
6.1 "Affiliate" means, with respect to a party, any entity that controls, is
controlled by, or which is under common control with, such party, where
"control" means ownership of at least fifty percent (50%) of the outstanding
voting shares of the entity, or the contractual right to establish policy for,
and manage the operations of, the entity.
6.2 "Basic Features and Functions" means those features and functions of the
Elastic Software that are eligible for use under a Basic license, as set forth
at https://www.elastic.co/subscriptions, as may be modified by Elastic from
time to time.
6.3 "Commercial Software" means the Elastic Software Source Code in any file
containing a header stating the contents are subject to the Elastic License or
which is contained in the repository folder labeled "x-pack", unless a LICENSE
file present in the directory subtree declares a different license.
6.4 "Derivative Work of the Commercial Software" means, for purposes of this
Agreement, any modification(s) or enhancement(s) to the Commercial Software,
which represent, as a whole, an original work of authorship.
6.5 "License" means a limited, non-exclusive, non-transferable, fully paid up,
royalty free, right and license, without the right to grant or authorize
sublicenses, solely for Your internal business operations to (i) install and
use the applicable Features and Functions of the Elastic Software in Object
Code, and (ii) permit Contractors and Your Affiliates to use the Elastic
software as set forth in (i) above, provided that such use by Contractors must
be solely for Your benefit and/or the benefit of Your Affiliates, and You
shall be responsible for all acts and omissions of such Contractors and
Affiliates in connection with their use of the Elastic software that are
contrary to the terms and conditions of this Agreement.
6.6 "License Key" means a sequence of bytes, including but not limited to a
JSON blob, that is used to enable certain features and functions of the
Elastic Software.
6.7 "Marks and Notices" means all Elastic trademarks, trade names, logos and
notices present on the Documentation as originally provided by Elastic.
6.8 "Non-production Environment" means an environment for development, testing
or quality assurance, where software is not used for production purposes.
6.9 "Object Code" means any form resulting from mechanical transformation or
translation of Source Code form, including but not limited to compiled object
code, generated documentation, and conversions to other media types.
6.10 "Source Code" means the preferred form of computer software for making
modifications, including but not limited to software source code,
documentation source, and configuration files.
6.11 "Subscription" means the right to receive Support Services and a License
to the Commercial Software.

2
x-pack/NOTICE.txt Normal file

@@ -0,0 +1,2 @@
Elasticsearch X-Pack
Copyright 2009-2017 Elasticsearch

120
x-pack/README.asciidoc Normal file

@@ -0,0 +1,120 @@
= Elasticsearch X-Pack
A set of Elastic's commercial plugins for Elasticsearch:
- License
- Security
- Watcher
- Monitoring
- Machine Learning
- Graph
= Setup
You must check out `x-pack-elasticsearch` and `elasticsearch` in a specific directory structure. The
`elasticsearch` checkout is used when building `x-pack-elasticsearch`. The structure is:
- /path/to/elastic/elasticsearch
- /path/to/elastic/elasticsearch-extra/x-pack-elasticsearch
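For example, starting from an empty `/path/to/elastic` directory, the layout above can be created roughly as follows (a sketch only; it assumes you have access to the private `x-pack-elasticsearch` repository and clone both repositories over HTTPS):
```
$ cd /path/to/elastic
$ git clone https://github.com/elastic/elasticsearch.git
$ mkdir elasticsearch-extra && cd elasticsearch-extra
$ git clone https://github.com/elastic/x-pack-elasticsearch.git
```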
== Vault Secret
The build requires a Vault Secret ID. You can use a GitHub token by following these steps:
1. Go to https://github.com/settings/tokens
2. Click *Generate new token*
3. Set permissions to `read:org`
4. Copy the token into `~/.elastic/github.token`
5. Set the token's file permissions to `600`
```
$ mkdir ~/.elastic
$ vi ~/.elastic/github.token
# Add your_token exactly as it is into the file and save it
$ chmod 600 ~/.elastic/github.token
```
If you do not create the token, the X-Pack build fails with an error along these lines:
```
* What went wrong:
Missing ~/.elastic/github.token file or VAULT_SECRET_ID environment variable, needed to authenticate with vault for secrets
```
=== Offline Mode
When running the build in offline mode (`--offline`), the vault secret is not required.
== Native Code
**This is mandatory as tests depend on it**
Machine Learning requires platform-specific binaries, built from https://github.com/elastic/ml-cpp via CI servers.
= Build
- Run unit tests:
+
[source, txt]
-----
gradle clean test
-----
- Run all tests:
+
[source, txt]
-----
gradle clean check
-----
- Run integration tests:
+
[source, txt]
-----
gradle clean integTest
-----
- Package X-Pack (without running tests)
+
[source, txt]
-----
gradle clean assemble
-----
- Install X-Pack (without running tests)
+
[source, txt]
-----
gradle clean install
-----
= Building documentation
The source files in this repository can be included in either the X-Pack
Reference or the Elasticsearch Reference.
NOTE: In 5.4 and later, the Elasticsearch Reference includes X-Pack-specific
content that is pulled from this repo.
To build the Elasticsearch Reference on your local machine, use the `docbldes`
or `docbldesx` build commands defined in
https://github.com/elastic/docs/blob/master/doc_build_aliases.sh
== Adding Images
When you include an image in the documentation, specify the path relative to the
location of the asciidoc file. By convention, we put images in an `images`
subdirectory.
For example, to insert `watcher-ui-edit-watch.png` in `watcher/limitations.asciidoc`:
. Add an `images` subdirectory to the watcher directory if it doesn't already exist.
. In `limitations.asciidoc` specify:
+
[source, txt]
-----
image::images/watcher-ui-edit-watch.png["Editing a watch"]
-----
Please note that image names and anchor IDs must be unique within the book, so
do not use generic identifiers.

86
x-pack/build.gradle Normal file

@@ -0,0 +1,86 @@
import org.elasticsearch.gradle.BuildPlugin
import org.elasticsearch.gradle.plugin.PluginBuildPlugin
import org.elasticsearch.gradle.Version
import org.elasticsearch.gradle.precommit.LicenseHeadersTask
if (project.projectDir.name != 'x-pack-elasticsearch') {
throw new GradleException('You must checkout x-pack-elasticsearch in the following directory: <path to Elasticsearch checkout>/../elasticsearch-extra/x-pack-elasticsearch')
}
task wrapper(type: Wrapper)
Project xpackRootProject = project
subprojects {
group = 'org.elasticsearch.plugin'
ext.xpackRootProject = xpackRootProject
ext.xpackProject = { String projectName -> xpackRootProject.project(projectName) }
// helper method to find the path to a module
ext.xpackModule = { String moduleName -> xpackProject("plugin:${moduleName}").path }
plugins.withType(MavenPublishPlugin).whenPluginAdded {
publishing {
publications {
// add license information to generated poms
all {
pom.withXml { XmlProvider xml ->
Node node = xml.asNode()
Node license = node.appendNode('licenses').appendNode('license')
license.appendNode('name', 'Elastic Commercial Software End User License Agreement')
license.appendNode('url', 'https://www.elastic.co/eula/')
license.appendNode('distribution', 'repo')
Node developer = node.appendNode('developers').appendNode('developer')
developer.appendNode('name', 'Elastic')
developer.appendNode('url', 'http://www.elastic.co')
}
}
}
}
}
plugins.withType(BuildPlugin).whenPluginAdded {
project.licenseFile = xpackRootProject.file('LICENSE.txt')
project.noticeFile = xpackRootProject.file('NOTICE.txt')
}
plugins.withType(PluginBuildPlugin).whenPluginAdded {
project.esplugin.licenseFile = xpackRootProject.file('LICENSE.txt')
project.esplugin.noticeFile = xpackRootProject.file('NOTICE.txt')
}
}
File checkstyleSuppressions = file('dev-tools/checkstyle_suppressions.xml')
subprojects {
tasks.withType(Checkstyle) {
inputs.file(checkstyleSuppressions)
// Use x-pack-elasticsearch specific suppressions file rather than the open source one.
configProperties = [
suppressions: checkstyleSuppressions
]
}
tasks.withType(LicenseHeadersTask.class) {
approvedLicenses = ['Elasticsearch Confidential', 'Generated']
additionalLicense 'ESCON', 'Elasticsearch Confidential', 'ELASTICSEARCH CONFIDENTIAL'
}
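// Substitute the published x-pack artifact coordinates with their local project paths so other projects resolve against the in-tree sources.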
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-core:${version}": xpackModule('core')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-deprecation:${version}": xpackModule('deprecation')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-graph:${version}": xpackModule('graph')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-logstash:${version}": xpackModule('logstash')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-ml:${version}": xpackModule('ml')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-monitoring:${version}": xpackModule('monitoring')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-security:${version}": xpackModule('security')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-upgrade:${version}": xpackModule('upgrade')]
ext.projectSubstitutions += [ "org.elasticsearch.plugin:x-pack-watcher:${version}": xpackModule('watcher')]
bwcVersions.snapshotProjectNames.each { snapshotName ->
Version snapshot = bwcVersions.getSnapshotForProject(snapshotName)
if (snapshot != null && snapshot.onOrAfter("6.3.0")) {
String snapshotProject = ":x-pack-elasticsearch:plugin:bwc:${snapshotName}"
project(snapshotProject).ext.bwcVersion = snapshot
ext.projectSubstitutions["org.elasticsearch.plugin:x-pack:${snapshot}"] = snapshotProject
}
}
}


@@ -0,0 +1,10 @@
File extrasDir = new File(settingsDir, '../..').getCanonicalFile()
if (extrasDir.name.endsWith('-extra') == false) {
throw new GradleException("x-pack-elasticsearch must be checked out under an elasticsearch-extra directory, found ${extrasDir.name}")
}
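// Strip the trailing '-extra' (6 characters) from the directory name to locate the sibling elasticsearch checkout.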
File elasticsearchDir = new File(extrasDir.parentFile, extrasDir.name[0..-7])
if (elasticsearchDir.exists() == false) {
throw new GradleException("${elasticsearchDir.name} is missing as a sibling to ${extrasDir.name}")
}
project(':').projectDir = new File(elasticsearchDir, 'buildSrc')


@@ -0,0 +1,29 @@
<?xml version="1.0"?>
<!DOCTYPE suppressions PUBLIC
"-//Puppy Crawl//DTD Suppressions 1.1//EN"
"http://www.puppycrawl.com/dtds/suppressions_1_1.dtd">
<suppressions>
<!-- On Windows, Checkstyle matches files using \ path separator -->
<!-- These files are generated by ANTLR so its silly to hold them to our rules. -->
<suppress files="plugin[/\\]sql[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]sql[/\\]parser[/\\]SqlBase(Base(Listener|Visitor)|Lexer|Listener|Parser|Visitor).java" checks="." />
<suppress files="plugin[/\\]core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]ml[/\\]action[/\\]StopDatafeedAction.java" checks="LineLength" />
<suppress files="plugin[/\\]ml[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]ml[/\\]utils[/\\]DomainSplitFunction.java" checks="LineLength" />
<suppress files="plugin[/\\]core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]persistent[/\\]CompletionPersistentTaskAction.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]Security.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]Realms.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]ActiveDirectorySIDUtil.java" checks="LineLength" />
<suppress files="plugin[/\\]ml[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]ml[/\\]integration[/\\]TooManyJobsIT.java" checks="LineLength" />
<suppress files="plugin[/\\]core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]persistent[/\\]TestPersistentTasksPlugin.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]action[/\\]user[/\\]TransportGetUsersActionTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]file[/\\]FileRealmTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]ActiveDirectoryRealmTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]ActiveDirectorySessionFactoryTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]LdapRealmTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]LdapSessionFactoryTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]LdapUserSearchSessionFactoryTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]ldap[/\\]support[/\\]SessionFactoryTests.java" checks="LineLength" />
<suppress files="plugin[/\\]security[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]xpack[/\\]security[/\\]authc[/\\]pki[/\\]PkiRealmTests.java" checks="LineLength" />
<suppress files="qa[/\\]security-example-extension[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]example[/\\]realm[/\\]CustomRealmTests.java" checks="LineLength" />
</suppressions>

195
x-pack/dev-tools/ci Executable file

@@ -0,0 +1,195 @@
#!/bin/bash
# This script is used as a single command to run the x-pack tests.
#
# It will attempt to check out 'elasticsearch' into a sibling directory
# unless the environment variable `USE_EXISTING_ES` has a value. The
# branch of elasticsearch which will be checked out depends on
# environment variables. If running locally, set GIT_BRANCH. When
# running in Jenkins, that env var is set. When running a PR
# jenkins job, the variables PR_SOURCE_BRANCH and PR_TARGET_BRANCH
# will be set and the source branch will be looked for in elasticsearch
# before falling back to the target branch name.
#
# It will also attempt to install the appropriate version of node.js
# for the Kibana plugin tests using nvm, unless
# `xpack.kibana.build=false` is defined in
# ~/.gradle/gradle.properties. Set a custom nvm directory using the
# `NVM_DIR` environment variable.
#
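# Example invocations (hypothetical local runs; the single argument selects the
# set of Gradle targets and defaults to 'check'):
#   GIT_BRANCH=origin/master ./dev-tools/ci intake
#   USE_EXISTING_ES=1 ./dev-tools/ci packagingTest
#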
# Turn on semi-strict mode
set -e
set -o pipefail
# Allow the user to choose a different test target through a single CLI arg
# default to `check` if no argument has been supplied
key=${1-check}
case $key in
intake)
GRADLE_CLI_ARGS=(
"--info"
"compileJava"
"compileTestJava"
"precommit"
"check"
"-Dtests.network=true"
"-Dtests.badapples=true"
)
;;
packagingTest)
GRADLE_CLI_ARGS=(
"--info"
"-Pvagrant.boxes=all"
"packagingTest"
)
;;
packagingTestSample)
GRADLE_CLI_ARGS=(
"--info"
"-Pvagrant.boxes=sample"
"packagingTest"
)
;;
bwcTest)
GRADLE_CLI_ARGS=(
"--info"
"bwcTest"
)
;;
check)
GRADLE_CLI_ARGS=(
"--info"
"check"
"-Dtests.network=true"
"-Dtests.badapples=true"
)
;;
releaseTest)
GRADLE_CLI_ARGS=(
"--info"
"check"
"-Dtests.network=true"
"-Dtests.badapples=true"
"-Dbuild.snapshot=false"
"-Dlicense.key=/etc/x-pack/license.key"
"-Dtests.jvm.argline=-Dbuild.snapshot=false"
)
;;
*)
echo "Unsupported cli argument $1. Allowed arguments are packagingTest or check. No argument defaults to check."
exit 1;;
esac
SCRIPT="$0"
# SCRIPT may be an arbitrarily deep series of symlinks. Loop until we have the concrete path.
while [ -h "$SCRIPT" ] ; do
ls=$(ls -ld "$SCRIPT")
# Drop everything prior to ->
link=$(expr "$ls" : '.*-> \(.*\)$')
if expr "$link" : '/.*' > /dev/null; then
SCRIPT="$link"
else
SCRIPT=$(dirname "$SCRIPT")/"$link"
fi
done
# determine base directory
BASE_DIR=$(dirname "$SCRIPT")/..
# make BASE_DIR absolute
BASE_DIR=$(cd "$BASE_DIR"; pwd)
PARENT_DIR=$(cd "$BASE_DIR"/../..; pwd)
# go to the parent directory
cd $PARENT_DIR
if [ -z ${USE_EXISTING_ES:+x} ]; then
if [ -d "./elasticsearch" ]; then
echo "I expected a clean workspace but an 'elasticsearch' sibling directory already exists in [$PARENT_DIR]!"
echo
echo "Either define 'USE_EXISTING_ES' or remove the existing 'elasticsearch' sibling."
exit 1
fi
function pick_clone_target {
echo "picking which branch of elasticsearch to clone"
# PR_* are provided by the CI git plugin for pull requests
if [[ -n "$PR_AUTHOR" && -n "$PR_SOURCE_BRANCH" ]]; then
GH_USER="$PR_AUTHOR"
BRANCH="$PR_SOURCE_BRANCH"
echo " -> using pull request author $GH_USER and branch $BRANCH"
if [[ -n "$(git ls-remote --heads https://github.com/$GH_USER/elasticsearch.git $BRANCH 2>/dev/null)" ]]; then
return
fi
fi
GH_USER="elastic"
# GIT_BRANCH is provided by normal CI runs. It starts with the repo, i.e., origin/master
# If we are not in CI, we fall back to the master branch
BRANCH="${PR_TARGET_BRANCH:-${GIT_BRANCH#*/}}"
BRANCH="${BRANCH:-master}"
echo " -> using CI branch $BRANCH from elastic repo"
}
pick_clone_target
DEPTH=1
if [ -n "$BUILD_METADATA" ]; then
IFS=';' read -ra metadata <<< "$BUILD_METADATA"
for kv in "${metadata[@]}"; do
IFS='=' read -ra key_value <<< "$kv"
if [ "${key_value[0]}" == "git_ref_elasticsearch" ]; then
# Force checked out hash if build metadata is set. We use a depth of 100, which
# assumes there are no more than 100 commits between head of the branch and
# last-good-commit. This is still quite a bit faster than pulling the entire history.
ES_REF="${key_value[1]}"
DEPTH=100
fi
done
fi
echo " -> checking out '$BRANCH' branch from $GH_USER/elasticsearch..."
git clone -b $BRANCH "https://github.com/$GH_USER/elasticsearch.git" --depth=$DEPTH
if [ ! -z $ES_REF ]; then
echo " -> using elasticsearch ref from build metadata: $ES_REF"
git -C elasticsearch checkout $ES_REF
else
ES_REF="$(git -C elasticsearch rev-parse HEAD)"
fi
echo " -> checked out elasticsearch revision: $ES_REF"
echo
else
if [ -d "./elasticsearch" ]; then
echo "Using existing 'elasticsearch' checkout"
else
echo "You have defined 'USE_EXISTING_ES' but no existing Elasticsearch directory exists!"
exit 2
fi
fi
# back to base directory
cd "$BASE_DIR"
echo "Running x-pack-elasticsearch tests..."
echo "Running in $PWD"
# output the commands
set -xuf
# clean
./gradlew --stacktrace clean -Dorg.gradle.java.home=${RUNTIME_JAVA_HOME:-$JAVA_HOME}
# Actually run the tests
GRADLE_CLI_ARGS+=("-Dorg.gradle.java.home=${RUNTIME_JAVA_HOME:-$JAVA_HOME}")
./gradlew "${GRADLE_CLI_ARGS[@]}"
# write the ES hash we checked out to build metadata
mkdir build
echo "git_ref_elasticsearch=$ES_REF" > build/build_metadata
# ~*~ shell-script-mode ~*~


@@ -0,0 +1,184 @@
#!/usr/bin/env perl
use strict;
use warnings;
use HTTP::Tiny;
use IO::Socket::SSL 1.52;
use utf8;
use Getopt::Long;
my $Base_URL = "https://api.github.com/repos/";
my $User_Repo = 'elastic/x-pack-elasticsearch/';
my $Issue_URL = "https://github.com/${User_Repo}issues";
use JSON();
use URI();
use URI::Escape qw(uri_escape_utf8);
our $json = JSON->new->utf8(1);
our $http = HTTP::Tiny->new(
default_headers => {
Accept => "application/vnd.github.v3+json",
Authorization => load_github_key()
}
);
my %Opts = ( state => 'open' );
GetOptions(
\%Opts, #
'state=s', 'labels=s', 'add=s', 'remove=s'
) || exit usage();
die usage('--state must be one of open|all|closed')
unless $Opts{state} =~ /^(open|all|closed)$/;
die usage('--labels is required') unless $Opts{labels};
die usage('Either --add or --remove is required')
unless $Opts{add} || $Opts{remove};
relabel();
#===================================
sub relabel {
#===================================
my @remove = split /,/, ( $Opts{remove} || '' );
my @add = split /,/, ( $Opts{add} || '' );
my $add_json = $json->encode( \@add );
my $url = URI->new( $Base_URL . $User_Repo . 'issues' );
$url->query_form(
state => $Opts{state},
labels => $Opts{labels},
per_page => 100
);
my $spool = Spool->new($url);
while ( my $issue = $spool->next ) {
my $id = $issue->{number};
print "$Issue_URL/$id\n";
if (@add) {
add_label( $id, $add_json );
}
for (@remove) {
remove_label( $id, $_ );
}
}
print "Done\n";
}
#===================================
sub add_label {
#===================================
my ( $id, $json ) = @_;
my $response = $http->post(
$Base_URL . $User_Repo . "issues/$id/labels",
{ content => $json,
headers => { "Content-Type" => "application/json; charset=utf-8" }
}
);
die "$response->{status} $response->{reason}\n"
unless $response->{success};
}
#===================================
sub remove_label {
#===================================
my ( $id, $name ) = @_;
my $url
= $Base_URL
. $User_Repo
. "issues/$id/labels/"
. uri_escape_utf8($name);
my $response = $http->delete($url);
die "$response->{status} $response->{reason}\n"
unless $response->{success};
}
#===================================
sub load_github_key {
#===================================
my ($file) = glob("~/.github_auth");
unless ( -e $file ) {
warn "File ~/.github_auth doesn't exist - using anonymous API. "
. "Generate a Personal Access Token at https://github.com/settings/applications\n";
return '';
}
open my $fh, $file or die "Couldn't open $file: $!";
my ($key) = <$fh> || die "Couldn't read $file: $!";
$key =~ s/^\s+//;
$key =~ s/\s+$//;
die "Invalid GitHub key: $key"
unless $key =~ /^[0-9a-f]{40}$/;
return "token $key";
}
#===================================
sub usage {
#===================================
my $msg = shift || '';
if ($msg) {
$msg = "\nERROR: $msg\n\n";
}
return $msg . <<"USAGE";
$0 --state=open|closed|all --labels=foo,bar --add=new1,new2 --remove=old1,old2
USAGE
}
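# Spool pages lazily through the GitHub issues API, refilling its buffer by
# following the rel="next" Link header until no further pages remain.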
package Spool;
use strict;
use warnings;
#===================================
sub new {
#===================================
my $class = shift;
my $url = shift;
return bless {
url => $url,
buffer => []
},
$class;
}
#===================================
sub next {
#===================================
my $self = shift;
if ( @{ $self->{buffer} } == 0 ) {
$self->refill;
}
return shift @{ $self->{buffer} };
}
#===================================
sub refill {
#===================================
my $self = shift;
return unless $self->{url};
my $response = $http->get( $self->{url} );
die "$response->{status} $response->{reason}\n"
unless $response->{success};
$self->{url} = '';
if ( my $link = $response->{headers}{link} ) {
my @links = ref $link eq 'ARRAY' ? @$link : $link;
for my $l (@links) {
next unless $l =~ /<([^>]+)>; rel="next"/;
$self->{url} = $1;
last;
}
}
push @{ $self->{buffer} }, @{ $json->decode( $response->{content} ) };
}


@@ -0,0 +1,200 @@
# Smoke-tests an x-pack release candidate
#
# 1. Downloads the zip file from the staging URL
# 2. Installs the x-pack plugin
# 3. Starts one node from the zip package and checks:
# -- that the x-pack plugin is loaded
# -- the xpack info endpoint, verifying the response returns the correct version and feature set info
#
# USAGE:
#
# python3 -B ./dev-tools/smoke_test_rc.py --version 5.0.0-beta1 --hash bfa3e47
#
import argparse
import tempfile
import os
import signal
import shutil
import urllib
import urllib.request
import time
import json
import base64
from http.client import HTTPConnection
# in case of debug, uncomment
# HTTPConnection.debuglevel = 4
try:
JAVA_HOME = os.environ['JAVA_HOME']
except KeyError:
raise RuntimeError("""
Please set JAVA_HOME in the env before running release tool
On OSX use: export JAVA_HOME=`/usr/libexec/java_home -v '1.8*'`""")
def java_exe():
path = JAVA_HOME
return 'export JAVA_HOME="%s" PATH="%s/bin:$PATH" JAVACMD="%s/bin/java"' % (path, path, path)
def verify_java_version(version):
s = os.popen('%s; java -version 2>&1' % java_exe()).read()
if ' version "%s.' % version not in s:
raise RuntimeError('got wrong version for java %s:\n%s' % (version, s))
def read_fully(file):
with open(file, encoding='utf-8') as f:
return f.read()
def wait_for_node_startup(es_dir, timeout=60, headers={}):
print(' Waiting until node becomes available for at most %s seconds' % timeout)
for _ in range(timeout):
conn = None
try:
time.sleep(1)
host = get_host_from_ports_file(es_dir)
conn = HTTPConnection(host, timeout=1)
conn.request('GET', '/', headers=headers)
res = conn.getresponse()
if res.status == 200:
return True
except IOError as e:
pass
#that is ok it might not be there yet
finally:
if conn:
conn.close()
return False
def download_release(version, release_hash, url):
print('Downloading release %s from %s' % (version, url))
tmp_dir = tempfile.mkdtemp()
try:
downloaded_files = []
print(' ' + '*' * 80)
print(' Downloading %s' % (url))
file = ('elasticsearch-%s.zip' % version)
artifact_path = os.path.join(tmp_dir, file)
downloaded_files.append(artifact_path)
urllib.request.urlretrieve(url, os.path.join(tmp_dir, file))
print(' ' + '*' * 80)
print()
smoke_test_release(version, downloaded_files, release_hash)
print(' SUCCESS')
finally:
shutil.rmtree(tmp_dir)
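# The node is started below with -Enode.portsfile=true, so it writes its bound HTTP
# address to logs/http.ports; the first entry is used to reach the node.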
def get_host_from_ports_file(es_dir):
return read_fully(os.path.join(es_dir, 'logs/http.ports')).splitlines()[0]
def smoke_test_release(release, files, release_hash):
for release_file in files:
if not os.path.isfile(release_file):
raise RuntimeError('Smoketest failed missing file %s' % (release_file))
tmp_dir = tempfile.mkdtemp()
run('unzip %s -d %s' % (release_file, tmp_dir))
es_dir = os.path.join(tmp_dir, 'elasticsearch-%s' % (release))
es_run_path = os.path.join(es_dir, 'bin/elasticsearch')
print(' Smoke testing package [%s]' % release_file)
es_plugin_path = os.path.join(es_dir, 'bin/elasticsearch-plugin')
print(' Install xpack [%s]' % es_plugin_path)
run('%s; ES_JAVA_OPTS="-Des.plugins.staging=%s" %s install -b x-pack' % (java_exe(), release_hash, es_plugin_path))
headers = { 'Authorization' : 'Basic %s' % base64.b64encode(b"es_admin:foobar").decode("UTF-8") }
es_shield_path = os.path.join(es_dir, 'bin/x-pack/users')
print(" Install dummy shield user")
run('%s; %s useradd es_admin -r superuser -p foobar' % (java_exe(), es_shield_path))
print(' Starting elasticsearch daemon from [%s]' % es_dir)
try:
run('%s; %s -Enode.name=smoke_tester -Ecluster.name=prepare_release -Erepositories.url.allowed_urls=http://snapshot.test* %s -Epidfile=%s -Enode.portsfile=true'
% (java_exe(), es_run_path, '-d', os.path.join(es_dir, 'es-smoke.pid')))
if not wait_for_node_startup(es_dir, headers=headers):
print("elasticsearch logs:")
print('*' * 80)
logs = read_fully(os.path.join(es_dir, 'logs/prepare_release.log'))
print(logs)
print('*' * 80)
raise RuntimeError('server didn\'t start up')
try: # we now get / and /_nodes to fetch basic infos like hashes etc and the installed plugins
host = get_host_from_ports_file(es_dir)
conn = HTTPConnection(host, timeout=20)
# check if plugin is loaded
conn.request('GET', '/_nodes/plugins?pretty=true', headers=headers)
res = conn.getresponse()
if res.status == 200:
nodes = json.loads(res.read().decode("utf-8"))['nodes']
for _, node in nodes.items():
node_plugins = node['plugins']
for node_plugin in node_plugins:
if node_plugin['name'] != 'x-pack':
raise RuntimeError('Unexpected plugin %s, expected x-pack only' % node_plugin['name'])
else:
raise RuntimeError('Expected HTTP 200 but got %s' % res.status)
# check if license is the default one
# also sleep for few more seconds, as the initial license generation might take some time
time.sleep(5)
conn.request('GET', '/_xpack', headers=headers)
res = conn.getresponse()
if res.status == 200:
xpack = json.loads(res.read().decode("utf-8"))
if xpack['license']['type'] != 'trial':
raise RuntimeError('expected license type to be trial, was %s' % xpack['license']['type'])
if xpack['license']['mode'] != 'trial':
raise RuntimeError('expected license mode to be trial, was %s' % xpack['license']['mode'])
if xpack['license']['status'] != 'active':
raise RuntimeError('expected license status to be active, was %s' % xpack['license']['status'])
else:
raise RuntimeError('Expected HTTP 200 but got %s' % res.status)
finally:
conn.close()
finally:
pid_path = os.path.join(es_dir, 'es-smoke.pid')
if os.path.exists(pid_path): # try reading the pid and kill the node
pid = int(read_fully(pid_path))
os.kill(pid, signal.SIGKILL)
shutil.rmtree(tmp_dir)
print(' ' + '*' * 80)
print()
# console colors
COLOR_OK = '\033[92m'
COLOR_END = '\033[0m'
def run(command, env_vars=None):
if env_vars:
for key, value in env_vars.items():
os.putenv(key, value)
print('*** Running: %s%s%s' % (COLOR_OK, command, COLOR_END))
if os.system(command):
raise RuntimeError(' FAILED: %s' % (command))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='SmokeTests a Release Candidate from S3 staging repo')
parser.add_argument('--version', '-v', dest='version', default=None,
help='The Elasticsearch version to smoke-test', required=True)
parser.add_argument('--hash', '-r', dest='hash', default=None, required=True,
help='The sha1 short hash of the release git commit to smoketest')
parser.add_argument('--fetch_url', '-u', dest='url', default=None,
help='Fetched from the specified URL')
parser.set_defaults(hash=None)
parser.set_defaults(version=None)
parser.set_defaults(url=None)
args = parser.parse_args()
version = args.version
hash = args.hash
url = args.url
verify_java_version('1.8')
if url:
download_url = url
else:
download_url = 'https://staging.elastic.co/%s-%s/downloads/elasticsearch/elasticsearch-%s.zip' % (version, hash, version)
download_release(version, hash, download_url)


@@ -0,0 +1,253 @@
#!/usr/bin/env perl
# Licensed to Elasticsearch under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on
# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
use strict;
use warnings;
use HTTP::Tiny 0.070;
use IO::Socket::SSL 1.52;
use utf8;
my $Github_Key = load_github_key();
my $Base_URL = "https://${Github_Key}api.github.com/repos/";
my $User_Repo = 'elastic/x-pack-elasticsearch/';
my $Issue_URL = "http://github.com/${User_Repo}issues/";
my @Groups = (
"breaking", "breaking-java", "deprecation", "feature",
"enhancement", "bug", "regression", "upgrade", "non-issue", "build",
"docs", "test"
);
my %Group_Labels = (
breaking => 'Breaking changes',
'breaking-java' => 'Breaking Java changes',
build => 'Build',
deprecation => 'Deprecations',
docs => 'Docs',
feature => 'New features',
enhancement => 'Enhancements',
bug => 'Bug fixes',
regression => 'Regressions',
test => 'Tests',
upgrade => 'Upgrades',
"non-issue" => 'Non-issue',
other => 'NOT CLASSIFIED',
);
use JSON();
use Encode qw(encode_utf8);
my $json = JSON->new->utf8(1);
my %All_Labels = fetch_labels();
my $version = shift @ARGV
or dump_labels();
dump_labels("Unknown version '$version'")
unless $All_Labels{$version};
my $issues = fetch_issues($version);
dump_issues( $version, $issues );
#===================================
sub dump_issues {
#===================================
my $version = shift;
my $issues = shift;
$version =~ s/v//;
my ( $day, $month, $year ) = (gmtime)[ 3 .. 5 ];
$month++;
$year += 1900;
print <<"ASCIIDOC";
:issue: https://github.com/${User_Repo}issues/
:pull: https://github.com/${User_Repo}pull/
[[release-notes-$version]]
== $version Release Notes
ASCIIDOC
for my $group ( @Groups, 'other' ) {
my $group_issues = $issues->{$group} or next;
print "[[$group-$version]]\n"
. "[float]\n"
. "=== $Group_Labels{$group}\n\n";
for my $header ( sort keys %$group_issues ) {
my $header_issues = $group_issues->{$header};
print( $header || 'HEADER MISSING', "::\n" );
for my $issue (@$header_issues) {
my $title = $issue->{title};
if ( $issue->{state} eq 'open' ) {
$title .= " [OPEN]";
}
unless ( $issue->{pull_request} ) {
$title .= " [ISSUE]";
}
my $number = $issue->{number};
# print encode_utf8("* $title {pull}${number}[#${number}]");
print encode_utf8("* $title");
print "\n";
print encode_utf8("// https://github.com/${User_Repo}pull/${number}[#${number}]");
if ( my $related = $issue->{related_issues} ) {
my %uniq = map { $_ => 1 } @$related;
print keys %uniq > 1
? " (issues: "
: " (issue: ";
# print join ", ", map {"{issue}${_}[#${_}]"}
# print join ", ", map {"#${_}"}
print join ", ", map {"https://github.com/${User_Repo}issues/${_}[#${_}]"}
sort keys %uniq;
print ")";
}
print "\n";
}
print "\n";
}
print "\n\n";
}
}
#===================================
sub fetch_issues {
#===================================
my $version = shift;
my @issues;
my %seen;
for my $state ( 'open', 'closed' ) {
my $page = 1;
while (1) {
my $tranche
= fetch( $User_Repo
. 'issues?labels='
. $version
. '&pagesize=100&state='
. $state
. '&page='
. $page )
or die "Couldn't fetch issues for version '$version'";
push @issues, @$tranche;
for my $issue (@$tranche) {
next unless $issue->{pull_request};
for ( $issue->{body} =~ m{(?:#|${User_Repo}issues/)(\d+)}g ) {
$seen{$_}++;
push @{ $issue->{related_issues} }, $_;
}
}
$page++;
last unless @$tranche;
}
}
my %group;
ISSUE:
for my $issue (@issues) {
next if $seen{ $issue->{number} } && !$issue->{pull_request};
# uncomment for including/excluding PRs already issued in other versions
# next if grep {$_->{name}=~/^v2/} @{$issue->{labels}};
my %labels = map { $_->{name} => 1 } @{ $issue->{labels} };
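# labels beginning with ':' identify the area; the first such label (colon stripped) becomes the section header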
my ($header) = map { substr( $_, 1 ) } grep {/^:/} sort keys %labels;
$header ||= 'NOT CLASSIFIED';
for (@Groups) {
if ( $labels{$_} ) {
push @{ $group{$_}{$header} }, $issue;
next ISSUE;
}
}
push @{ $group{other}{$header} }, $issue;
}
return \%group;
}
#===================================
sub fetch_labels {
#===================================
my %all;
my $page = 1;
while (1) {
my $labels = fetch( $User_Repo . 'labels?page=' . $page++ )
or die "Couldn't retrieve version labels";
last unless @$labels;
for (@$labels) {
my $name = $_->{name};
next unless $name =~ /^v/;
$all{$name} = 1;
}
}
return %all;
}
#===================================
sub fetch {
#===================================
my $url = $Base_URL . shift();
# print "$url\n";
my $response = HTTP::Tiny->new->get($url);
# use Data::Dumper;
# print Dumper($response);
die "$response->{status} $response->{reason}\n"
unless $response->{success};
# print $response->{content};
return $json->decode( $response->{content} );
}
#===================================
sub load_github_key {
#===================================
my ($file) = glob("~/.github_auth");
unless ( -e $file ) {
warn "File ~/.github_auth doesn't exist - using anonymous API. "
. "Generate a personal access token that has repo scope. See https://github.com/elastic/dev/blob/master/shared/development_process.md \n";
return '';
}
open my $fh, $file or die "Couldn't open $file: $!";
my ($key) = <$fh> || die "Couldn't read $file: $!";
$key =~ s/^\s+//;
$key =~ s/\s+$//;
die "Invalid GitHub key: $key"
unless $key =~ /^[0-9a-f]{40}$/;
return "$key:x-oauth-basic@";
}
#===================================
sub dump_labels {
#===================================
my $error = shift || '';
if ($error) {
$error = "\nERROR: $error\n";
}
my $labels = join( "\n - ", '', ( sort keys %All_Labels ) );
die <<USAGE
$error
USAGE: $0 version > outfile
Known versions:$labels
USAGE
}


@@ -0,0 +1,270 @@
#!/usr/bin/env perl
# Licensed to Elasticsearch under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on
# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
use strict;
use warnings;
use HTTP::Tiny 0.070;
use IO::Socket::SSL 1.52;
use utf8;
my $Github_Key = load_github_key();
my $Base_URL = "https://${Github_Key}api.github.com/repos/";
my $User_Repo1 = 'elastic/x-pack-elasticsearch/';
my $Issue_URL1 = "http://github.com/${User_Repo1}issues/";
my $User_Repo2 = 'elastic/machine-learning-cpp/';
my $Issue_URL2 = "http://github.com/${User_Repo2}issues/";
my @Groups = (
"breaking", "breaking-java", "deprecation", "feature",
"enhancement", "bug", "regression", "upgrade", "non-issue", "build",
"docs", "test"
);
my %Group_Labels = (
breaking => 'Breaking changes',
'breaking-java' => 'Breaking Java changes',
build => 'Build',
deprecation => 'Deprecations',
docs => 'Docs',
feature => 'New features',
enhancement => 'Enhancements',
bug => 'Bug fixes',
regression => 'Regressions',
test => 'Tests',
upgrade => 'Upgrades',
"non-issue" => 'Non-issue',
other => 'NOT CLASSIFIED',
);
use JSON();
use Encode qw(encode_utf8);
my $json = JSON->new->utf8(1);
my %All_Labels1 = fetch_labels($User_Repo1);
my $version = shift @ARGV
or dump_labels();
dump_labels(\%All_Labels1, "Unknown version '$version'")
unless $All_Labels1{$version};
my $issues1 = fetch_issues($User_Repo1, $version);
# Repeat steps for second repo
my %All_Labels2 = fetch_labels($User_Repo2);
dump_labels(\%All_Labels2, "Unknown version '$version'")
unless $All_Labels2{$version};
my $issues2 = fetch_issues($User_Repo2, $version);
dump_issues( $User_Repo1, $version, $issues1 );
dump_issues( $User_Repo2, $version, $issues2 );
#===================================
sub dump_issues {
#===================================
my $User_Repo = shift;
my $version = shift;
my $issues = shift;
$version =~ s/v//;
my ( $day, $month, $year ) = (gmtime)[ 3 .. 5 ];
$month++;
$year += 1900;
print <<"ASCIIDOC";
[[release-notes-$version]]
== X-Pack $version Release Notes
// Pulled from $User_Repo
ASCIIDOC
for my $group ( @Groups, 'other' ) {
my $group_issues = $issues->{$group} or next;
print "[[$group-$version]]\n"
. "[float]\n"
. "=== $Group_Labels{$group}\n\n";
for my $header ( sort keys %$group_issues ) {
my $header_issues = $group_issues->{$header};
print( $header || 'HEADER MISSING', "::\n" );
for my $issue (@$header_issues) {
my $title = $issue->{title};
if ( $issue->{state} eq 'open' ) {
$title .= " [OPEN]";
}
unless ( $issue->{pull_request} ) {
$title .= " [ISSUE]";
}
my $number = $issue->{number};
# print encode_utf8("* $title {pull}${number}[#${number}]");
print encode_utf8("* $title");
print "\n";
print encode_utf8("// https://github.com/${User_Repo}pull/${number}[#${number}]");
if ( my $related = $issue->{related_issues} ) {
my %uniq = map { $_ => 1 } @$related;
print keys %uniq > 1
? " (issues: "
: " (issue: ";
# print join ", ", map {"{issue}${_}[#${_}]"}
# print join ", ", map {"#${_}"}
print join ", ", map {"https://github.com/${User_Repo}issues/${_}[#${_}]"}
sort keys %uniq;
print ")";
}
print "\n";
}
print "\n";
}
print "\n\n";
}
}
#===================================
sub fetch_issues {
#===================================
my $User_Repo = shift;
my $version = shift;
my @issues;
my %seen;
for my $state ( 'open', 'closed' ) {
my $page = 1;
while (1) {
my $tranche
= fetch( $User_Repo
. 'issues?labels='
. $version
. '&pagesize=100&state='
. $state
. '&page='
. $page )
or die "Couldn't fetch issues for version '$version'";
push @issues, @$tranche;
for my $issue (@$tranche) {
next unless $issue->{pull_request};
for ( $issue->{body} =~ m{(?:#|${User_Repo}issues/)(\d+)}g ) {
$seen{$_}++;
push @{ $issue->{related_issues} }, $_;
}
}
$page++;
last unless @$tranche;
}
}
my %group;
ISSUE:
for my $issue (@issues) {
next if $seen{ $issue->{number} } && !$issue->{pull_request};
# uncomment for including/excluding PRs already issued in other versions
# next if grep {$_->{name}=~/^v2/} @{$issue->{labels}};
my %labels = map { $_->{name} => 1 } @{ $issue->{labels} };
my ($header) = map { substr( $_, 1 ) } grep {/^:/} sort keys %labels;
$header ||= 'NOT CLASSIFIED';
for (@Groups) {
if ( $labels{$_} ) {
push @{ $group{$_}{$header} }, $issue;
next ISSUE;
}
}
push @{ $group{other}{$header} }, $issue;
}
return \%group;
}
#===================================
sub fetch_labels {
#===================================
my $User_Repo = shift;
my %all;
my $page = 1;
while (1) {
my $labels = fetch( $User_Repo . 'labels?page=' . $page++ )
or die "Couldn't retrieve version labels";
last unless @$labels;
for (@$labels) {
my $name = $_->{name};
next unless $name =~ /^v/;
$all{$name} = 1;
}
}
return %all;
}
#===================================
sub fetch {
#===================================
my $url = $Base_URL . shift();
# print "$url\n";
my $response = HTTP::Tiny->new->get($url);
# use Data::Dumper;
# print Dumper($response);
die "$response->{status} $response->{reason}\n"
unless $response->{success};
# print $response->{content};
return $json->decode( $response->{content} );
}
#===================================
sub load_github_key {
#===================================
my ($file) = glob("~/.github_auth");
unless ( -e $file ) {
warn "File ~/.github_auth doesn't exist - using anonymous API. "
. "Generate a personal access token that has repo scope. See https://github.com/elastic/dev/blob/master/shared/development_process.md \n";
return '';
}
open my $fh, $file or die "Couldn't open $file: $!";
my ($key) = <$fh> || die "Couldn't read $file: $!";
$key =~ s/^\s+//;
$key =~ s/\s+$//;
die "Invalid GitHub key: $key"
unless $key =~ /^[0-9a-f]{40}$/;
return "$key:x-oauth-basic@";
}
#===================================
sub dump_labels {
#===================================
# The caller passes the flattened label hash followed by an optional
# error message, so peel the trailing error off before rebuilding the hash.
my $error = @_ % 2 ? pop(@_) : '';
my %All_Labels = @_;
if ($error) {
$error = "\nERROR: $error\n";
}
my $labels = join( "\n - ", '', ( sort keys %All_Labels ) );
die <<USAGE
$error
USAGE: $0 version > outfile
Known versions:$labels
USAGE
}

683
x-pack/docs/build.gradle Normal file
View File

@ -0,0 +1,683 @@
import org.elasticsearch.gradle.test.NodeInfo
import java.nio.charset.StandardCharsets
apply plugin: 'elasticsearch.docs-test'
/* List of files that have snippets that probably should be converted to
* `// CONSOLE` and `// TESTRESPONSE` but have yet to be converted. Try and
* only remove entries from this list. When it is empty we'll remove it
* entirely and have a party! There will be cake and everything.... */
buildRestTests.expectedUnconvertedCandidates = [
'en/ml/functions/count.asciidoc',
'en/ml/functions/geo.asciidoc',
'en/ml/functions/info.asciidoc',
'en/ml/functions/metric.asciidoc',
'en/ml/functions/rare.asciidoc',
'en/ml/functions/sum.asciidoc',
'en/ml/functions/time.asciidoc',
'en/ml/aggregations.asciidoc',
'en/ml/customurl.asciidoc',
'en/monitoring/indices.asciidoc',
'en/rest-api/security/ssl.asciidoc',
'en/rest-api/security/users.asciidoc',
'en/rest-api/security/tokens.asciidoc',
'en/rest-api/watcher/put-watch.asciidoc',
'en/security/authentication/user-cache.asciidoc',
'en/security/authorization/field-and-document-access-control.asciidoc',
'en/security/authorization/run-as-privilege.asciidoc',
'en/security/ccs-clients-integrations/http.asciidoc',
'en/security/authorization/custom-roles-provider.asciidoc',
'en/watcher/actions/email.asciidoc',
'en/watcher/actions/hipchat.asciidoc',
'en/watcher/actions/index.asciidoc',
'en/watcher/actions/logging.asciidoc',
'en/watcher/actions/pagerduty.asciidoc',
'en/watcher/actions/slack.asciidoc',
'en/watcher/actions/jira.asciidoc',
'en/watcher/actions/webhook.asciidoc',
'en/watcher/condition/always.asciidoc',
'en/watcher/condition/array-compare.asciidoc',
'en/watcher/condition/compare.asciidoc',
'en/watcher/condition/never.asciidoc',
'en/watcher/condition/script.asciidoc',
'en/watcher/customizing-watches.asciidoc',
'en/watcher/example-watches/example-watch-meetupdata.asciidoc',
'en/watcher/how-watcher-works.asciidoc',
'en/watcher/input/chain.asciidoc',
'en/watcher/input/http.asciidoc',
'en/watcher/input/search.asciidoc',
'en/watcher/input/simple.asciidoc',
'en/watcher/transform.asciidoc',
'en/watcher/transform/chain.asciidoc',
'en/watcher/transform/script.asciidoc',
'en/watcher/transform/search.asciidoc',
'en/watcher/trigger/schedule/cron.asciidoc',
'en/watcher/trigger/schedule/daily.asciidoc',
'en/watcher/trigger/schedule/hourly.asciidoc',
'en/watcher/trigger/schedule/interval.asciidoc',
'en/watcher/trigger/schedule/monthly.asciidoc',
'en/watcher/trigger/schedule/weekly.asciidoc',
'en/watcher/trigger/schedule/yearly.asciidoc',
'en/watcher/troubleshooting.asciidoc',
'en/rest-api/license/delete-license.asciidoc',
'en/rest-api/license/start-trial.asciidoc',
'en/rest-api/license/update-license.asciidoc',
'en/ml/api-quickref.asciidoc',
'en/rest-api/ml/delete-calendar-event.asciidoc',
'en/rest-api/ml/delete-snapshot.asciidoc',
'en/rest-api/ml/forecast.asciidoc',
'en/rest-api/ml/get-bucket.asciidoc',
'en/rest-api/ml/get-job-stats.asciidoc',
'en/rest-api/ml/get-overall-buckets.asciidoc',
'en/rest-api/ml/get-category.asciidoc',
'en/rest-api/ml/get-record.asciidoc',
'en/rest-api/ml/get-influencer.asciidoc',
'en/rest-api/ml/get-snapshot.asciidoc',
'en/rest-api/ml/post-data.asciidoc',
'en/rest-api/ml/preview-datafeed.asciidoc',
'en/rest-api/ml/revert-snapshot.asciidoc',
'en/rest-api/ml/update-snapshot.asciidoc',
'en/rest-api/ml/validate-detector.asciidoc',
'en/rest-api/ml/validate-job.asciidoc',
'en/rest-api/security/authenticate.asciidoc',
'en/rest-api/watcher/stats.asciidoc',
'en/security/authorization.asciidoc',
'en/watcher/example-watches/watching-time-series-data.asciidoc',
]
dependencies {
testCompile project(path: xpackModule('core'), configuration: 'runtime')
testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
testCompile project(path: xpackProject('plugin').path, configuration: 'testArtifacts')
}
Closure waitWithAuth = { NodeInfo node, AntBuilder ant ->
File tmpFile = new File(node.cwd, 'wait.success')
// wait up to twenty seconds
final long stopTime = System.currentTimeMillis() + 20000L;
Exception lastException = null;
while (System.currentTimeMillis() < stopTime) {
lastException = null;
// we use custom wait logic here as the elastic user is not available immediately and ant.get will fail when a 401 is returned
HttpURLConnection httpURLConnection = null;
try {
httpURLConnection = (HttpURLConnection) new URL("http://${node.httpUri()}/_cluster/health").openConnection();
httpURLConnection.setRequestProperty("Authorization", "Basic " +
Base64.getEncoder().encodeToString("test_admin:x-pack-test-password".getBytes(StandardCharsets.UTF_8)));
httpURLConnection.setRequestMethod("GET");
httpURLConnection.setConnectTimeout(1000);
httpURLConnection.setReadTimeout(30000);
httpURLConnection.connect();
if (httpURLConnection.getResponseCode() == 200) {
tmpFile.withWriter StandardCharsets.UTF_8.name(), {
it.write(httpURLConnection.getInputStream().getText(StandardCharsets.UTF_8.name()))
}
break;
}
} catch (Exception e) {
logger.debug("failed to call cluster health", e)
lastException = e
} finally {
if (httpURLConnection != null) {
httpURLConnection.disconnect();
}
}
// did not start, so wait a bit before trying again
Thread.sleep(500L);
}
if (tmpFile.exists() == false && lastException != null) {
logger.error("final attempt of calling cluster health failed", lastException)
}
return tmpFile.exists()
}
integTestCluster {
plugin xpackProject('plugin').path
setting 'xpack.security.enabled', 'true'
setting 'xpack.security.authc.token.enabled', 'true'
// Disable monitoring exporters for the docs tests
setting 'xpack.monitoring.exporters._local.type', 'local'
setting 'xpack.monitoring.exporters._local.enabled', 'false'
setting 'xpack.license.self_generated.type', 'trial'
setupCommand 'setupTestAdmin',
'bin/x-pack/users', 'useradd', 'test_admin', '-p', 'x-pack-test-password', '-r', 'superuser'
waitCondition = waitWithAuth
}
buildRestTests.docs = fileTree(projectDir) {
// No snippets in here!
exclude 'build.gradle'
// That is where the snippets go, not where they come from!
exclude 'build'
// These files simply don't pass yet. We should figure out how to fix them.
exclude 'en/watcher/reference/actions.asciidoc'
exclude 'en/rest-api/graph/explore.asciidoc'
}
Map<String, String> setups = buildRestTests.setups
setups['my_inactive_watch'] = '''
- do:
xpack.watcher.put_watch:
id: "my_watch"
active: false
body: >
{
"trigger": {
"schedule": {
"hourly": {
"minute": [ 0, 5 ]
}
}
},
"input": {
"simple": {
"payload": {
"send": "yes"
}
}
},
"condition": {
"always": {}
},
"actions": {
"test_index": {
"index": {
"index": "test",
"doc_type": "test2"
}
}
}
}
- match: { _id: "my_watch" }
'''
setups['my_active_watch'] = setups['my_inactive_watch'].replace(
'active: false', 'active: true')
// Used by SQL because it looks SQL-ish
setups['library'] = '''
- do:
indices.create:
index: library
body:
settings:
number_of_shards: 1
number_of_replicas: 1
mappings:
book:
properties:
name:
type: text
fields:
keyword:
type: keyword
author:
type: text
fields:
keyword:
type: keyword
release_date:
type: date
page_count:
type: short
- do:
bulk:
index: library
type: book
refresh: true
body: |
{"index":{"_id": "Leviathan Wakes"}}
{"name": "Leviathan Wakes", "author": "James S.A. Corey", "release_date": "2011-06-02", "page_count": 561}
{"index":{"_id": "Hyperion"}}
{"name": "Hyperion", "author": "Dan Simmons", "release_date": "1989-05-26", "page_count": 482}
{"index":{"_id": "Dune"}}
{"name": "Dune", "author": "Frank Herbert", "release_date": "1965-06-01", "page_count": 604}
{"index":{"_id": "Dune Messiah"}}
{"name": "Dune Messiah", "author": "Frank Herbert", "release_date": "1969-10-15", "page_count": 331}
{"index":{"_id": "Children of Dune"}}
{"name": "Children of Dune", "author": "Frank Herbert", "release_date": "1976-04-21", "page_count": 408}
{"index":{"_id": "God Emperor of Dune"}}
{"name": "God Emperor of Dune", "author": "Frank Herbert", "release_date": "1981-05-28", "page_count": 454}
{"index":{"_id": "Consider Phlebas"}}
{"name": "Consider Phlebas", "author": "Iain M. Banks", "release_date": "1987-04-23", "page_count": 471}
{"index":{"_id": "Pandora's Star"}}
{"name": "Pandora's Star", "author": "Peter F. Hamilton", "release_date": "2004-03-02", "page_count": 768}
{"index":{"_id": "Revelation Space"}}
{"name": "Revelation Space", "author": "Alastair Reynolds", "release_date": "2000-03-15", "page_count": 585}
{"index":{"_id": "A Fire Upon the Deep"}}
{"name": "A Fire Upon the Deep", "author": "Vernor Vinge", "release_date": "1992-06-01", "page_count": 613}
{"index":{"_id": "Ender's Game"}}
{"name": "Ender's Game", "author": "Orson Scott Card", "release_date": "1985-06-01", "page_count": 324}
{"index":{"_id": "1984"}}
{"name": "1984", "author": "George Orwell", "release_date": "1985-06-01", "page_count": 328}
{"index":{"_id": "Fahrenheit 451"}}
{"name": "Fahrenheit 451", "author": "Ray Bradbury", "release_date": "1953-10-15", "page_count": 227}
{"index":{"_id": "Brave New World"}}
{"name": "Brave New World", "author": "Aldous Huxley", "release_date": "1932-06-01", "page_count": 268}
{"index":{"_id": "Foundation"}}
{"name": "Foundation", "author": "Isaac Asimov", "release_date": "1951-06-01", "page_count": 224}
{"index":{"_id": "The Giver"}}
{"name": "The Giver", "author": "Lois Lowry", "release_date": "1993-04-26", "page_count": 208}
{"index":{"_id": "Slaughterhouse-Five"}}
{"name": "Slaughterhouse-Five", "author": "Kurt Vonnegut", "release_date": "1969-06-01", "page_count": 275}
{"index":{"_id": "The Hitchhiker's Guide to the Galaxy"}}
{"name": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "release_date": "1979-10-12", "page_count": 180}
{"index":{"_id": "Snow Crash"}}
{"name": "Snow Crash", "author": "Neal Stephenson", "release_date": "1992-06-01", "page_count": 470}
{"index":{"_id": "Neuromancer"}}
{"name": "Neuromancer", "author": "William Gibson", "release_date": "1984-07-01", "page_count": 271}
{"index":{"_id": "The Handmaid's Tale"}}
{"name": "The Handmaid's Tale", "author": "Margaret Atwood", "release_date": "1985-06-01", "page_count": 311}
{"index":{"_id": "Starship Troopers"}}
{"name": "Starship Troopers", "author": "Robert A. Heinlein", "release_date": "1959-12-01", "page_count": 335}
{"index":{"_id": "The Left Hand of Darkness"}}
{"name": "The Left Hand of Darkness", "author": "Ursula K. Le Guin", "release_date": "1969-06-01", "page_count": 304}
{"index":{"_id": "The Moon is a Harsh Mistress"}}
{"name": "The Moon is a Harsh Mistress", "author": "Robert A. Heinlein", "release_date": "1966-04-01", "page_count": 288}
'''
setups['server_metrics_index'] = '''
- do:
indices.create:
index: server-metrics
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
metric:
properties:
timestamp:
type: date
total:
type: long
'''
setups['server_metrics_data'] = setups['server_metrics_index'] + '''
- do:
bulk:
index: server-metrics
type: metric
refresh: true
body: |
{"index": {"_id":"1177"}}
{"timestamp":"2017-03-23T13:00:00","total":40476}
{"index": {"_id":"1178"}}
{"timestamp":"2017-03-23T13:00:00","total":15287}
{"index": {"_id":"1179"}}
{"timestamp":"2017-03-23T13:00:00","total":-776}
{"index": {"_id":"1180"}}
{"timestamp":"2017-03-23T13:00:00","total":11366}
{"index": {"_id":"1181"}}
{"timestamp":"2017-03-23T13:00:00","total":3606}
{"index": {"_id":"1182"}}
{"timestamp":"2017-03-23T13:00:00","total":19006}
{"index": {"_id":"1183"}}
{"timestamp":"2017-03-23T13:00:00","total":38613}
{"index": {"_id":"1184"}}
{"timestamp":"2017-03-23T13:00:00","total":19516}
{"index": {"_id":"1185"}}
{"timestamp":"2017-03-23T13:00:00","total":-258}
{"index": {"_id":"1186"}}
{"timestamp":"2017-03-23T13:00:00","total":9551}
{"index": {"_id":"1187"}}
{"timestamp":"2017-03-23T13:00:00","total":11217}
{"index": {"_id":"1188"}}
{"timestamp":"2017-03-23T13:00:00","total":22557}
{"index": {"_id":"1189"}}
{"timestamp":"2017-03-23T13:00:00","total":40508}
{"index": {"_id":"1190"}}
{"timestamp":"2017-03-23T13:00:00","total":11887}
{"index": {"_id":"1191"}}
{"timestamp":"2017-03-23T13:00:00","total":31659}
'''
setups['server_metrics_job'] = setups['server_metrics_data'] + '''
- do:
xpack.ml.put_job:
job_id: "total-requests"
body: >
{
"description" : "Total sum of requests",
"analysis_config" : {
"bucket_span":"10m",
"detectors" :[
{
"detector_description": "Sum of total",
"function": "sum",
"field_name": "total"
}
]},
"data_description" : {
"time_field":"timestamp",
"time_format": "epoch_ms"
}
}
'''
setups['server_metrics_datafeed'] = setups['server_metrics_job'] + '''
- do:
xpack.ml.put_datafeed:
datafeed_id: "datafeed-total-requests"
body: >
{
"job_id":"total-requests",
"indexes":"server-metrics"
}
'''
setups['server_metrics_openjob'] = setups['server_metrics_datafeed'] + '''
- do:
xpack.ml.open_job:
job_id: "total-requests"
'''
setups['server_metrics_startdf'] = setups['server_metrics_openjob'] + '''
- do:
xpack.ml.start_datafeed:
datafeed_id: "datafeed-total-requests"
'''
setups['calendar_outages'] = '''
- do:
xpack.ml.put_calendar:
calendar_id: "planned-outages"
'''
setups['calendar_outages_addevent'] = setups['calendar_outages'] + '''
- do:
xpack.ml.post_calendar_events:
calendar_id: "planned-outages"
body: >
{ "description": "event 1", "start_time": "2017-12-01T00:00:00Z", "end_time": "2017-12-02T00:00:00Z", "calendar_id": "planned-outages" }
'''
setups['calendar_outages_openjob'] = setups['server_metrics_openjob'] + '''
- do:
xpack.ml.put_calendar:
calendar_id: "planned-outages"
'''
setups['calendar_outages_addjob'] = setups['server_metrics_openjob'] + '''
- do:
xpack.ml.put_calendar:
calendar_id: "planned-outages"
body: >
{
"job_ids": ["total-requests"]
}
'''
setups['calendar_outages_addevent'] = setups['calendar_outages_addjob'] + '''
- do:
xpack.ml.post_calendar_events:
calendar_id: "planned-outages"
body: >
{ "events" : [
{ "description": "event 1", "start_time": "1513641600000", "end_time": "1513728000000"},
{ "description": "event 2", "start_time": "1513814400000", "end_time": "1513900800000"},
{ "description": "event 3", "start_time": "1514160000000", "end_time": "1514246400000"}
]}
'''
setups['role_mapping'] = '''
- do:
xpack.security.put_role_mapping:
name: "mapping1"
body: >
{
"enabled": true,
"roles": [ "user" ],
"rules": { "field": { "username": "*" } }
}
'''
setups['sensor_rollup_job'] = '''
- do:
indices.create:
index: sensor-1
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
_doc:
properties:
timestamp:
type: date
temperature:
type: long
voltage:
type: float
node:
type: keyword
- do:
xpack.rollup.put_job:
id: "sensor"
body: >
{
"index_pattern": "sensor-*",
"rollup_index": "sensor_rollup",
"cron": "*/30 * * * * ?",
"page_size" :1000,
"groups" : {
"date_histogram": {
"field": "timestamp",
"interval": "1h",
"delay": "7d"
},
"terms": {
"fields": ["node"]
}
},
"metrics": [
{
"field": "temperature",
"metrics": ["min", "max", "sum"]
},
{
"field": "voltage",
"metrics": ["avg"]
}
]
}
'''
setups['sensor_started_rollup_job'] = '''
- do:
indices.create:
index: sensor-1
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
_doc:
properties:
timestamp:
type: date
temperature:
type: long
voltage:
type: float
node:
type: keyword
- do:
bulk:
index: sensor-1
type: _doc
refresh: true
body: |
{"index":{}}
{"timestamp": 1516729294000, "temperature": 200, "voltage": 5.2, "node": "a"}
{"index":{}}
{"timestamp": 1516642894000, "temperature": 201, "voltage": 5.8, "node": "b"}
{"index":{}}
{"timestamp": 1516556494000, "temperature": 202, "voltage": 5.1, "node": "a"}
{"index":{}}
{"timestamp": 1516470094000, "temperature": 198, "voltage": 5.6, "node": "b"}
{"index":{}}
{"timestamp": 1516383694000, "temperature": 200, "voltage": 4.2, "node": "c"}
{"index":{}}
{"timestamp": 1516297294000, "temperature": 202, "voltage": 4.0, "node": "c"}
- do:
xpack.rollup.put_job:
id: "sensor"
body: >
{
"index_pattern": "sensor-*",
"rollup_index": "sensor_rollup",
"cron": "* * * * * ?",
"page_size" :1000,
"groups" : {
"date_histogram": {
"field": "timestamp",
"interval": "1h",
"delay": "7d"
},
"terms": {
"fields": ["node"]
}
},
"metrics": [
{
"field": "temperature",
"metrics": ["min", "max", "sum"]
},
{
"field": "voltage",
"metrics": ["avg"]
}
]
}
- do:
xpack.rollup.start_job:
id: "sensor"
'''
setups['sensor_index'] = '''
- do:
indices.create:
index: sensor-1
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
_doc:
properties:
timestamp:
type: date
temperature:
type: long
voltage:
type: float
node:
type: keyword
load:
type: double
net_in:
type: long
net_out:
type: long
hostname:
type: keyword
datacenter:
type: keyword
'''
setups['sensor_prefab_data'] = '''
- do:
indices.create:
index: sensor-1
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
_doc:
properties:
timestamp:
type: date
temperature:
type: long
voltage:
type: float
node:
type: keyword
- do:
indices.create:
index: sensor_rollup
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
_doc:
properties:
node.terms.value:
type: keyword
temperature.sum.value:
type: double
temperature.max.value:
type: double
temperature.min.value:
type: double
timestamp.date_histogram.time_zone:
type: keyword
timestamp.date_histogram.interval:
type: keyword
timestamp.date_histogram.timestamp:
type: date
timestamp.date_histogram._count:
type: long
voltage.avg.value:
type: double
voltage.avg._count:
type: long
_rollup.id:
type: keyword
_rollup.version:
type: long
_meta:
_rollup:
sensor:
cron: "* * * * * ?"
rollup_index: "sensor_rollup"
index_pattern: "sensor-*"
timeout: "20s"
page_size: 1000
groups:
date_histogram:
delay: "7d"
field: "timestamp"
interval: "1h"
time_zone: "UTC"
terms:
fields:
- "node"
id: sensor
metrics:
- field: "temperature"
metrics:
- min
- max
- sum
- field: "voltage"
metrics:
- avg
- do:
bulk:
index: sensor_rollup
type: _doc
refresh: true
body: |
{"index":{}}
{"node.terms.value":"b","temperature.sum.value":201.0,"temperature.max.value":201.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":201.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":5.800000190734863,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516640400000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
{"index":{}}
{"node.terms.value":"c","temperature.sum.value":200.0,"temperature.max.value":200.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":200.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":4.199999809265137,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516381200000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
{"index":{}}
{"node.terms.value":"a","temperature.sum.value":202.0,"temperature.max.value":202.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":202.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":5.099999904632568,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516554000000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
{"index":{}}
{"node.terms.value":"a","temperature.sum.value":200.0,"temperature.max.value":200.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":200.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":5.199999809265137,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516726800000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
{"index":{}}
{"node.terms.value":"b","temperature.sum.value":198.0,"temperature.max.value":198.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":198.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":5.599999904632568,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516467600000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
{"index":{}}
{"node.terms.value":"c","temperature.sum.value":202.0,"temperature.max.value":202.0,"timestamp.date_histogram.time_zone":"UTC","temperature.min.value":202.0,"timestamp.date_histogram._count":1,"timestamp.date_histogram.interval":"1h","_rollup.computed":["temperature.sum","temperature.min","voltage.avg","temperature.max","node.terms","timestamp.date_histogram"],"voltage.avg.value":4.0,"node.terms._count":1,"_rollup.version":1,"timestamp.date_histogram.timestamp":1516294800000,"voltage.avg._count":1.0,"_rollup.id":"sensor"}
'''

View File

@ -0,0 +1,157 @@
[role="xpack"]
[[certgen]]
== certgen
deprecated[6.1,Replaced by <<certutil,`certutil`>>.]
The `certgen` command simplifies the creation of certificate authorities (CA),
certificate signing requests (CSR), and signed certificates for use with the
Elastic Stack. Although this command is deprecated, you do not need to replace
any CAs, CSRs, or certificates that it created.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/certgen
(([--cert <cert_file>] [--days <n>] [--dn <name>] [--key <key_file>]
[--keysize <bits>] [--pass <password>] [--p12 <password>])
| [--csr])
[-E <KeyValuePair>] [-h, --help] [--in <input_file>] [--out <output_file>]
([-s, --silent] | [-v, --verbose])
--------------------------------------------------
[float]
=== Description
By default, the command runs in interactive mode and you are prompted for
information about each instance. An instance is any piece of the Elastic Stack
that requires a Transport Layer Security (TLS) or SSL certificate. Depending on
your configuration, {es}, Logstash, {kib}, and Beats might all require a
certificate and private key.
The minimum required value for each instance is a name. This can simply be the
hostname, which is used as the Common Name of the certificate. You can also use
a full distinguished name. IP addresses and DNS names are optional. Multiple
values can be specified as a comma-separated string. If no IP addresses or DNS
names are provided, you might need to disable hostname verification in your TLS
or SSL configuration.
Depending on the parameters that you specify, you are also prompted for
necessary information such as the path for the output file and the CA private
key password.
The `certgen` command also supports a silent mode of operation to enable easier
batch operations. For more information, see <<certgen-silent>>.
The output file is a zip file that contains the signed certificates and private
keys for each instance. If you chose to generate a CA, which is the default
behavior, the certificate and private key are included in the output file. If
you chose to generate CSRs, you should provide them to your commercial or
organization-specific certificate authority to obtain signed certificates. The
signed certificates must be in PEM format to work with {security}.
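For example, the following sketch shows a plain interactive run and a run that
overrides the default key size, validity period, and output location; the
output file name is illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/certgen
bin/x-pack/certgen --days 730 --keysize 4096 --out my-certs.zip
--------------------------------------------------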
[float]
=== Parameters
`--cert <cert_file>`:: Specifies to generate new instance certificates and keys
using an existing CA certificate, which is provided in the `<cert_file>` argument.
This parameter cannot be used with the `--csr` parameter.
`--csr`:: Specifies to operate in certificate signing request mode.
`--days <n>`::
Specifies an integer value that represents the number of days the generated keys
are valid. The default value is `1095`. This parameter cannot be used with the
`--csr` parameter.
`--dn <name>`::
Defines the _Distinguished Name_ that is used for the generated CA certificate.
The default value is `CN=Elastic Certificate Tool Autogenerated CA`.
This parameter cannot be used with the `--csr` parameter.
`-E <KeyValuePair>`:: Configures a setting.
`-h, --help`:: Returns all of the command parameters.
`--in <input_file>`:: Specifies the file that is used to run in silent mode. The
input file must be a YAML file, as described in <<certgen-silent>>.
`--key <key_file>`:: Specifies the _private-key_ file for the CA certificate.
This parameter is required whenever the `--cert` parameter is used.
`--keysize <bits>`::
Defines the number of bits that are used in generated RSA keys. The default
value is `2048`.
`--out <output_file>`:: Specifies a path for the output file.
`--pass <password>`:: Specifies the password for the CA private key.
If the `--key` parameter is provided, then this is the password for the existing
private key file. Otherwise, it is the password that should be applied to the
generated CA key. This parameter cannot be used with the `--csr` parameter.
`--p12 <password>`::
Generate a PKCS#12 (`.p12` or `.pfx`) container file for each of the instance
certificates and keys. The generated file is protected by the supplied password,
which can be blank. This parameter cannot be used with the `--csr` parameter.
`-s, --silent`:: Shows minimal output.
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
[float]
[[certgen-silent]]
==== Using `certgen` in Silent Mode
To use the silent mode of operation, you must create a YAML file that contains
information about the instances. It must match the following format:
[source, yaml]
--------------------------------------------------
instances:
- name: "node1" <1>
ip: <2>
- "192.0.2.1"
dns: <3>
- "node1.mydomain.com"
- name: "node2"
ip:
- "192.0.2.2"
- "198.51.100.1"
- name: "node3"
- name: "node4"
dns:
- "node4.mydomain.com"
- "node4.internal"
- name: "CN=node5,OU=IT,DC=mydomain,DC=com"
filename: "node5" <4>
--------------------------------------------------
<1> The name of the instance. This can be a simple string value or can be a
Distinguished Name (DN). This is the only required field.
<2> An optional array of strings that represent IP Addresses for this instance.
Both IPv4 and IPv6 values are allowed. The values are added as Subject
Alternative Names.
<3> An optional array of strings that represent DNS names for this instance.
The values are added as Subject Alternative Names.
<4> The filename to use for this instance. This name is used as the name of the
directory that contains the instance's files in the output. It is also used in
the names of the files within the directory. This filename should not have an
extension. Note: If the `name` provided for the instance does not represent a
valid filename, then the `filename` field must be present.
When your YAML file is ready, you can use the `certgen` command to generate
certificates or certificate signing requests. Simply use the `--in` parameter to
specify the location of the file. For example:
[source, sh]
--------------------------------------------------
bin/x-pack/certgen --in instances.yml
--------------------------------------------------
This command generates a CA certificate and private key as well as certificates
and private keys for the instances that are listed in the YAML file.

View File

@ -0,0 +1,289 @@
[role="xpack"]
[[certutil]]
== certutil
The `certutil` command simplifies the creation of certificates for use with
Transport Layer Security (TLS) in the Elastic Stack.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/certutil
(
(ca [--ca-dn <name>] [--days <n>] [--pem])
| (cert ([--ca <file_path>] | [--ca-cert <file_path> --ca-key <file_path>])
[--ca-dn <name>] [--ca-pass <password>] [--days <n>]
[--dns <domain_name>] [--in <input_file>] [--ip <ip_addresses>]
[--keep-ca-key] [--multiple] [--name <file_name>] [--pem])
| (csr [--dns <domain_name>] [--in <input_file>] [--ip <ip_addresses>]
[--name <file_name>])
[-E <KeyValuePair>] [--keysize <bits>] [--out <file_path>]
[--pass <password>]
)
[-h, --help] ([-s, --silent] | [-v, --verbose])
--------------------------------------------------
[float]
=== Description
You can specify one of the following modes: `ca`, `cert`, `csr`. The `certutil`
command also supports a silent mode of operation to enable easier batch
operations.
[float]
[[certutil-ca]]
==== CA mode
The `ca` mode generates a new certificate authority (CA). By default, it
produces a single PKCS#12 output file, which holds the CA certificate and the
private key for the CA. If you specify the `--pem` parameter, the command
generates a zip file, which contains the certificate and private key in PEM
format.
You can subsequently use these files as input for the `cert` mode of the command.
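For example, the following sketch generates a CA in PEM format instead of the
default PKCS#12 container; the output file name is illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil ca --pem --out elastic-stack-ca.zip
--------------------------------------------------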
[float]
[[certutil-cert]]
==== CERT mode
The `cert` mode generates X.509 certificates and private keys. By default, it
produces a single certificate and key for use on a single instance.
To generate certificates and keys for multiple instances, specify the
`--multiple` parameter, which prompts you for details about each instance.
Alternatively, you can use the `--in` parameter to specify a YAML file that
contains details about the instances.
An instance is any piece of the Elastic Stack that requires a TLS or SSL
certificate. Depending on your configuration, {es}, Logstash, {kib}, and Beats
might all require a certificate and private key. The minimum required
information for an instance is its name, which is used as the common name for
the certificate. The instance name can be a hostname value or a full
distinguished name. If the instance name would result in an invalid file or
directory name, you must also specify a file name in the `--name` command
parameter or in the `filename` field in an input YAML file.
You can optionally provide IP addresses or DNS names for each instance. If
neither IP addresses nor DNS names are specified, the Elastic Stack products
cannot perform hostname verification and you might need to configure the
`verification_mode` security setting to `certificate` only. For more information
about this setting, see <<security-settings>>.
All certificates that are generated by this command are signed by a CA. You can
provide your own CA with the `--ca` or `--ca-cert` parameters. Otherwise, the
command automatically generates a new CA for you. For more information about
generating a CA, see the <<certutil-ca,CA mode of this command>>.
By default, the `cert` mode produces a single PKCS#12 output file which holds
the instance certificate, the instance private key, and the CA certificate. If
you specify the `--pem` parameter, the command generates PEM formatted
certificates and keys and packages them into a zip file.
If you specify the `--keep-ca-key`, `--multiple` or `--in` parameters,
the command produces a zip file containing the generated certificates and keys.
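For example, assuming a CA that was previously generated in PKCS#12 format, the
following sketch prompts for details about several instances and packages their
certificates and keys into one zip file; the file names are illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil cert --multiple --ca elastic-stack-ca.p12 --out cluster-certs.zip
--------------------------------------------------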
[float]
[[certutil-csr]]
==== CSR mode
The `csr` mode generates certificate signing requests (CSRs) that you can send
to a trusted certificate authority to obtain signed certificates. The signed
certificates must be in PEM or PKCS#12 format to work with {security}.
By default, the command produces a single CSR for a single instance.
To generate CSRs for multiple instances, specify the `--multiple` parameter,
which prompts you for details about each instance. Alternatively, you can use
the `--in` parameter to specify a YAML file that contains details about the
instances.
The `csr` mode produces a single zip file, which contains the CSRs and the
private keys for each instance. Each CSR is provided as a standard PEM
encoding of a PKCS#10 CSR. Each key is provided as a PEM encoding of an RSA
private key.
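For example, the following sketch generates a CSR and private key for a single
instance; the instance name and DNS value are illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil csr --name node1 --dns node1.mydomain.com --out node1-csr.zip
--------------------------------------------------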
[float]
=== Parameters
`ca`:: Specifies to generate a new local certificate authority (CA). This
parameter cannot be used with the `csr` or `cert` parameters.
`cert`:: Specifies to generate new X.509 certificates and keys.
This parameter cannot be used with the `csr` or `ca` parameters.
`csr`:: Specifies to generate certificate signing requests. This parameter
cannot be used with the `ca` or `cert` parameters.
`--ca <file_path>`:: Specifies the path to an existing CA key pair
(in PKCS#12 format). This parameter cannot be used with the `ca` or `csr` parameters.
`--ca-cert <file_path>`:: Specifies the path to an existing CA certificate (in
PEM format). You must also specify the `--ca-key` parameter. The `--ca-cert`
parameter cannot be used with the `ca` or `csr` parameters.
`--ca-dn <name>`:: Defines the _Distinguished Name_ (DN) that is used for the
generated CA certificate. The default value is
`CN=Elastic Certificate Tool Autogenerated CA`. This parameter cannot be used
with the `csr` parameter.
`--ca-key <file_path>`:: Specifies the path to an existing CA private key (in
PEM format). You must also specify the `--ca-cert` parameter. The `--ca-key`
parameter cannot be used with the `ca` or `csr` parameters.
`--ca-pass <password>`:: Specifies the password for an existing CA private key
or the generated CA private key. This parameter cannot be used with the `ca` or
`csr` parameters.
`--days <n>`:: Specifies an integer value that represents the number of days the
generated certificates are valid. The default value is `1095`. This parameter
cannot be used with the `csr` parameter.
`--dns <domain_name>`:: Specifies a comma-separated list of DNS names. This
parameter cannot be used with the `ca` parameter.
`-E <KeyValuePair>`:: Configures a setting.
`-h, --help`:: Returns all of the command parameters.
`--in <input_file>`:: Specifies the file that is used to run in silent mode. The
input file must be a YAML file. This parameter cannot be used with the `ca`
parameter.
`--ip <IP_addresses>`:: Specifies a comma-separated list of IP addresses. This
parameter cannot be used with the `ca` parameter.
`--keep-ca-key`:: When running in `cert` mode with an automatically-generated
CA, specifies to retain the CA private key for future use.
`--keysize <bits>`::
Defines the number of bits that are used in generated RSA keys. The default
value is `2048`.
`--multiple`::
Specifies to generate files for multiple instances. This parameter cannot be
used with the `ca` parameter.
`--name <file_name>`::
Specifies the name of the generated certificate. This parameter cannot be used
with the `ca` parameter.
`--out <file_path>`:: Specifies a path for the output files.
`--pass <password>`:: Specifies the password for the generated private keys.
+
Keys stored in PKCS#12 format are always password protected.
+
Keys stored in PEM format are password protected only if the
`--pass` parameter is specified. If you do not supply an argument for the
`--pass` parameter, you are prompted for a password.
+
If you want to specify a _blank_ password (without prompting), use
`--pass ""` (with no `=`).
`--pem`:: Generates certificates and keys in PEM format instead of PKCS#12. This
parameter cannot be used with the `csr` parameter.
`-s, --silent`:: Shows minimal output.
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
The following command generates a CA certificate and private key in PKCS#12
format:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil ca
--------------------------------------------------
You are prompted for an output filename and a password. Alternatively, you can
specify the `--out` and `--pass` parameters.
You can then generate X.509 certificates and private keys by using the new
CA. For example:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil cert --ca elastic-stack-ca.p12
--------------------------------------------------
You are prompted for the CA password and for an output filename and password.
Alternatively, you can specify the `--ca-pass`, `--out`, and `--pass` parameters.
By default, this command generates a file called `elastic-certificates.p12`,
which you can copy to the relevant configuration directory for each Elastic
product that you want to configure. For more information, see
{xpack-ref}/ssl-tls.html[Setting Up TLS on a Cluster].
[float]
[[certutil-silent]]
==== Using `certutil` in Silent Mode
To use the silent mode of operation, you must create a YAML file that contains
information about the instances. It must match the following format:
[source, yaml]
--------------------------------------------------
instances:
- name: "node1" <1>
ip: <2>
- "192.0.2.1"
dns: <3>
- "node1.mydomain.com"
- name: "node2"
ip:
- "192.0.2.2"
- "198.51.100.1"
- name: "node3"
- name: "node4"
dns:
- "node4.mydomain.com"
- "node4.internal"
- name: "CN=node5,OU=IT,DC=mydomain,DC=com"
filename: "node5" <4>
--------------------------------------------------
<1> The name of the instance. This can be a simple string value or can be a
Distinguished Name (DN). This is the only required field.
<2> An optional array of strings that represent IP Addresses for this instance.
Both IPv4 and IPv6 values are allowed. The values are added as Subject
Alternative Names.
<3> An optional array of strings that represent DNS names for this instance.
The values are added as Subject Alternative Names.
<4> The filename to use for this instance. This name is used as the name of the
directory that contains the instance's files in the output. It is also used in
the names of the files within the directory. This filename should not have an
extension. Note: If the `name` provided for the instance does not represent a
valid filename, then the `filename` field must be present.
When your YAML file is ready, you can use the `certutil` command to generate
certificates or certificate signing requests. Simply use the `--in` parameter to
specify the location of the file. For example:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil cert --silent --in instances.yml --out test1.zip --pass testpassword
--------------------------------------------------
This command generates a compressed `test1.zip` file. After you decompress the
output file, there is a directory for each instance that was listed in the
`instances.yml` file. Each instance directory contains a single PKCS#12 (`.p12`)
file, which contains the instance certificate, instance private key, and CA
certificate.
You can also use the YAML file to generate certificate signing requests. For
example:
[source, sh]
--------------------------------------------------
bin/x-pack/certutil csr --silent --in instances.yml --out test2.zip --pass testpassword
--------------------------------------------------
This command generates a compressed file, which contains a directory for each
instance. Each instance directory contains a certificate signing request
(`*.csr` file) and private key (`*.key` file).

View File

@ -0,0 +1,26 @@
[role="xpack"]
[[xpack-commands]]
= {xpack} Commands
[partintro]
--
{xpack} includes commands that help you configure security:
* <<certgen>>
* <<certutil>>
* <<migrate-tool>>
* <<saml-metadata>>
* <<setup-passwords>>
* <<syskeygen>>
* <<users-command>>
--
include::certgen.asciidoc[]
include::certutil.asciidoc[]
include::migrate-tool.asciidoc[]
include::saml-metadata.asciidoc[]
include::setup-passwords.asciidoc[]
include::syskeygen.asciidoc[]
include::users-command.asciidoc[]

View File

@ -0,0 +1,109 @@
[role="xpack"]
[[migrate-tool]]
== migrate
The `migrate` command migrates existing file-based users and roles to the native
realm. From 5.0 onward, you should use the `native` realm to manage roles and
local users.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/migrate
(native (-U, --url <url>)
[-h, --help] [-E <KeyValuePair>]
[-n, --users <uids>] [-r, --roles <roles>]
[-u, --username <uid>] [-p, --password <password>]
[-s, --silent] [-v, --verbose])
--------------------------------------------------
[float]
=== Description
NOTE: When migrating from Shield 2.x, the `migrate` tool should be run prior
to upgrading to ensure all roles can be migrated as some may be in a deprecated
format that {xpack} cannot read. The `migrate` tool is available in Shield
2.4.0 and higher.
The `migrate` tool loads the existing file-based users and roles and calls the
user and roles APIs to add them to the native realm. You can migrate all users
and roles, or specify the ones you want to migrate. Users and roles that
already exist in the `native` realm are not replaced or overridden. If
the names you specify with the `--users` and `--roles` options don't
exist in the `file` realm, they are skipped.
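For example, the following sketch migrates all file-based users and roles
because the `--users` and `--roles` parameters are omitted; the URL and
credentials are illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/migrate native -U http://localhost:9200 -u elastic -p x-pack-test-password
--------------------------------------------------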
[float]
[[migrate-tool-options]]
=== Parameters
The `native` subcommand supports the following options:
`-E <KeyValuePair>`::
Configures a setting.
`-h, --help`::
Returns all of the command parameters.
`-n`, `--users`::
Comma-separated list of the users that you want to migrate. If this parameter is
not specified, all users are migrated.
`-p`, `--password`::
Password to use for authentication with {es}.
//TBD: What is the default if this isn't specified?
`-r`, `--roles`::
Comma-separated list of the roles that you want to migrate. If this parameter is
not specified, all roles are migrated.
`-s, --silent`:: Shows minimal output.
`-U`, `--url`::
Endpoint URL of the {es} cluster to which you want to migrate the
file-based users and roles. This parameter is required.
`-u`, `--username`::
Username to use for authentication with {es}.
//TBD: What is the default if this isn't specified?
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
Run the migrate tool when {xpack} is installed. For example:
[source, sh]
----------------------------------------------------------------------
$ bin/x-pack/migrate native -U http://localhost:9200 -u elastic
-p x-pack-test-password -n lee,foo -r role1,role2,role3,role4,foo
starting migration of users and roles...
importing users from [/home/es/config/shield/users]...
found existing users: [test_user, joe3, joe2]
migrating user [lee]
{"user":{"created":true}}
no user [foo] found, skipping
importing roles from [/home/es/config/shield/roles.yml]...
found existing roles: [marvel_user, role_query_fields, admin_role, role3, admin,
remote_marvel_agent, power_user, role_new_format_name_array, role_run_as,
logstash, role_fields, role_run_as1, role_new_format, kibana4_server, user,
transport_client, role1.ab, role_query]
migrating role [role1]
{"role":{"created":true}}
migrating role [role2]
{"role":{"created":true}}
role [role3] already exists, skipping
no role [foo] found, skipping
users and roles imported.
----------------------------------------------------------------------
Additionally, the `-E` flag can be used to specify additional settings. For example
to specify a different configuration directory, the command would look like:
[source, sh]
----------------------------------------------------------------------
$ bin/x-pack/migrate native -U http://localhost:9200 -u elastic
-p x-pack-test-password -E path.conf=/etc/elasticsearch
----------------------------------------------------------------------

View File

@ -0,0 +1,132 @@
[role="xpack"]
[[saml-metadata]]
== saml-metadata
The `saml-metadata` command can be used to generate a SAML 2.0 Service Provider
Metadata file.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/saml-metadata
[--realm <name>]
[--out <file_path>] [--batch]
[--attribute <name>] [--service-name <name>]
[--locale <name>] [--contacts]
([--organisation-name <name>] [--organisation-display-name <name>] [--organisation-url <url>])
([--signing-bundle <file_path>] | [--signing-cert <file_path>][--signing-key <file_path>])
[--signing-key-password <password>]
[-E <KeyValuePair>]
[-h, --help] ([-s, --silent] | [-v, --verbose])
--------------------------------------------------
[float]
=== Description
The SAML 2.0 specification provides a mechanism for Service Providers to
describe their capabilities and configuration using a _metadata file_.
The `saml-metadata` command generates such a file, based on the configuration of
a SAML realm in {es}.
Some SAML Identity Providers will allow you to automatically import a metadata
file when you configure the Elastic Stack as a Service Provider.
You can optionally select to digitally sign the metadata file in order to
ensure its integrity and authenticity before sharing it with the Identity Provider.
The key that is used to sign the metadata file does not need to be the same as
the keys that the SAML realm configuration already uses for message signing.
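For example, the following sketch signs the generated metadata file with an
existing certificate and key in PEM format; the file names are illustrative:
[source, sh]
--------------------------------------------------
bin/x-pack/saml-metadata --realm saml1 --signing-cert saml-sign.crt --signing-key saml-sign.key
--------------------------------------------------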
[float]
=== Parameters
`--attribute <name>`:: Specifies a SAML attribute that should be
included as a `<RequestedAttribute>` element in the metadata. Any attribute
configured in the {es} realm is automatically included and does not need to be
specified as a commandline option.
`--batch`:: Do not prompt for user input.
`--contacts`:: Specifies that the metadata should include one or more
`<ContactPerson>` elements. The user will be prompted to enter the details for
each person.
`-E <KeyValuePair>`:: Configures an {es} setting.
`-h, --help`:: Returns all of the command parameters.
`--locale <name>`:: Specifies the locale to use for metadata elements such as
`<ServiceName>`. Defaults to the JVM's default system locale.
`--organisation-display-name <name>`:: Specifies the value of the
`<OrganizationDisplayName>` element.
Only valid if `--organisation-name` is also specified.
`--organisation-name <name>`:: Specifies that an `<Organization>` element should
be included in the metadata and provides the value for the `<OrganizationName>`.
If this is specified, then `--organisation-url` must also be specified.
`--organisation-url <url>`:: Specifies the value of the `<OrganizationURL>`
element. This is required if `--organisation-name` is specified.
`--out <file_path>`:: Specifies a path for the output files.
Defaults to `saml-elasticsearch-metadata.xml`.
`--service-name <name>`:: Specifies the value for the `<ServiceName>` element in
the metadata. Defaults to `elasticsearch`.
`--signing-bundle <file_path>`:: Specifies the path to an existing key pair
(in PKCS#12 format). The private key of that key pair will be used to sign
the metadata file.
`--signing-cert <file_path>`:: Specifies the path to an existing certificate (in
PEM format) to be used for signing of the metadata file. You must also specify
the `--signing-key` parameter. This parameter cannot be used with the
`--signing-bundle` parameter.
`--signing-key <file_path>`:: Specifies the path to an existing key (in PEM format)
to be used for signing of the metadata file. You must also specify the
`--signing-cert` parameter. This parameter cannot be used with the
`--signing-bundle` parameter.
`--signing-key-password <password>`:: Specifies the password for the signing key.
It can be used with either the `--signing-key` or the `--signing-bundle` parameters.
`--realm <name>`:: Specifies the name of the realm for which the metadata
should be generated. This parameter is required if there is more than 1 `saml`
realm in your {es} configuration.
`-s, --silent`:: Shows minimal output.
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
The following command generates a default metadata file for the `saml1` realm:
[source, sh]
--------------------------------------------------
bin/x-pack/saml-metadata --realm saml1
--------------------------------------------------
The file will be written to `saml-elasticsearch-metadata.xml`.
You may be prompted to provide the "friendlyName" value for any attributes that
are used by the realm.
The following command generates a metadata file for the `saml2` realm, with a
`<ServiceName>` of `kibana-finance`, a locale of `en-GB` and includes
`<ContactPerson>` elements and an `<Organization>` element:
[source, sh]
--------------------------------------------------
bin/x-pack/saml-metadata --realm saml2 \
--service-name kibana-finance \
--locale en-GB \
--contacts \
--organisation-name "Mega Corp. Finance Team" \
--organisation-url "http://mega.example.com/finance/"
--------------------------------------------------

View File

@ -0,0 +1,72 @@
[role="xpack"]
[[setup-passwords]]
== setup-passwords
The `setup-passwords` command sets the passwords for the built-in `elastic`,
`kibana`, `logstash_system`, and `beats_system` users.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/setup-passwords auto|interactive
[-b, --batch] [-h, --help] [-E <KeyValuePair>]
[-s, --silent] [-u, --url "<URL>"] [-v, --verbose]
--------------------------------------------------
[float]
=== Description
This command is intended for use only during the initial configuration of
{xpack}. It uses the
{xpack-ref}/setting-up-authentication.html#bootstrap-elastic-passwords[`elastic` bootstrap password]
to run user management API requests. After you set a password for the `elastic`
user, the bootstrap password is no longer active and you cannot use this command.
Instead, you can change passwords by using the *Management > Users* UI in {kib}
or the <<security-api-change-password,Change Password API>>.
This command uses an HTTP connection to connect to the cluster and run the user
management requests. If your cluster uses TLS/SSL on the HTTP layer, the command
automatically attempts to establish the connection by using the HTTPS protocol.
It configures the connection by using the `xpack.security.http.ssl` settings in
the `elasticsearch.yml` file. If you do not use the default config directory
location, ensure that the *ES_PATH_CONF* environment variable returns the
correct path before you run the `setup-passwords` command. You can override
settings in your `elasticsearch.yml` file by using the `-E` command option.
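For example, the following sketch points the tool at a non-default
configuration directory before generating random passwords; the path is
illustrative:
[source,shell]
--------------------------------------------------
ES_PATH_CONF=/etc/elasticsearch bin/x-pack/setup-passwords auto
--------------------------------------------------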
For more information about debugging connection failures, see
{xpack-ref}/trb-security-setup.html[Setup-passwords command fails due to connection failure].
[float]
=== Parameters
`auto`:: Outputs randomly-generated passwords to the console.
`-b, --batch`:: If enabled, runs the change password process without prompting the
user.
`-E <KeyValuePair>`:: Configures a standard {es} or {xpack} setting.
`-h, --help`:: Shows help information.
`interactive`:: Prompts you to manually enter passwords.
`-s, --silent`:: Shows minimal output.
`-u, --url "<URL>"`:: Specifies the URL that the tool uses to submit the user management API
requests. The default value is determined from the settings in your
`elasticsearch.yml` file. If `xpack.security.http.ssl.enabled` is set to `true`,
you must specify an HTTPS URL.
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
The following example uses the `-u` parameter to tell the tool where to submit
its user management API requests:
[source,shell]
--------------------------------------------------
bin/x-pack/setup-passwords auto -u "http://localhost:9201"
--------------------------------------------------

View File

@ -0,0 +1,50 @@
[role="xpack"]
[[syskeygen]]
== syskeygen
The `syskeygen` command creates a system key file in `CONFIG_DIR/x-pack`.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/syskeygen
[-E <KeyValuePair>] [-h, --help]
([-s, --silent] | [-v, --verbose])
--------------------------------------------------
[float]
=== Description
The command generates a `system_key` file, which you can use to symmetrically
encrypt sensitive data. For example, you can use this key to prevent {watcher}
from returning and storing information that contains clear text credentials. See {xpack-ref}/encrypting-data.html[Encrypting sensitive data in {watcher}].
IMPORTANT: The system key is a symmetric key, so the same key must be used on
every node in the cluster.
[float]
=== Parameters
`-E <KeyValuePair>`:: Configures a setting. For example, if you have a custom
installation of {es}, you can use this parameter to specify the `ES_PATH_CONF`
environment variable.
`-h, --help`:: Returns all of the command parameters.
`-s, --silent`:: Shows minimal output.
`-v, --verbose`:: Shows verbose output.
[float]
=== Examples
The following command generates a `system_key` file in the
default `$ES_HOME/config/x-pack` directory:
[source, sh]
--------------------------------------------------
bin/x-pack/syskeygen
--------------------------------------------------

View File

@ -0,0 +1,138 @@
[role="xpack"]
[[users-command]]
== Users Command
++++
<titleabbrev>users</titleabbrev>
++++
If you use file-based user authentication, the `users` command enables you to
add and remove users, assign user roles, and manage passwords.
[float]
=== Synopsis
[source,shell]
--------------------------------------------------
bin/x-pack/users
([useradd <username>] [-p <password>] [-r <roles>]) |
([list] <username>) |
([passwd <username>] [-p <password>]) |
([roles <username>] [-a <roles>] [-r <roles>]) |
([userdel <username>])
--------------------------------------------------
[float]
=== Description
If you use the built-in `file` internal realm, users are defined in local files
on each node in the cluster.
Usernames and roles must be at least 1 and no more than 1024 characters long. They
can contain alphanumeric characters (`a-z`, `A-Z`, `0-9`), spaces, punctuation,
and printable symbols in the
https://en.wikipedia.org/wiki/Basic_Latin_(Unicode_block)[Basic Latin (ASCII) block].
Leading or trailing whitespace is not allowed.
Passwords must be at least 6 characters long.
For more information, see {xpack-ref}/file-realm.html[File-based User Authentication].
TIP: To ensure that {es} can read the user and role information at startup, run
`users useradd` as the same user you use to run {es}. Running the command as
root or some other user updates the permissions for the `users` and `users_roles`
files and prevents {es} from accessing them.
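For example, assuming {es} runs as an `elasticsearch` system user (an assumption
about your installation), the following sketch adds a user without disturbing
the file permissions:
[source,shell]
--------------------------------------------------
sudo -u elasticsearch bin/x-pack/users useradd jacknich -r monitoring
--------------------------------------------------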
[float]
=== Parameters
`-a <roles>`:: If used with the `roles` parameter, adds a comma-separated list
of roles to a user.
//`-h, --help`:: Returns all of the command parameters.
`list`:: Lists the users that are registered with the `file` realm
on the local node. If you also specify a user name, the command provides
information for that user.
`-p <password>`:: Specifies the user's password. If you do not specify this
parameter, the command prompts you for the password.
+
--
TIP: Omit the `-p` option to keep
plaintext passwords out of the terminal session's command history.
--
`passwd <username>`:: Resets a user's password. You can specify the new
password directly with the `-p` parameter.
`-r <roles>`::
* If used with the `useradd` parameter, defines a user's roles. This option
accepts a comma-separated list of role names to assign to the user.
* If used with the `roles` parameter, removes a comma-separated list of roles
from a user.
`roles`:: Manages the roles of a particular user. You can combine adding and
removing roles within the same command to change a user's roles.
//`-s, --silent`:: Shows minimal output.
`useradd <username>`:: Adds a user to your local node.
`userdel <username>`:: Deletes a user from your local node.
//`-v, --verbose`:: Shows verbose output.
//[float]
//=== Authorization
[float]
=== Examples
The following example adds a new user named `jacknich` to the `file` realm. The
password for this user is `theshining`, and this user is associated with the
`network` and `monitoring` roles.
[source,shell]
-------------------------------------------------------------------
bin/x-pack/users useradd jacknich -p theshining -r network,monitoring
-------------------------------------------------------------------
The following example lists the users that are registered with the `file` realm
on the local node:
[source, shell]
----------------------------------
bin/x-pack/users list
rdeniro : admin
alpacino : power_user
jacknich : monitoring,network
----------------------------------
Users are in the left-hand column and their corresponding roles are listed in
the right-hand column.
The following example resets the `jacknich` user's password:
[source,shell]
--------------------------------------------------
bin/x-pack/users passwd jacknich
--------------------------------------------------
Since the `-p` parameter was omitted, the command prompts you to enter and
confirm a password in interactive mode.
The following example removes the `network` and `monitoring` roles from the
`jacknich` user and adds the `user` role:
[source,shell]
------------------------------------------------------------
bin/x-pack/users roles jacknich -r network,monitoring -a user
------------------------------------------------------------
The following example deletes the `jacknich` user:
[source,shell]
--------------------------------------------------
bin/x-pack/users userdel jacknich
--------------------------------------------------

View File

@ -0,0 +1,38 @@
include::{es-repo-dir}/index-shared1.asciidoc[]
:edit_url!:
include::setup/setup-xes.asciidoc[]
:edit_url:
include::{es-repo-dir}/index-shared2.asciidoc[]
:edit_url!:
include::release-notes/xpack-breaking.asciidoc[]
:edit_url:
include::{es-repo-dir}/index-shared3.asciidoc[]
:edit_url!:
include::sql/index.asciidoc[]
:edit_url!:
include::monitoring/index.asciidoc[]
:edit_url!:
include::rollup/index.asciidoc[]
:edit_url!:
include::rest-api/index.asciidoc[]
:edit_url!:
include::commands/index.asciidoc[]
:edit_url:
include::{es-repo-dir}/index-shared4.asciidoc[]
:edit_url!:
include::release-notes/xpack-xes.asciidoc[]
:edit_url:
include::{es-repo-dir}/index-shared5.asciidoc[]

View File

@ -0,0 +1,183 @@
[[ml-configuring-aggregation]]
=== Aggregating Data For Faster Performance
By default, {dfeeds} fetch data from {es} using search and scroll requests.
It can be significantly more efficient, however, to aggregate data in {es}
and to configure your jobs to analyze aggregated data.
One of the benefits of aggregating data this way is that {es} automatically
distributes these calculations across your cluster. You can then feed this
aggregated data into {xpackml} instead of raw results, which
reduces the volume of data that must be considered while detecting anomalies.
There are some limitations to using aggregations in {dfeeds}, however.
Your aggregation must include a buckets aggregation, which in turn must contain
a date histogram aggregation. This requirement ensures that the aggregated
data is a time series. If you use a terms aggregation and the cardinality of a
term is high, then the aggregation might not be effective and you might want
to just use the default search and scroll behavior.
When you create or update a job, you can include the names of aggregations, for
example:
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/farequote
{
"analysis_config": {
"bucket_span": "60m",
"detectors": [{
"function":"mean",
"field_name":"responsetime",
"by_field_name":"airline"
}],
"summary_count_field_name": "doc_count"
},
"data_description": {
"time_field":"time"
}
}
----------------------------------
In this example, the `airline`, `responsetime`, and `time` fields are
provided to the job by aggregations that are defined in the {dfeed}.
NOTE: When the `summary_count_field_name` property is set to a non-null value,
the job expects to receive aggregated input. The property must be set to the
name of the field that contains the count of raw data points that have been
aggregated. It applies to all detectors in the job.
The aggregations are defined in the {dfeed} as follows:
[source,js]
----------------------------------
PUT _xpack/ml/datafeeds/datafeed-farequote
{
"job_id":"farequote",
"indices": ["farequote"],
"types": ["response"],
"aggregations": {
"buckets": {
"date_histogram": {
"field": "time",
"interval": "360s",
"time_zone": "UTC"
},
"aggregations": {
"time": {
"max": {"field": "time"}
},
"airline": {
"terms": {
"field": "airline",
"size": 100
},
"aggregations": {
"responsetime": {
"avg": {
"field": "responsetime"
}
}
}
}
}
}
}
}
----------------------------------
In this example, the aggregations have names that match the fields that they
operate on. That is to say, the `max` aggregation is named `time` and its
field is also `time`. The same is true for the aggregations with the names
`airline` and `responsetime`. Since you must create the job before you can
create the {dfeed}, synchronizing your aggregation and field names can simplify
these configuration steps.
IMPORTANT: If you use a `max` aggregation on a time field, the aggregation name
in the {dfeed} must match the name of the time field, as in the previous example.
For all other aggregations, if the aggregation name doesn't match the field name,
there are limitations in the drill-down functionality within the {ml} page in
{kib}.
When you define an aggregation in a {dfeed}, it must have the following form:
[source,js]
----------------------------------
"aggregations" : {
"buckets" : {
"date_histogram" : {
"time_zone": "UTC", ...
},
"aggregations": {
"<time_field>": {
"max": {
"field":"<time_field>"
}
}
[,"<first_term>": {
"terms":{...
}
[,"aggregations" : {
[<sub_aggregation>]+
} ]
}]
}
}
}
----------------------------------
You must specify `buckets` as the aggregation name and `date_histogram` as the
aggregation type. For more information, see
{ref}/search-aggregations-bucket-datehistogram-aggregation.html[Date Histogram Aggregation].
NOTE: The `time_zone` parameter in the date histogram aggregation must be set to `UTC`,
which is the default value.
Each histogram bucket has a key, which is the bucket start time. This key cannot
be used for aggregations in {dfeeds}, however, because they need to know the
time of the latest record within a bucket. Otherwise, when you restart a {dfeed},
it continues from the start time of the histogram bucket and possibly fetches
the same data twice. The max aggregation for the time field is therefore
necessary to provide the time of the latest record within a bucket.
You can optionally specify a terms aggregation, which creates buckets for
different values of a field.
IMPORTANT: If you use a terms aggregation, by default it returns buckets for
the top ten terms. Thus if the cardinality of the term is greater than 10, not
all terms are analyzed.
You can change this behavior by setting the `size` parameter. To
determine the cardinality of your data, you can run searches such as:
[source,js]
--------------------------------------------------
GET .../_search {
"aggs": {
"service_cardinality": {
"cardinality": {
"field": "service"
}
}
}
}
--------------------------------------------------
By default, {es} limits the maximum number of terms returned to 10000. For high
cardinality fields, the query might not run. It might return errors related to
circuit breaking exceptions that indicate that the data is too large. In such
cases, do not use aggregations in your {dfeed}. For more
information, see {ref}/search-aggregations-bucket-terms-aggregation.html[Terms Aggregation].
You can also optionally specify multiple sub-aggregations.
The sub-aggregations are aggregated for the buckets that were created by their
parent aggregation. For more information, see
{ref}/search-aggregations.html[Aggregations].
TIP: If your detectors use metric or sum analytical functions, set the
`interval` of the date histogram aggregation to a tenth of the `bucket_span`
that was defined in the job. This suggestion creates finer, more granular time
buckets, which are ideal for this type of analysis. If your detectors use count
or rare functions, set `interval` to the same value as `bucket_span`. For more
information about analytical functions, see <<ml-functions>>.

View File

@ -0,0 +1,29 @@
[float]
[[ml-analyzing]]
=== Analyzing the Past and Present
The {xpackml} features automate the analysis of time-series data by creating
accurate baselines of normal behavior in the data and identifying anomalous
patterns in that data. You can submit your data for analysis in batches or
continuously in real-time {dfeeds}.
Using proprietary {ml} algorithms, the following circumstances are detected,
scored, and linked with statistically significant influencers in the data:
* Anomalies related to temporal deviations in values, counts, or frequencies
* Statistical rarity
* Unusual behaviors for a member of a population
Automated periodicity detection and quick adaptation to changing data ensure
that you don't need to specify algorithms, models, or other data science-related
configurations in order to get the benefits of {ml}.
You can view the {ml} results in {kib} where, for example, charts illustrate the
actual data values, the bounds for the expected values, and the anomalies that
occur outside these bounds.
[role="screenshot"]
image::images/ml-gs-job-analysis.jpg["Example screenshot from the Machine Learning Single Metric Viewer in Kibana"]
For a more detailed walk-through of {xpackml} features, see
<<ml-getting-started>>.

View File

@ -0,0 +1,91 @@
[[ml-api-quickref]]
== API Quick Reference
All {ml} endpoints have the following base:
[source,js]
----
/_xpack/ml/
----
The main {ml} resources can be accessed with a variety of endpoints:
* <<ml-api-jobs,+/anomaly_detectors/+>>: Create and manage {ml} jobs
* <<ml-api-datafeeds,+/datafeeds/+>>: Select data from {es} to be analyzed
* <<ml-api-results,+/results/+>>: Access the results of a {ml} job
* <<ml-api-snapshots,+/model_snapshots/+>>: Manage model snapshots
//* <<ml-api-validate,+/validate/+>>: Validate subsections of job configurations
[float]
[[ml-api-jobs]]
=== /anomaly_detectors/
* {ref}/ml-put-job.html[PUT /anomaly_detectors/<job_id+++>+++]: Create a job
* {ref}/ml-open-job.html[POST /anomaly_detectors/<job_id>/_open]: Open a job
* {ref}/ml-post-data.html[POST /anomaly_detectors/<job_id>/_data]: Send data to a job
* {ref}/ml-get-job.html[GET /anomaly_detectors]: List jobs
* {ref}/ml-get-job.html[GET /anomaly_detectors/<job_id+++>+++]: Get job details
* {ref}/ml-get-job-stats.html[GET /anomaly_detectors/<job_id>/_stats]: Get job statistics
* {ref}/ml-update-job.html[POST /anomaly_detectors/<job_id>/_update]: Update certain properties of the job configuration
* {ref}/ml-flush-job.html[POST /anomaly_detectors/<job_id>/_flush]: Force a job to analyze buffered data
* {ref}/ml-forecast.html[POST /anomaly_detectors/<job_id>/_forecast]: Forecast future job behavior
* {ref}/ml-close-job.html[POST /anomaly_detectors/<job_id>/_close]: Close a job
* {ref}/ml-delete-job.html[DELETE /anomaly_detectors/<job_id+++>+++]: Delete a job
[float]
[[ml-api-calendars]]
=== /calendars/
* {ref}/ml-put-calendar.html[PUT /calendars/<calendar_id+++>+++]: Create a calendar
* {ref}/ml-post-calendar-event.html[POST /calendars/<calendar_id+++>+++/events]: Add a scheduled event to a calendar
* {ref}/ml-put-calendar-job.html[PUT /calendars/<calendar_id+++>+++/jobs/<job_id+++>+++]: Associate a job with a calendar
* {ref}/ml-get-calendar.html[GET /calendars/<calendar_id+++>+++]: Get calendar details
* {ref}/ml-get-calendar-event.html[GET /calendars/<calendar_id+++>+++/events]: Get scheduled event details
* {ref}/ml-delete-calendar-event.html[DELETE /calendars/<calendar_id+++>+++/events/<event_id+++>+++]: Remove a scheduled event from a calendar
* {ref}/ml-delete-calendar-job.html[DELETE /calendars/<calendar_id+++>+++/jobs/<job_id+++>+++]: Disassociate a job from a calendar
* {ref}/ml-delete-calendar.html[DELETE /calendars/<calendar_id+++>+++]: Delete a calendar
[float]
[[ml-api-datafeeds]]
=== /datafeeds/
* {ref}/ml-put-datafeed.html[PUT /datafeeds/<datafeed_id+++>+++]: Create a {dfeed}
* {ref}/ml-start-datafeed.html[POST /datafeeds/<datafeed_id>/_start]: Start a {dfeed}
* {ref}/ml-get-datafeed.html[GET /datafeeds]: List {dfeeds}
* {ref}/ml-get-datafeed.html[GET /datafeeds/<datafeed_id+++>+++]: Get {dfeed} details
* {ref}/ml-get-datafeed-stats.html[GET /datafeeds/<datafeed_id>/_stats]: Get statistical information for {dfeeds}
* {ref}/ml-preview-datafeed.html[GET /datafeeds/<datafeed_id>/_preview]: Get a preview of a {dfeed}
* {ref}/ml-update-datafeed.html[POST /datafeeds/<datafeed_id>/_update]: Update certain settings for a {dfeed}
* {ref}/ml-stop-datafeed.html[POST /datafeeds/<datafeed_id>/_stop]: Stop a {dfeed}
* {ref}/ml-delete-datafeed.html[DELETE /datafeeds/<datafeed_id+++>+++]: Delete a {dfeed}
[float]
[[ml-api-results]]
=== /results/
* {ref}/ml-get-bucket.html[GET /results/buckets]: List the buckets in the results
* {ref}/ml-get-bucket.html[GET /results/buckets/<bucket_id+++>+++]: Get bucket details
* {ref}/ml-get-overall-buckets.html[GET /results/overall_buckets]: Get overall bucket results for multiple jobs
* {ref}/ml-get-category.html[GET /results/categories]: List the categories in the results
* {ref}/ml-get-category.html[GET /results/categories/<category_id+++>+++]: Get category details
* {ref}/ml-get-influencer.html[GET /results/influencers]: Get influencer details
* {ref}/ml-get-record.html[GET /results/records]: Get records from the results
[float]
[[ml-api-snapshots]]
=== /model_snapshots/
* {ref}/ml-get-snapshot.html[GET /model_snapshots]: List model snapshots
* {ref}/ml-get-snapshot.html[GET /model_snapshots/<snapshot_id+++>+++]: Get model snapshot details
* {ref}/ml-revert-snapshot.html[POST /model_snapshots/<snapshot_id>/_revert]: Revert a model snapshot
* {ref}/ml-update-snapshot.html[POST /model_snapshots/<snapshot_id>/_update]: Update certain settings for a model snapshot
* {ref}/ml-delete-snapshot.html[DELETE /model_snapshots/<snapshot_id+++>+++]: Delete a model snapshot
////
[float]
[[ml-api-validate]]
=== /validate/
* {ref}/ml-valid-detector.html[POST /anomaly_detectors/_validate/detector]: Validate a detector
* {ref}/ml-valid-job.html[POST /anomaly_detectors/_validate]: Validate a job
////

View File

@ -0,0 +1,9 @@
[float]
[[ml-nodes]]
=== Machine learning nodes
A {ml} node is a node that has `xpack.ml.enabled` and `node.ml` set to `true`,
which is the default behavior. If you set `node.ml` to `false`, the node can
service API requests but it cannot run jobs. If you want to use {xpackml}
features, there must be at least one {ml} node in your cluster. For more
information about this setting, see <<xpack-settings>>.
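For reference, the following is a minimal sketch of how these settings might
appear in `elasticsearch.yml` for a node that should not run {ml} jobs; all
other settings are assumed to keep their defaults:
[source,yaml]
--------------------------------------------------
# Keep the machine learning APIs available on this node...
xpack.ml.enabled: true
# ...but prevent this node from running machine learning jobs.
node.ml: false
--------------------------------------------------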

View File

@ -0,0 +1,26 @@
[[ml-buckets]]
=== Buckets
++++
<titleabbrev>Buckets</titleabbrev>
++++
The {xpackml} features use the concept of a _bucket_ to divide the time series
into batches for processing.
The _bucket span_ is part of the configuration information for a job. It defines
the time interval that is used to summarize and model the data. It is
typically between 5 minutes and 1 hour, depending on the characteristics of
your data.
When you set the bucket span, take into account the granularity at which you
want to analyze, the frequency of the input data, the typical duration of the
anomalies, and the frequency at which alerting is required.
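For example, the following sketch shows where the bucket span is set when you
create a job. The job name and the values are illustrative only, not
recommendations:
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/example-metrics <1>
{
  "analysis_config": {
    "bucket_span": "15m", <2>
    "detectors": [{
      "function": "mean",
      "field_name": "responsetime"
    }]
  },
  "data_description": {
    "time_field": "time"
  }
}
----------------------------------
<1> `example-metrics` is a hypothetical job name.
<2> A 15-minute bucket span; choose a value that suits your data characteristics.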
When you view your {ml} results, each bucket has an anomaly score. This score is
a statistically aggregated and normalized view of the combined anomalousness of
all the record results in the bucket. If you have more than one job, you can
also obtain overall bucket results, which combine and correlate anomalies from
multiple jobs into an overall score. When you view the results for job groups
in {kib}, it provides the overall bucket scores.
For more information, see
{ref}/ml-results-resource.html[Results Resources] and
{ref}/ml-get-overall-buckets.html[Get Overall Buckets API].

View File

@ -0,0 +1,40 @@
[[ml-calendars]]
=== Calendars and Scheduled Events
Sometimes there are periods when you expect unusual activity to take place,
such as bank holidays, "Black Friday", or planned system outages. If you
identify these events in advance, no anomalies are generated during that period.
The {ml} model is not adversely affected and you do not receive spurious results.
You can create calendars and scheduled events in the **Settings** pane on the
**Machine Learning** page in {kib} or by using {ref}/ml-apis.html[{ml} APIs].
A scheduled event must have a start time, end time, and description. In general,
scheduled events are short in duration (typically lasting from a few hours to a
day) and occur infrequently. If you have regularly occurring events, such as
weekly maintenance periods, you do not need to create scheduled events for these
circumstances; they are already handled by the {ml} analytics.
You can identify zero or more scheduled events in a calendar. Jobs can then
subscribe to calendars and the {ml} analytics handle all subsequent scheduled
events appropriately.
If you want to add multiple scheduled events at once, you can import an
iCalendar (`.ics`) file in {kib} or a JSON file in the
{ref}/ml-post-calendar-event.html[add events to calendar API].
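For example, the following sketch adds a single scheduled event to a
hypothetical calendar named `planned-outages`, which is assumed to already
exist and to have jobs assigned to it:
[source,js]
----------------------------------
POST _xpack/ml/calendars/planned-outages/events <1>
{
  "events": [
    {
      "description": "quarterly maintenance window",
      "start_time": 1514160000000, <2>
      "end_time": 1514246400000
    }
  ]
}
----------------------------------
<1> `planned-outages` is a hypothetical calendar ID.
<2> The start and end times are specified in milliseconds since the epoch.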
[NOTE]
--
* You must identify scheduled events before your job analyzes the data for that
time period. Machine learning results are not updated retroactively.
* If your iCalendar file contains recurring events, only the first occurrence is
imported.
* Bucket results are generated during scheduled events but they have an
anomaly score of zero. For more information about bucket results, see
{ref}/ml-results-resource.html[Results Resources].
* If you use long or frequent scheduled events, it might take longer for the
{ml} analytics to learn to model your data and some anomalous behavior might be
missed.
--

View File

@ -0,0 +1,228 @@
[[ml-configuring-categories]]
=== Categorizing log messages
Application log events are often unstructured and contain variable data. For
example:
//Obtained from it_ops_new_app_logs.json
[source,js]
----------------------------------
{"time":1454516381000,"message":"org.jdbi.v2.exceptions.UnableToExecuteStatementException: com.mysql.jdbc.exceptions.MySQLTimeoutException: Statement cancelled due to timeout or client request [statement:\"SELECT id, customer_id, name, force_disabled, enabled FROM customers\"]","type":"logs"}
----------------------------------
//NOTCONSOLE
You can use {ml} to observe the static parts of the message, cluster similar
messages together, and classify them into message categories.
The {ml} model learns what volume and pattern is normal for each category over
time. You can then detect anomalies and surface rare events or unusual types of
messages by using count or rare functions. For example:
//Obtained from it_ops_new_app_logs.sh
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/it_ops_new_logs
{
"description" : "IT Ops Application Logs",
"analysis_config" : {
"categorization_field_name": "message", <1>
"bucket_span":"30m",
"detectors" :[{
"function":"count",
"by_field_name": "mlcategory", <2>
"detector_description": "Unusual message counts"
}],
"categorization_filters":[ "\\[statement:.*\\]"]
},
"analysis_limits":{
"categorization_examples_limit": 5
},
"data_description" : {
"time_field":"time",
"time_format": "epoch_ms"
}
}
----------------------------------
//CONSOLE
<1> The `categorization_field_name` property indicates which field will be
categorized.
<2> The resulting categories are used in a detector by setting `by_field_name`,
`over_field_name`, or `partition_field_name` to the keyword `mlcategory`. If you
do not specify this keyword in one of those properties, the API request fails.
The optional `categorization_examples_limit` property specifies the
maximum number of examples that are stored in memory and in the results data
store for each category. The default value is `4`. Note that this setting does
not affect the categorization; it just affects the list of visible examples. If
you increase this value, more examples are available, but you must have more
storage available. If you set this value to `0`, no examples are stored.
The optional `categorization_filters` property can contain an array of regular
expressions. If a categorization field value matches the regular expression, the
portion of the field that is matched is not taken into consideration when
defining categories. The categorization filters are applied in the order they
are listed in the job configuration, which allows you to disregard multiple
sections of the categorization field value. In this example, we have decided that
we do not want the detailed SQL to be considered in the message categorization.
This particular categorization filter removes the SQL statement from the categorization
algorithm.
If your data is stored in {es}, you can create an advanced job with these same
properties:
[role="screenshot"]
image::images/ml-category-advanced.jpg["Advanced job configuration options related to categorization"]
NOTE: To add the `categorization_examples_limit` property, you must use the
**Edit JSON** tab and copy the `analysis_limits` object from the API example.
[float]
[[ml-configuring-analyzer]]
==== Customizing the Categorization Analyzer
Categorization uses English dictionary words to identify log message categories.
By default, it also uses English tokenization rules. For this reason, if you use
the default categorization analyzer, only English language log messages are
supported, as described in the <<ml-limitations>>.
You can, however, change the tokenization rules by customizing the way the
categorization field values are interpreted. For example:
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/it_ops_new_logs2
{
"description" : "IT Ops Application Logs",
"analysis_config" : {
"categorization_field_name": "message",
"bucket_span":"30m",
"detectors" :[{
"function":"count",
"by_field_name": "mlcategory",
"detector_description": "Unusual message counts"
}],
"categorization_analyzer":{
"char_filter": [
{ "type": "pattern_replace", "pattern": "\\[statement:.*\\]" } <1>
],
"tokenizer": "ml_classic", <2>
"filter": [
{ "type" : "stop", "stopwords": [
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
"Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
"GMT", "UTC"
] } <3>
]
}
},
"analysis_limits":{
"categorization_examples_limit": 5
},
"data_description" : {
"time_field":"time",
"time_format": "epoch_ms"
}
}
----------------------------------
//CONSOLE
<1> The
{ref}/analysis-pattern-replace-charfilter.html[`pattern_replace` character filter]
here achieves exactly the same as the `categorization_filters` in the first
example.
<2> The `ml_classic` tokenizer works like the non-customizable tokenization
that was used for categorization in older versions of machine learning. If you
want the same categorization behavior as older versions, use this property value.
<3> By default, English day or month words are filtered from log messages before
categorization. If your logs are in a different language and contain
dates, you might get better results by filtering the day or month words in your
language.
The optional `categorization_analyzer` property allows even greater customization
of how categorization interprets the categorization field value. It can refer to
a built-in {es} analyzer or a combination of zero or more character filters,
a tokenizer, and zero or more token filters.
The `ml_classic` tokenizer and the day and month stopword filter are more or less
equivalent to the following analyzer, which is defined using only built-in {es}
{ref}/analysis-tokenizers.html[tokenizers] and
{ref}/analysis-tokenfilters.html[token filters]:
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/it_ops_new_logs3
{
"description" : "IT Ops Application Logs",
"analysis_config" : {
"categorization_field_name": "message",
"bucket_span":"30m",
"detectors" :[{
"function":"count",
"by_field_name": "mlcategory",
"detector_description": "Unusual message counts"
}],
"categorization_analyzer":{
"tokenizer": {
"type" : "simple_pattern_split",
"pattern" : "[^-0-9A-Za-z_.]+" <1>
},
"filter": [
{ "type" : "pattern_replace", "pattern": "^[0-9].*" }, <2>
{ "type" : "pattern_replace", "pattern": "^[-0-9A-Fa-f.]+$" }, <3>
{ "type" : "pattern_replace", "pattern": "^[^0-9A-Za-z]+" }, <4>
{ "type" : "pattern_replace", "pattern": "[^0-9A-Za-z]+$" }, <5>
{ "type" : "stop", "stopwords": [
"",
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
"Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
"GMT", "UTC"
] }
]
}
},
"analysis_limits":{
"categorization_examples_limit": 5
},
"data_description" : {
"time_field":"time",
"time_format": "epoch_ms"
}
}
----------------------------------
//CONSOLE
<1> Tokens basically consist of hyphens, digits, letters, underscores and dots.
<2> By default, categorization ignores tokens that begin with a digit.
<3> By default, categorization also ignores tokens that are hexadecimal numbers.
<4> Underscores, hyphens, and dots are removed from the beginning of tokens.
<5> Underscores, hyphens, and dots are also removed from the end of tokens.
The key difference between the default `categorization_analyzer` and this example
analyzer is that using the `ml_classic` tokenizer is several times faster. The
difference in behavior is that this custom analyzer does not include accented
letters in tokens whereas the `ml_classic` tokenizer does, although that could
be fixed by using more complex regular expressions.
For more information about the `categorization_analyzer` property, see
{ref}/ml-job-resource.html#ml-categorizationanalyzer[Categorization Analyzer].
NOTE: To add the `categorization_analyzer` property in {kib}, you must use the
**Edit JSON** tab and copy the `categorization_analyzer` object from one of the
API examples above.
[float]
[[ml-viewing-categories]]
==== Viewing Categorization Results
After you open the job and start the {dfeed} or supply data to the job, you can
view the categorization results in {kib}. For example:
[role="screenshot"]
image::images/ml-category-anomalies.jpg["Categorization example in the Anomaly Explorer"]
For this type of job, the **Anomaly Explorer** contains extra information for
each anomaly: the name of the category (for example, `mlcategory 11`) and
examples of the messages in that category. In this case, you can use these
details to investigate occurrences of unusually high message counts for specific
message categories.

View File

@ -0,0 +1,41 @@
[[ml-configuring]]
== Configuring Machine Learning
If you want to use {xpackml} features, there must be at least one {ml} node in
your cluster and all master-eligible nodes must have {ml} enabled. By default,
all nodes are {ml} nodes. For more information about these settings, see
<<xpack-settings>>.
To use the {xpackml} features to analyze your data, you must create a job and
send your data to that job.
* If your data is stored in {es}:
** You can create a {dfeed}, which retrieves data from {es} for analysis.
** You can use {kib} to expedite the creation of jobs and {dfeeds}.
* If your data is not stored in {es}, you can
{ref}/ml-post-data.html[POST data] from any source directly to an API, as in
the sketch after this list.
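The following is a minimal sketch of posting two records directly to a job;
the body consists of newline-delimited JSON documents whose fields match the
job configuration:
[source,js]
----------------------------------
POST _xpack/ml/anomaly_detectors/example-job/_data <1>
{"metric_value": 42.0, "time": 1454020569000}
{"metric_value": 43.5, "time": 1454020629000}
----------------------------------
<1> The job name `example-job` and the field `metric_value` are hypothetical.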
The results of {ml} analysis are stored in {es} and you can use {kib} to help
you visualize and explore the results.
For a tutorial that walks you through these configuration steps,
see <<ml-getting-started>>.
Though it is quite simple to analyze your data and provide quick {ml} results,
gaining deep insights might require some additional planning and configuration.
The scenarios in this section describe some best practices for generating useful
{ml} results and insights from your data.
* <<ml-configuring-url>>
* <<ml-configuring-aggregation>>
* <<ml-configuring-categories>>
* <<ml-configuring-pop>>
* <<ml-configuring-transform>>
include::customurl.asciidoc[]
include::aggregations.asciidoc[]
include::categories.asciidoc[]
include::populations.asciidoc[]
include::transforms.asciidoc[]

View File

@ -0,0 +1,104 @@
[[ml-configuring-url]]
=== Adding Custom URLs To Machine Learning Results
When you create an advanced job or edit any job in {kib}, you can optionally
attach one or more custom URLs. You can also specify these custom settings when
you create or update jobs by using the {ml} APIs.
The custom URLs provide links from the anomalies table in the Anomaly Explorer
or Single Metric Viewer window in {kib} to custom dashboards or external
websites. For example, you can define a custom URL that provides a way for users
to drill down to the source data from the results set.
For each custom URL, you must supply the URL and a label, which is the link text
that appears in the anomalies table.
[role="screenshot"]
image::images/ml-customurl.jpg["Links in the Anomaly Explorer anomalies table"]
[float]
==== String Substitution in Custom URLs
You can use dollar sign ($) delimited tokens in a custom URL. These tokens are
substituted for the values of the corresponding fields in the anomaly records.
For example, for a configured URL of
`http://my.datastore.com/dashboards?user=$user_name$`, the value of the
`user_name` field in the anomaly record is substituted into the `$user_name$`
token when you click the link in the anomalies table.
NOTE: Not all fields in your source data exist in the anomaly results. If a
field is specified in the detector as the `field_name`, `by_field_name`,
`over_field_name`, or `partition_field_name`, for example, it can be used in a
custom URL. A field that is only used in the `categorization_field_name`
property, however, does not exist in the anomaly results.
The following keywords can also be used as tokens for string substitution in a
custom URL: `$earliest$`; `$latest$`; `$mlcategoryregex$`; `$mlcategoryterms$`.
The `$earliest$` and `$latest$` tokens pass the beginning and end of the time
span of the selected anomaly to the target page. The tokens are substituted with
date-time strings in ISO-8601 format. If you selected an interval of 1 hour for
the anomalies table, these tokens use one hour on either side of the anomaly
time as the earliest and latest times. The same is also true if the interval is
set to `Auto` and a one hour interval was chosen.
The `$mlcategoryregex$` and `$mlcategoryterms$` tokens pertain to jobs where you
are categorizing field values. For more information about this type of analysis,
see <<ml-configuring-categories>>.
The `$mlcategoryregex$` token passes the regular expression value of the
category of the selected anomaly, as identified by the value of the `mlcategory`
field of the anomaly record.
The `$mlcategoryterms$` token likewise passes the terms value of the category of
the selected anomaly. Each categorization term is prefixed by a plus (+)
character, so that when the token is passed to a {kib} dashboard, the resulting
dashboard query seeks a match for all of the terms of the category.
For example, the following API updates a `log_categories` job to add a custom
URL that uses `$earliest$`, `$latest$`, and `$mlcategoryterms$` tokens:
[source,js]
----------------------------------
POST _xpack/ml/anomaly_detectors/log_categories/_update
{
"custom_settings": {
"custom_urls": [
{
"url_name": "test-link1",
"url_value": "http://localhost:5601/app/kibana#/discover?_g=(refreshInterval:(display:Off,pause:!f,value:0),time:(from:'$earliest$',mode:quick,to:'$latest$'))&_a=(columns:!(_source),index:AV3OWB68ue3Ht69t29aw,interval:auto,query:(query_string:(analyze_wildcard:!t,query:'$mlcategoryterms$')),sort:!(time,desc))"
}
]
}
}
----------------------------------
When you click this custom URL in the anomalies table in {kib}, it opens up the
Discover page and displays source data for the period when the anomaly occurred.
Since this job was categorizing log messages, some `$mlcategoryterms$` token
values that were passed to the target page for an example anomaly are as follows:
[role="screenshot"]
image::images/ml-categoryterms.jpg["A query for category terms on the Discover page in {kib}"]
[TIP]
===============================
* The custom URL links in the anomaly tables use pop-ups. You must configure
your web browser so that it does not block pop-up windows, or create an
exception for your {kib} URL.
* When creating a link to a {kib} dashboard, the URLs for dashboards can be very
long. Be careful of typos, end of line characters, and URL encoding. Also ensure
you use the appropriate index ID for the target {kib} index pattern.
* If you use an influencer name for string substitution, keep in mind that it
might not always be available in the analysis results and the URL is invalid in
those cases. There is not always a statistically significant influencer for each
anomaly.
* The dates substituted for `$earliest$` and `$latest$` tokens are in
ISO-8601 format and the target system must understand this format.
* If the job performs an analysis against nested JSON fields, the tokens for
string substitution can refer to these fields using dot notation. For example,
`$cpu.total$`.
* {es} source data mappings might make it difficult for the query string to work.
Test the custom URL before saving the job configuration to check that it works
as expected, particularly when using string substitution.
===============================

View File

@ -0,0 +1,40 @@
[[ml-dfeeds]]
=== {dfeeds-cap}
Machine learning jobs can analyze data that is stored in {es} or data that is
sent from some other source via an API. _{dfeeds-cap}_ retrieve data from {es}
for analysis, which is the simpler and more common scenario.
If you create jobs in {kib}, you must use {dfeeds}. When you create a job, you
select an index pattern and {kib} configures the {dfeed} for you under the
covers. If you use {ml} APIs instead, you can create a {dfeed} by using the
{ref}/ml-put-datafeed.html[create {dfeeds} API] after you create a job. You can
associate only one {dfeed} with each job.
For a description of all the {dfeed} properties, see
{ref}/ml-datafeed-resource.html[Datafeed Resources].
To start retrieving data from {es}, you must start the {dfeed}. When you start
it, you can optionally specify start and end times. If you do not specify an
end time, the {dfeed} runs continuously. You can start and stop {dfeeds} in
{kib} or use the {ref}/ml-start-datafeed.html[start {dfeeds}] and
{ref}/ml-stop-datafeed.html[stop {dfeeds}] APIs. A {dfeed} can be started and
stopped multiple times throughout its lifecycle.
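For example, the following sketch starts a hypothetical {dfeed} named
`datafeed-total-requests` from a specific time and, because no end time is
given, leaves it running in real time:
[source,js]
----------------------------------
POST _xpack/ml/datafeeds/datafeed-total-requests/_start <1>
{
  "start": "2018-04-01T00:00:00Z"
}
----------------------------------
<1> `datafeed-total-requests` is a hypothetical {dfeed} ID.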
[IMPORTANT]
--
When {security} is enabled, a {dfeed} stores the roles of the user who created
or last updated the {dfeed}. If the definitions of those roles change, the
{dfeed} subsequently runs with the new permissions that are associated with
them. However, if the user is assigned different roles after the {dfeed} was
created or updated, the {dfeed} continues to run with the roles that were
stored originally.
One way to update the roles that are stored within the {dfeed} without changing
any other settings is to submit an empty JSON document ({}) to the
{ref}/ml-update-datafeed.html[update {dfeed} API], as shown in the sketch after
this note.
--
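For example, the following sketch refreshes the stored roles for a hypothetical
{dfeed} named `datafeed-total-requests` without changing any of its settings:
[source,js]
----------------------------------
POST _xpack/ml/datafeeds/datafeed-total-requests/_update <1>
{}
----------------------------------
<1> `datafeed-total-requests` is a hypothetical {dfeed} ID.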
If the data that you want to analyze is not stored in {es}, you cannot use
{dfeeds}. You can however send batches of data directly to the job by using the
{ref}/ml-post-data.html[post data to jobs API].

View File

@ -0,0 +1,69 @@
[float]
[[ml-forecasting]]
=== Forecasting the Future
After the {xpackml} features create baselines of normal behavior for your data,
you can use that information to extrapolate future behavior.
You can use a forecast to estimate a time series value at a specific future date.
For example, you might want to determine how many users you can expect to visit
your website next Sunday at 0900.
You can also use it to estimate the probability of a time series value occurring
at a future date. For example, you might want to determine how likely it is that
your disk utilization will reach 100% before the end of next week.
Each forecast has a unique ID, which you can use to distinguish between forecasts
that you created at different times. You can create a forecast by using the
{ref}/ml-forecast.html[Forecast Jobs API] or by using {kib}. For example:
[role="screenshot"]
image::images/ml-gs-job-forecast.jpg["Example screenshot from the Machine Learning Single Metric Viewer in Kibana"]
//For a more detailed walk-through of {xpackml} features, see <<ml-getting-started>>.
The yellow line in the chart represents the predicted data values. The
shaded yellow area represents the bounds for the predicted values, which also
gives an indication of the confidence of the predictions.
When you create a forecast, you specify its _duration_, which indicates how far
the forecast extends beyond the last record that was processed. By default, the
duration is 1 day. Typically the farther into the future that you forecast, the
lower the confidence levels become (that is to say, the bounds increase).
Eventually if the confidence levels are too low, the forecast stops.
You can also optionally specify when the forecast expires. By default, it
expires in 14 days and is deleted automatically thereafter. You can specify a
different expiration period by using the `expires_in` parameter in the
{ref}/ml-forecast.html[Forecast Jobs API].
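For example, the following sketch creates a forecast for a hypothetical job
named `total-requests`, with a ten-day duration and a thirty-day expiration:
[source,js]
----------------------------------
POST _xpack/ml/anomaly_detectors/total-requests/_forecast <1>
{
  "duration": "10d",
  "expires_in": "30d"
}
----------------------------------
<1> `total-requests` is a hypothetical job name.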
//Add examples of forecast_request_stats and forecast documents?
There are some limitations that affect your ability to create a forecast:
* You can generate only three forecasts concurrently. There is no limit to the
number of forecasts that you retain. Existing forecasts are not overwritten when
you create new forecasts. Rather, they are automatically deleted when they expire.
* If you use an `over_field_name` property in your job (that is to say, it's a
_population job_), you cannot create a forecast.
* If you use any of the following analytical functions in your job, you
cannot create a forecast:
** `lat_long`
** `rare` and `freq_rare`
** `time_of_day` and `time_of_week`
+
--
For more information about any of these functions, see <<ml-functions>>.
--
* Forecasts run concurrently with real-time {ml} analysis. That is to say, {ml}
analysis does not stop while forecasts are generated. Forecasts can have an
impact on {ml} jobs, however, especially in terms of memory usage. For this
reason, forecasts run only if the model memory status is acceptable and the
snapshot models for the forecast do not require more than 20 MB. If these memory
limits are reached, consider splitting the job into multiple smaller jobs and
creating forecasts for these.
* The job must be open when you create a forecast. Otherwise, an error occurs.
* If there is insufficient data to generate any meaningful predictions, an
error occurs. In general, forecasts that are created early in the learning phase
of the data analysis are less accurate.

View File

@ -0,0 +1,79 @@
[[ml-functions]]
== Function Reference
The {xpackml} features include analysis functions that provide a wide variety of
flexible ways to analyze data for anomalies.
When you create jobs, you specify one or more detectors, which define the type of
analysis that needs to be done. If you are creating your job by using {ml} APIs,
you specify the functions in
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
If you are creating your job in {kib}, you specify the functions differently
depending on whether you are creating single metric, multi-metric, or advanced
jobs. For a demonstration of creating jobs in {kib}, see <<ml-getting-started>>.
Most functions detect anomalies in both low and high values. In statistical
terminology, they apply a two-sided test. Some functions offer low and high
variations (for example, `count`, `low_count`, and `high_count`). These variations
apply one-sided tests, detecting anomalies only when the values are low or
high, depending on which alternative is used.
//For some functions, you can optionally specify a field name in the
//`by_field_name` property. The analysis then considers whether there is an
//anomaly for one of more specific values of that field. In {kib}, use the
//**Key Fields** field in multi-metric jobs or the **by_field_name** field in
//advanced jobs.
////
TODO: Per Sophie, "This is incorrect... Split Data refers to a partition_field_name. Over fields can only be added in Adv Config...
Can you please remove the explanations for by/over/partition fields from the documentation for analytical functions. It's a complex topic and will be easier to review in a separate exercise."
////
//For some functions, you can also optionally specify a field name in the
//`over_field_name` property. This property shifts the analysis to be population-
//or peer-based and uses the field to split the data. In {kib}, use the
//**Split Data** field in multi-metric jobs or the **over_field_name** field in
//advanced jobs.
//You can specify a `partition_field_name` with any function. The analysis is then
//segmented with completely independent baselines for each value of that field.
//In {kib}, use the **partition_field_name** field in advanced jobs.
You can specify a `summary_count_field_name` with any function except `metric`.
When you use `summary_count_field_name`, the {ml} features expect the input
data to be pre-aggregated. The value of the `summary_count_field_name` field
must contain the count of raw events that were summarized. In {kib}, use the
**summary_count_field_name** in advanced jobs. Analyzing aggregated input data
provides a significant boost in performance. For more information, see
<<ml-configuring-aggregation>>.
If your data is sparse, there may be gaps in the data which means you might have
empty buckets. You might want to treat these as anomalies or you might want these
gaps to be ignored. Your decision depends on your use case and what is important
to you. It also depends on which functions you use. The `sum` and `count`
functions are strongly affected by empty buckets. For this reason, there are
`non_null_sum` and `non_zero_count` functions, which are tolerant to sparse data.
These functions effectively ignore empty buckets.
////
Some functions can benefit from overlapping buckets. This improves the overall
accuracy of the results but at the cost of a 2 bucket delay in seeing the results.
The table below provides a high-level summary of the analytical functions provided by the API. Each of the functions is described in detail over the following pages. Note the examples given in these pages use single Detector Configuration objects.
////
* <<ml-count-functions>>
* <<ml-geo-functions>>
* <<ml-info-functions>>
* <<ml-metric-functions>>
* <<ml-rare-functions>>
* <<ml-sum-functions>>
* <<ml-time-functions>>
include::functions/count.asciidoc[]
include::functions/geo.asciidoc[]
include::functions/info.asciidoc[]
include::functions/metric.asciidoc[]
include::functions/rare.asciidoc[]
include::functions/sum.asciidoc[]
include::functions/time.asciidoc[]

View File

@ -0,0 +1,214 @@
[[ml-count-functions]]
=== Count Functions
Count functions detect anomalies when the number of events in a bucket is
anomalous.
Use `non_zero_count` functions if your data is sparse and you want to ignore
cases where the bucket count is zero.
Use `distinct_count` functions to determine when the number of distinct values
in one field is unusual, as opposed to the total count.
Use high-sided functions if you want to monitor unusually high event rates.
Use low-sided functions if you want to look at drops in event rate.
The {xpackml} features include the following count functions:
* xref:ml-count[`count`, `high_count`, `low_count`]
* xref:ml-nonzero-count[`non_zero_count`, `high_non_zero_count`, `low_non_zero_count`]
* xref:ml-distinct-count[`distinct_count`, `high_distinct_count`, `low_distinct_count`]
[float]
[[ml-count]]
===== Count, High_count, Low_count
The `count` function detects anomalies when the number of events in a bucket is
anomalous.
The `high_count` function detects anomalies when the count of events in a
bucket are unusually high.
The `low_count` function detects anomalies when the count of events in a
bucket are unusually low.
These functions support the following properties:
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties,
see {ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing events with the count function
[source,js]
--------------------------------------------------
{ "function" : "count" }
--------------------------------------------------
This example is probably the simplest possible analysis. It identifies
time buckets during which the overall count of events is higher or lower than
usual.
When you use this function in a detector in your job, it models the event rate
and detects when the event rate is unusual compared to its past behavior.
.Example 2: Analyzing errors with the high_count function
[source,js]
--------------------------------------------------
{
"function" : "high_count",
"by_field_name" : "error_code",
"over_field_name": "user"
}
--------------------------------------------------
If you use this `high_count` function in a detector in your job, it
models the event rate for each error code. It detects users that generate an
unusually high count of error codes compared to other users.
.Example 3: Analyzing status codes with the low_count function
[source,js]
--------------------------------------------------
{
"function" : "low_count",
"by_field_name" : "status_code"
}
--------------------------------------------------
In this example, the function detects when the count of events for a
status code is lower than usual.
When you use this function in a detector in your job, it models the event rate
for each status code and detects when a status code has an unusually low count
compared to its past behavior.
.Example 4: Analyzing aggregated data with the count function
[source,js]
--------------------------------------------------
{
"summary_count_field_name" : "events_per_min",
"detectors" [
{ "function" : "count" }
]
}
--------------------------------------------------
If you are analyzing an aggregated `events_per_min` field, do not use a sum
function (for example, `sum(events_per_min)`). Instead, use the count function
and the `summary_count_field_name` property.
//TO-DO: For more information, see <<aggreggations.asciidoc>>.
[float]
[[ml-nonzero-count]]
===== Non_zero_count, High_non_zero_count, Low_non_zero_count
The `non_zero_count` function detects anomalies when the number of events in a
bucket is anomalous, but it ignores cases where the bucket count is zero. Use
this function if you know your data is sparse or has gaps and the gaps are not
important.
The `high_non_zero_count` function detects anomalies when the number of events
in a bucket is unusually high and it ignores cases where the bucket count is
zero.
The `low_non_zero_count` function detects anomalies when the number of events in
a bucket is unusually low and it ignores cases where the bucket count is zero.
These functions support the following properties:
* `by_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties,
see {ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
For example, if you have the following number of events per bucket:
========================================
1,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,43,31,0,0,0,0,0,0,0,0,0,0,0,0,2,1
========================================
The `non_zero_count` function models only the following data:
========================================
1,22,2,43,31,2,1
========================================
.Example 5: Analyzing signatures with the high_non_zero_count function
[source,js]
--------------------------------------------------
{
"function" : "high_non_zero_count",
"by_field_name" : "signaturename"
}
--------------------------------------------------
If you use this `high_non_zero_count` function in a detector in your job, it
models the count of events for the `signaturename` field. It ignores any buckets
where the count is zero and detects when a `signaturename` value has an
unusually high count of events compared to its past behavior.
NOTE: Population analysis (using an `over_field_name` property value) is not
supported for the `non_zero_count`, `high_non_zero_count`, and
`low_non_zero_count` functions. If you want to do population analysis and your
data is sparse, use the `count` functions, which are optimized for that scenario.
[float]
[[ml-distinct-count]]
===== Distinct_count, High_distinct_count, Low_distinct_count
The `distinct_count` function detects anomalies where the number of distinct
values in one field is unusual.
The `high_distinct_count` function detects unusually high numbers of distinct
values in one field.
The `low_distinct_count` function detects unusually low numbers of distinct
values in one field.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties,
see {ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 6: Analyzing users with the distinct_count function
[source,js]
--------------------------------------------------
{
"function" : "distinct_count",
"field_name" : "user"
}
--------------------------------------------------
This `distinct_count` function detects when a system has an unusual number
of logged in users. When you use this function in a detector in your job, it
models the distinct count of users. It also detects when the distinct number of
users is unusual compared to the past.
.Example 7: Analyzing ports with the high_distinct_count function
[source,js]
--------------------------------------------------
{
"function" : "high_distinct_count",
"field_name" : "dst_port",
"over_field_name": "src_ip"
}
--------------------------------------------------
This example detects instances of port scanning. When you use this function in a
detector in your job, it models the distinct count of ports. It also detects the
`src_ip` values that connect to an unusually high number of different
`dst_port` values compared to other `src_ip` values.

View File

@ -0,0 +1,79 @@
[[ml-geo-functions]]
=== Geographic Functions
The geographic functions detect anomalies in the geographic location of the
input data.
The {xpackml} features include the following geographic function: `lat_long`.
NOTE: You cannot create forecasts for jobs that contain geographic functions.
[float]
[[ml-lat-long]]
==== Lat_long
The `lat_long` function detects anomalies in the geographic location of the
input data.
This function supports the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties,
see {ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing transactions with the lat_long function
[source,js]
--------------------------------------------------
{
"function" : "lat_long",
"field_name" : "transactionCoordinates",
"by_field_name" : "creditCardNumber"
}
--------------------------------------------------
If you use this `lat_long` function in a detector in your job, it
detects anomalies where the geographic location of a credit card transaction is
unusual for a particular customer's credit card. An anomaly might indicate fraud.
IMPORTANT: The `field_name` that you supply must be a single string that contains
two comma-separated numbers of the form `latitude,longitude`. The `latitude` and
`longitude` must be in the range -180 to 180 and represent a point on the
surface of the Earth.
For example, JSON data might contain the following transaction coordinates:
[source,js]
--------------------------------------------------
{
"time": 1460464275,
"transactionCoordinates": "40.7,-74.0",
"creditCardNumber": "1234123412341234"
}
--------------------------------------------------
In {es}, location data is likely to be stored in `geo_point` fields. For more
information, see {ref}/geo-point.html[Geo-point datatype]. This data type is not
supported natively in {xpackml} features. You can, however, use Painless scripts
in `script_fields` in your {dfeed} to transform the data into an appropriate
format. For example, the following Painless script transforms
`"coords": {"lat" : 41.44, "lon":90.5}` into `"lat-lon": "41.44,90.5"`:
[source,js]
--------------------------------------------------
{
"script_fields": {
"lat-lon": {
"script": {
"source": "doc['coords'].lat + ',' + doc['coords'].lon",
"lang": "painless"
}
}
}
}
--------------------------------------------------
For more information, see <<ml-configuring-transform>>.

View File

@ -0,0 +1,87 @@
[[ml-info-functions]]
=== Information Content Functions
The information content functions detect anomalies in the amount of information
that is contained in strings within a bucket. These functions can be used as
a more sophisticated method to identify instances of data exfiltration or
command-and-control (C2) activity, in cases where analyzing the size in bytes
of the data might not be sufficient.
The {xpackml} features include the following information content functions:
* `info_content`, `high_info_content`, `low_info_content`
[float]
[[ml-info-content]]
==== Info_content, High_info_content, Low_info_content
The `info_content` function detects anomalies in the amount of information that
is contained in strings in a bucket.
If you want to monitor for unusually high amounts of information,
use `high_info_content`.
If you want to look at drops in information content, use `low_info_content`.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing subdomain strings with the info_content function
[source,js]
--------------------------------------------------
{
"function" : "info_content",
"field_name" : "subdomain",
"over_field_name" : "highest_registered_domain"
}
--------------------------------------------------
If you use this `info_content` function in a detector in your job, it models
information that is present in the `subdomain` string. It detects anomalies
where the information content is unusual compared to the other
`highest_registered_domain` values. An anomaly could indicate an abuse of the
DNS protocol, such as malicious command and control activity.
NOTE: In this example, both high and low values are considered anomalous.
In many use cases, the `high_info_content` function is often a more appropriate
choice.
.Example 2: Analyzing query strings with the high_info_content function
[source,js]
--------------------------------------------------
{
"function" : "high_info_content",
"field_name" : "query",
"over_field_name" : "src_ip"
}
--------------------------------------------------
If you use this `high_info_content` function in a detector in your job, it
models information content that is held in the DNS query string. It detects
`src_ip` values where the information content is unusually high compared to
other `src_ip` values. This example is similar to the example for the
`info_content` function, but it reports anomalies only where the amount of
information content is higher than expected.
.Example 3: Analyzing message strings with the low_info_content function
[source,js]
--------------------------------------------------
{
"function" : "low_info_content",
"field_name" : "message",
"by_field_name" : "logfilename"
}
--------------------------------------------------
If you use this `low_info_content` function in a detector in your job, it models
information content that is present in the message string for each
`logfilename`. It detects anomalies where the information content is low
compared to its past behavior. For example, this function detects unusually low
amounts of information in a collection of rolling log files. Low information
might indicate that a process has entered an infinite loop or that logging
features have been disabled.

View File

@ -0,0 +1,310 @@
[[ml-metric-functions]]
=== Metric Functions
The metric functions include functions such as mean, min and max. These values
are calculated for each bucket. Field values that cannot be converted to
double precision floating point numbers are ignored.
The {xpackml} features include the following metric functions:
* <<ml-metric-min,`min`>>
* <<ml-metric-max,`max`>>
* xref:ml-metric-median[`median`, `high_median`, `low_median`]
* xref:ml-metric-mean[`mean`, `high_mean`, `low_mean`]
* <<ml-metric-metric,`metric`>>
* xref:ml-metric-varp[`varp`, `high_varp`, `low_varp`]
[float]
[[ml-metric-min]]
==== Min
The `min` function detects anomalies in the arithmetic minimum of a value.
The minimum value is calculated for each bucket.
High- and low-sided functions are not applicable.
This function supports the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing minimum transactions with the min function
[source,js]
--------------------------------------------------
{
"function" : "min",
"field_name" : "amt",
"by_field_name" : "product"
}
--------------------------------------------------
If you use this `min` function in a detector in your job, it detects where the
smallest transaction is lower than previously observed. You can use this
function to detect items for sale at unintentionally low prices due to data
entry mistakes. It models the minimum amount for each product over time.
[float]
[[ml-metric-max]]
==== Max
The `max` function detects anomalies in the arithmetic maximum of a value.
The maximum value is calculated for each bucket.
High- and low-sided functions are not applicable.
This function supports the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 2: Analyzing maximum response times with the max function
[source,js]
--------------------------------------------------
{
"function" : "max",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `max` function in a detector in your job, it detects where the
longest `responsetime` is longer than previously observed. You can use this
function to detect applications that have `responsetime` values that are
unusually lengthy. It models the maximum `responsetime` for each application
over time and detects when the longest `responsetime` is unusually long compared
to that application's previous behavior.
.Example 3: Two detectors with max and high_mean functions
[source,js]
--------------------------------------------------
{
"function" : "max",
"field_name" : "responsetime",
"by_field_name" : "application"
},
{
"function" : "high_mean",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
The analysis in the previous example can be performed alongside `high_mean`
functions by application. By combining detectors and using the same influencer,
this job can detect both unusually long individual response times and unusually
high average response times for each bucket.
[float]
[[ml-metric-median]]
==== Median, High_median, Low_median
The `median` function detects anomalies in the statistical median of a value.
The median value is calculated for each bucket.
If you want to monitor unusually high median values, use the `high_median`
function.
If you are just interested in unusually low median values, use the `low_median`
function.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 4: Analyzing response times with the median function
[source,js]
--------------------------------------------------
{
"function" : "median",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `median` function in a detector in your job, it models the
median `responsetime` for each application over time. It detects when the median
`responsetime` is unusual compared to previous `responsetime` values.
[float]
[[ml-metric-mean]]
==== Mean, High_mean, Low_mean
The `mean` function detects anomalies in the arithmetic mean of a value.
The mean value is calculated for each bucket.
If you want to monitor unusually high average values, use the `high_mean`
function.
If you are just interested in unusually low average values, use the `low_mean`
function.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 5: Analyzing response times with the mean function
[source,js]
--------------------------------------------------
{
"function" : "mean",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `mean` function in a detector in your job, it models the mean
`responsetime` for each application over time. It detects when the mean
`responsetime` is unusual compared to previous `responsetime` values.
.Example 6: Analyzing response times with the high_mean function
[source,js]
--------------------------------------------------
{
"function" : "high_mean",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `high_mean` function in a detector in your job, it models the
mean `responsetime` for each application over time. It detects when the mean
`responsetime` is unusually high compared to previous `responsetime` values.
.Example 7: Analyzing response times with the low_mean function
[source,js]
--------------------------------------------------
{
"function" : "low_mean",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `low_mean` function in a detector in your job, it models the
mean `responsetime` for each application over time. It detects when the mean
`responsetime` is unusually low compared to previous `responsetime` values.
[float]
[[ml-metric-metric]]
==== Metric
The `metric` function combines `min`, `max`, and `mean` functions. You can use
it as a shorthand for a combined analysis. If you do not specify a function in
a detector, this is the default function.
//TBD: Is that default behavior still true?
High- and low-sided functions are not applicable. You cannot use this function
when a `summary_count_field_name` is specified.
This function supports the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 8: Analyzing response times with the metric function
[source,js]
--------------------------------------------------
{
"function" : "metric",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `metric` function in a detector in your job, it models the
mean, min, and max `responsetime` for each application over time. It detects
when the mean, min, or max `responsetime` is unusual compared to previous
`responsetime` values.
[float]
[[ml-metric-varp]]
==== Varp, High_varp, Low_varp
The `varp` function detects anomalies in the variance of a value, which is a
measure of the variability and spread in the data.
If you want to monitor unusually high variance, use the `high_varp` function.
If you are just interested in unusually low variance, use the `low_varp` function.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 9: Analyzing response times with the varp function
[source,js]
--------------------------------------------------
{
"function" : "varp",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `varp` function in a detector in your job, it models the
variance in values of `responsetime` for each application over time. It detects
when the variance in `responsetime` is unusual compared to past application
behavior.
.Example 10: Analyzing response times with the high_varp function
[source,js]
--------------------------------------------------
{
"function" : "high_varp",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `high_varp` function in a detector in your job, it models the
variance in values of `responsetime` for each application over time. It detects
when the variance in `responsetime` is unusual compared to past application
behavior.
.Example 11: Analyzing response times with the low_varp function
[source,js]
--------------------------------------------------
{
"function" : "low_varp",
"field_name" : "responsetime",
"by_field_name" : "application"
}
--------------------------------------------------
If you use this `low_varp` function in a detector in your job, it models the
variance in values of `responsetime` for each application over time. It detects
when the variance in `responsetime` is unusual compared to past application
behavior.
@ -0,0 +1,128 @@
[[ml-rare-functions]]
=== Rare Functions
The rare functions detect values that occur rarely in time or rarely for a
population.
The `rare` analysis detects anomalies according to the number of distinct rare
values. This differs from `freq_rare`, which detects anomalies according to the
number of times (frequency) rare values occur.
[NOTE]
====
* The `rare` and `freq_rare` functions should not be used in conjunction with
`exclude_frequent`.
* You cannot create forecasts for jobs that contain `rare` or `freq_rare`
functions.
* Shorter bucket spans (less than 1 hour, for example) are recommended when
looking for rare events. The functions model whether something happens in a
bucket at least once. With longer bucket spans, it is more likely that
entities will be seen in a bucket and therefore they appear less rare.
Picking the ideal bucket span depends on the characteristics of the data,
with shorter bucket spans typically being measured in minutes, not hours.
* To model rare data, a learning period of at least 20 buckets is required
for typical data.
====
The {xpackml} features include the following rare functions:
* <<ml-rare,`rare`>>
* <<ml-freq-rare,`freq_rare`>>
[float]
[[ml-rare]]
==== Rare
The `rare` function detects values that occur rarely in time or rarely for a
population. It detects anomalies according to the number of distinct rare values.
This function supports the following properties:
* `by_field_name` (required)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing status codes with the rare function
[source,js]
--------------------------------------------------
{
"function" : "rare",
"by_field_name" : "status"
}
--------------------------------------------------
If you use this `rare` function in a detector in your job, it detects values
that are rare in time. It models status codes that occur over time and detects
when rare status codes occur compared to the past. For example, you can detect
status codes in a web access log that have never (or rarely) occurred before.
.Example 2: Analyzing status codes in a population with the rare function
[source,js]
--------------------------------------------------
{
"function" : "rare",
"by_field_name" : "status",
"over_field_name" : "clientip"
}
--------------------------------------------------
If you use this `rare` function in a detector in your job, it detects values
that are rare in a population. It models status code and client IP interactions
that occur. It defines a rare status code as one that occurs for few client IP
values compared to the population. It detects client IP values that experience
one or more distinct rare status codes compared to the population. For example
in a web access log, a `clientip` that experiences the highest number of
different rare status codes compared to the population is regarded as highly
anomalous. This analysis is based on the number of different status code values,
not the count of occurrences.
NOTE: To define a status code as rare, the {xpackml} features look at the number
of distinct status codes that occur, not the number of times the status code
occurs. If a single client IP experiences a single unique status code, this
is rare, even if it occurs for that client IP in every bucket.
[float]
[[ml-freq-rare]]
==== Freq_rare
The `freq_rare` function detects values that occur rarely for a population.
It detects anomalies according to the number of times (frequency) that rare
values occur.
This function supports the following properties:
* `by_field_name` (required)
* `over_field_name` (required)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 3: Analyzing URI values in a population with the freq_rare function
[source,js]
--------------------------------------------------
{
"function" : "freq_rare",
"by_field_name" : "uri",
"over_field_name" : "clientip"
}
--------------------------------------------------
If you use this `freq_rare` function in a detector in your job, it
detects values that are frequently rare in a population. It models URI paths and
client IP interactions that occur. It defines a rare URI path as one that is
visited by few client IP values compared to the population. It detects the
client IP values that experience many interactions with rare URI paths compared
to the population. For example in a web access log, a client IP that visits
one or more rare URI paths many times compared to the population is regarded as
highly anomalous. This analysis is based on the count of interactions with rare
URI paths, not the number of different URI path values.
NOTE: To define a URI path as rare, the analytics consider the number of
distinct values that occur and not the number of times the URI path occurs.
If a single client IP visits a single unique URI path, this is rare, even if it
occurs for that client IP in every bucket.

@ -0,0 +1,119 @@
[[ml-sum-functions]]
=== Sum Functions
The sum functions detect anomalies when the sum of a field in a bucket is anomalous.
If you want to monitor unusually high totals, use high-sided functions.
If you want to look at drops in totals, use low-sided functions.
If your data is sparse, use `non_null_sum` functions. Buckets without values are
ignored; buckets with a zero value are analyzed.
The {xpackml} features include the following sum functions:
* xref:ml-sum[`sum`, `high_sum`, `low_sum`]
* xref:ml-nonnull-sum[`non_null_sum`, `high_non_null_sum`, `low_non_null_sum`]
////
TBD: Incorporate from prelert docs?:
Input data may contain pre-calculated fields giving the total count of some value e.g. transactions per minute.
Ensure you are familiar with our advice on Summarization of Input Data, as this is likely to provide
a more appropriate method to using the sum function.
////
[float]
[[ml-sum]]
==== Sum, High_sum, Low_sum
The `sum` function detects anomalies where the sum of a field in a bucket is
anomalous.
If you want to monitor unusually high sum values, use the `high_sum` function.
If you want to monitor unusually low sum values, use the `low_sum` function.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing total expenses with the sum function
[source,js]
--------------------------------------------------
{
"function" : "sum",
"field_name" : "expenses",
"by_field_name" : "costcenter",
"over_field_name" : "employee"
}
--------------------------------------------------
If you use this `sum` function in a detector in your job, it
models total expenses per employee for each cost center. For each time bucket,
it detects when an employee's expenses are unusual for a cost center compared
to other employees.
.Example 2: Analyzing total bytes with the high_sum function
[source,js]
--------------------------------------------------
{
"function" : "high_sum",
"field_name" : "cs_bytes",
"over_field_name" : "cs_host"
}
--------------------------------------------------
If you use this `high_sum` function in a detector in your job, it
models total `cs_bytes`. It detects `cs_hosts` that transfer unusually high
volumes compared to other `cs_hosts`. This example looks for volumes of data
transferred from a client to a server on the internet that are unusual compared
to other clients. This scenario could be useful to detect data exfiltration or
to find users that are abusing internet privileges.
[float]
[[ml-nonnull-sum]]
==== Non_null_sum, High_non_null_sum, Low_non_null_sum
The `non_null_sum` function is useful if your data is sparse. Buckets without
values are ignored and buckets with a zero value are analyzed.
If you want to monitor unusually high totals, use the `high_non_null_sum`
function.
If you want to look at drops in totals, use the `low_non_null_sum` function.
These functions support the following properties:
* `field_name` (required)
* `by_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
NOTE: Population analysis (that is to say, use of the `over_field_name` property)
is not applicable for this function.
.Example 3: Analyzing employee approvals with the high_non_null_sum function
[source,js]
--------------------------------------------------
{
"function" : "high_non_null_sum",
"fieldName" : "amount_approved",
"byFieldName" : "employee"
}
--------------------------------------------------
If you use this `high_non_null_sum` function in a detector in your job, it
models the total `amount_approved` for each employee. It ignores any buckets
where the amount is null. It detects employees who approve unusually high
amounts compared to their past behavior.
//For this credit control system analysis, using non_null_sum will ignore
//periods where the employees are not active on the system.

@ -0,0 +1,99 @@
[[ml-time-functions]]
=== Time Functions
The time functions detect events that happen at unusual times, either of the day
or of the week. These functions can be used to find unusual patterns of behavior,
typically associated with suspicious user activity.
The {xpackml} features include the following time functions:
* <<ml-time-of-day,`time_of_day`>>
* <<ml-time-of-week,`time_of_week`>>
[NOTE]
====
* You cannot create forecasts for jobs that contain time functions.
* The `time_of_day` function is not aware of the difference between days, for instance
work days and weekends. When modeling different days, use the `time_of_week` function.
In general, the `time_of_week` function is more suited to modeling the behavior of people
rather than machines, as people vary their behavior according to the day of the week.
* Shorter bucket spans (for example, 10 minutes) are recommended when performing a
`time_of_day` or `time_of_week` analysis. The time of the events being modeled are not
affected by the bucket span, but a shorter bucket span enables quicker alerting on unusual
events.
* Unusual events are flagged based on the previous pattern of the data, not on what we
might think of as unusual based on human experience. So, if events typically occur
between 3 a.m. and 5 a.m., an event occurring at 3 p.m. is flagged as unusual.
* When Daylight Saving Time starts or stops, regular events can be flagged as anomalous.
This situation occurs because the actual time of the event (as measured against a UTC
baseline) has changed. It is treated as a step change in behavior and the new
times are learned quickly.
====
[float]
[[ml-time-of-day]]
==== Time_of_day
The `time_of_day` function detects when events occur that are outside normal
usage patterns. For example, it detects unusual activity in the middle of the
night.
The function expects daily behavior to be similar. If you expect the behavior of
your data to differ on Saturdays compared to Wednesdays, the `time_of_week`
function is more appropriate.
This function supports the following properties:
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 1: Analyzing events with the time_of_day function
[source,js]
--------------------------------------------------
{
"function" : "time_of_day",
"by_field_name" : "process"
}
--------------------------------------------------
If you use this `time_of_day` function in a detector in your job, it
models when events occur throughout a day for each process. It detects when an
event occurs for a process that is at an unusual time in the day compared to
its past behavior.
[float]
[[ml-time-of-week]]
==== Time_of_week
The `time_of_week` function detects when events occur that are outside normal
usage patterns. For example, it detects login events on the weekend.
This function supports the following properties:
* `by_field_name` (optional)
* `over_field_name` (optional)
* `partition_field_name` (optional)
For more information about those properties, see
{ref}/ml-job-resource.html#ml-detectorconfig[Detector Configuration Objects].
.Example 2: Analyzing events with the time_of_week function
[source,js]
--------------------------------------------------
{
"function" : "time_of_week",
"by_field_name" : "eventcode",
"over_field_name" : "workstation"
}
--------------------------------------------------
If you use this `time_of_week` function in a detector in your job, it
models when events occur throughout the week for each `eventcode`. It detects
when a workstation event occurs at an unusual time during the week for that
`eventcode` compared to other workstations. It detects events for a
particular workstation that are outside the normal usage pattern.

View File

@ -0,0 +1,210 @@
[[ml-gs-data]]
=== Identifying Data for Analysis
For the purposes of this tutorial, we provide sample data that you can play with
and search in {es}. When you consider your own data, however, it's important to
take a moment and think about where the {xpackml} features will be most
impactful.
The first consideration is that it must be time series data. The {ml} features
are designed to model and detect anomalies in time series data.
The second consideration, especially when you are first learning to use {ml},
is the importance of the data and how familiar you are with it. Ideally, it is
information that contains key performance indicators (KPIs) for the health,
security, or success of your business or system. It is information that you need
to monitor and act on when anomalous behavior occurs. You might even have {kib}
dashboards that you're already using to watch this data. The better you know the
data, the quicker you will be able to create {ml} jobs that generate useful
insights.
The final consideration is where the data is located. This tutorial assumes that
your data is stored in {es}. It guides you through the steps required to create
a _{dfeed}_ that passes data to a job. If your own data is outside of {es},
analysis is still possible by using a post data API.
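For example, assuming a job already exists, a sketch of such a request might look
like the following. The job name `my_job` and the document fields are illustrative
only; your data and job configuration will differ:

[source,js]
--------------------------------------------------
POST _xpack/ml/anomaly_detectors/my_job/_data
{"@timestamp":"2017-03-23T13:00:00Z","total":40476}
--------------------------------------------------
// NOTCONSOLE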
IMPORTANT: If you want to create {ml} jobs in {kib}, you must use {dfeeds}.
That is to say, you must store your input data in {es}. When you create
a job, you select an existing index pattern and {kib} configures the {dfeed}
for you under the covers.
[float]
[[ml-gs-sampledata]]
==== Obtaining a Sample Data Set
In this step we will upload some sample data to {es}. This is standard
{es} functionality, and is needed to set the stage for using {ml}.
The sample data for this tutorial contains information about the requests that
are received by various applications and services in a system. A system
administrator might use this type of information to track the total number of
requests across all of the infrastructure. If the number of requests increases
or decreases unexpectedly, for example, this might be an indication that there
is a problem or that resources need to be redistributed. By using the {xpack}
{ml} features to model the behavior of this data, it is easier to identify
anomalies and take appropriate action.
Download this sample data by clicking here:
https://download.elastic.co/demos/machine_learning/gettingstarted/server_metrics.tar.gz[server_metrics.tar.gz]
Use the following commands to extract the files:
[source,sh]
----------------------------------
tar -zxvf server_metrics.tar.gz
----------------------------------
Each document in the server-metrics data set has the following schema:
[source,js]
----------------------------------
{
"index":
{
"_index":"server-metrics",
"_type":"metric",
"_id":"1177"
}
}
{
"@timestamp":"2017-03-23T13:00:00",
"accept":36320,
"deny":4156,
"host":"server_2",
"response":2.4558210155,
"service":"app_3",
"total":40476
}
----------------------------------
// NOTCONSOLE
TIP: The sample data sets include summarized data. For example, the `total`
value is a sum of the requests that were received by a specific service at a
particular time. If your data is stored in {es}, you can generate
this type of sum or average by using aggregations. One of the benefits of
summarizing data this way is that {es} automatically distributes
these calculations across your cluster. You can then feed this summarized data
into {xpackml} instead of raw results, which reduces the volume
of data that must be considered while detecting anomalies. For the purposes of
this tutorial, however, these summary values are stored in {es}. For more
information, see <<ml-configuring-aggregation>>.
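For instance, a search similar to the following sketch could produce per-interval
totals from raw request documents. The index and field names here are assumptions
based on this sample data set:

[source,js]
--------------------------------------------------
GET server-metrics/_search
{
  "size": 0,
  "aggs": {
    "requests_over_time": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "10m"
      },
      "aggs": {
        "total_requests": {
          "sum": { "field": "total" }
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE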
Before you load the data set, you need to set up {ref}/mapping.html[_mappings_]
for the fields. Mappings divide the documents in the index into logical groups
and specify a field's characteristics, such as the field's searchability or
whether or not it's _tokenized_, or broken up into separate words.
The sample data includes an `upload_server-metrics.sh` script, which you can use
to create the mappings and load the data set. You can download it by clicking
here: https://download.elastic.co/demos/machine_learning/gettingstarted/upload_server-metrics.sh[upload_server-metrics.sh]
Before you run it, however, you must edit the USERNAME and PASSWORD variables
with your actual user ID and password.
The script runs a command similar to the following example, which sets up a
mapping for the data set:
[source,sh]
----------------------------------
curl -u elastic:x-pack-test-password -X PUT -H 'Content-Type: application/json' \
http://localhost:9200/server-metrics -d '{
"settings":{
"number_of_shards":1,
"number_of_replicas":0
},
"mappings":{
"metric":{
"properties":{
"@timestamp":{
"type":"date"
},
"accept":{
"type":"long"
},
"deny":{
"type":"long"
},
"host":{
"type":"keyword"
},
"response":{
"type":"float"
},
"service":{
"type":"keyword"
},
"total":{
"type":"long"
}
}
}
}
}'
----------------------------------
// NOTCONSOLE
NOTE: If you run this command, you must replace `x-pack-test-password` with your
actual password.
You can then use the {es} `bulk` API to load the data set. The
`upload_server-metrics.sh` script runs commands similar to the following
example, which loads the four JSON files:
[source,sh]
----------------------------------
curl -u elastic:x-pack-test-password -X POST -H "Content-Type: application/json" \
http://localhost:9200/server-metrics/_bulk --data-binary "@server-metrics_1.json"
curl -u elastic:x-pack-test-password -X POST -H "Content-Type: application/json" \
http://localhost:9200/server-metrics/_bulk --data-binary "@server-metrics_2.json"
curl -u elastic:x-pack-test-password -X POST -H "Content-Type: application/json" \
http://localhost:9200/server-metrics/_bulk --data-binary "@server-metrics_3.json"
curl -u elastic:x-pack-test-password -X POST -H "Content-Type: application/json" \
http://localhost:9200/server-metrics/_bulk --data-binary "@server-metrics_4.json"
----------------------------------
// NOTCONSOLE
TIP: This will upload 200MB of data. The data is split into four files because
there is a maximum 100MB limit when using the `_bulk` API.
These commands might take some time to run, depending on the computing resources
available.
You can verify that the data was loaded successfully with the following command:
[source,sh]
----------------------------------
curl 'http://localhost:9200/_cat/indices?v' -u elastic:x-pack-test-password
----------------------------------
// NOTCONSOLE
You should see output similar to the following:
[source,txt]
----------------------------------
health status index ... pri rep docs.count ...
green open server-metrics ... 1 0 905940 ...
----------------------------------
// NOTCONSOLE
Next, you must define an index pattern for this data set:
. Open {kib} in your web browser and log in. If you are running {kib}
locally, go to `http://localhost:5601/`.
. Click the **Management** tab, then **{kib}** > **Index Patterns**.
. If you already have index patterns, click **Create Index** to define a new
one. Otherwise, the **Create index pattern** wizard is already open.
. For this tutorial, any pattern that matches the name of the index you've
loaded will work. For example, enter `server-metrics*` as the index pattern.
. In the **Configure settings** step, select the `@timestamp` field in the
**Time Filter field name** list.
. Click **Create index pattern**.
This data set can now be analyzed in {ml} jobs in {kib}.

@ -0,0 +1,76 @@
[[ml-gs-forecast]]
=== Creating Forecasts
In addition to detecting anomalous behavior in your data, you can use
{ml} to predict future behavior. For more information, see <<ml-forecasting>>.
To create a forecast in {kib}:
. Go to the **Single Metric Viewer** and select one of the jobs that you created
in this tutorial. For example, select the `total-requests` job.
. Click **Forecast**. +
+
--
[role="screenshot"]
image::images/ml-gs-forecast.jpg["Create a forecast from the Single Metric Viewer"]
--
. Specify a duration for your forecast. This value indicates how far to
extrapolate beyond the last record that was processed. You must use time units,
such as `30d` for 30 days. For more information, see
{ref}/common-options.html#time-units[Time Units]. In this example, we use a
duration of 1 week: +
+
--
[role="screenshot"]
image::images/ml-gs-duration.jpg["Specify a duration of 1w"]
--
. View the forecast in the **Single Metric Viewer**: +
+
--
[role="screenshot"]
image::images/ml-gs-forecast-results.jpg["View a forecast from the Single Metric Viewer"]
The yellow line in the chart represents the predicted data values. The shaded
yellow area represents the bounds for the predicted values, which also gives an
indication of the confidence of the predictions. Note that the bounds generally
increase with time (that is to say, the confidence levels decrease), since you
are forecasting further into the future. Eventually if the confidence levels are
too low, the forecast stops.
--
. Optional: Compare the forecast to actual data. +
+
--
You can try this with the sample data by choosing a subset of the data when you
create the job, as described in <<ml-gs-jobs>>. Create the forecast then process
the remaining data, as described in <<ml-gs-job1-datafeed>>.
--
.. After you restart the {dfeed}, re-open the forecast by selecting the job in
the **Single Metric Viewer**, clicking **Forecast**, and selecting your forecast
from the list. For example: +
+
--
[role="screenshot"]
image::images/ml-gs-forecast-open.jpg["Open a forecast in the Single Metric Viewer"]
--
.. View the forecast and actual data in the **Single Metric Viewer**: +
+
--
[role="screenshot"]
image::images/ml-gs-forecast-actual.jpg["View a forecast over actual data in the Single Metric Viewer"]
The chart contains the actual data values, the bounds for the expected values,
the anomalies, the forecast data values, and the bounds for the forecast. This
combination of actual and forecast data gives you an indication of how well the
{xpack} {ml} features can extrapolate the future behavior of the data.
--
Now that you have seen how easy it is to create forecasts with the sample data,
consider what type of events you might want to predict in your own data. For
more information and ideas, as well as a list of limitations related to
forecasts, see <<ml-forecasting>>.
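If you prefer to work with the APIs, you can request the same type of forecast
directly from the forecast API. The following is a minimal sketch for the
`total-requests` job used in this tutorial; the one-week duration matches the
example above:

[source,js]
--------------------------------------------------
POST _xpack/ml/anomaly_detectors/total-requests/_forecast
{
  "duration": "1w"
}
--------------------------------------------------
// NOTCONSOLE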

@ -0,0 +1,211 @@
[[ml-gs-multi-jobs]]
=== Creating Multi-metric Jobs
The multi-metric job wizard in {kib} provides a simple way to create more
complex jobs with multiple detectors. For example, in the single metric job, you
were tracking total requests versus time. You might also want to track other
metrics like average response time or the maximum number of denied requests.
Instead of creating jobs for each of those metrics, you can combine them in a
multi-metric job.
You can also use multi-metric jobs to split a single time series into multiple
time series based on a categorical field. For example, you can split the data
based on its hostnames, locations, or users. Each time series is modeled
independently. By looking at temporal patterns on a per entity basis, you might
spot things that would otherwise have been hidden in the lumped view.
Conceptually, you can think of this as running many independent single metric
jobs. By bundling them together in a multi-metric job, however, you can see an
overall score and shared influencers for all the metrics and all the entities in
the job. Multi-metric jobs therefore scale better than having many independent
single metric jobs and provide better results when you have influencers that are
shared across the detectors.
The sample data for this tutorial contains information about the requests that
are received by various applications and services in a system. Let's assume that
you want to monitor the requests received and the response time. In particular,
you might want to track those metrics on a per service basis to see if any
services have unusual patterns.
To create a multi-metric job in {kib}:
. Open {kib} in your web browser and log in. If you are running {kib} locally,
go to `http://localhost:5601/`.
. Click **Machine Learning** in the side navigation, then click **Create new job**.
. Select the index pattern that you created for the sample data. For example,
`server-metrics*`.
. In the **Use a wizard** section, click **Multi metric**.
. Configure the job by providing the following job settings: +
+
--
[role="screenshot"]
image::images/ml-gs-multi-job.jpg["Create a new job from the server-metrics index"]
--
.. For the **Fields**, select `high mean(response)` and `sum(total)`. This
creates two detectors and specifies the analysis function and field that each
detector uses. The first detector uses the high mean function to detect
unusually high average values for the `response` field in each bucket. The
second detector uses the sum function to detect when the sum of the `total`
field is anomalous in each bucket. For more information about any of the
analytical functions, see <<ml-functions>>.
.. For the **Bucket span**, enter `10m`. This value specifies the size of the
interval that the analysis is aggregated into. As was the case in the single
metric example, this value has a significant impact on the analysis. When you're
creating jobs for your own data, you might need to experiment with different
bucket spans depending on the frequency of the input data, the duration of
typical anomalies, and the frequency at which alerting is required.
.. For the **Split Data**, select `service`. When you specify this
option, the analysis is segmented such that you have completely independent
baselines for each distinct value of this field.
//TBD: What is the importance of having separate baselines?
There are seven unique service keyword values in the sample data. Thus for each
of the seven services, you will see the high mean response metrics and sum
total metrics. +
+
--
NOTE: If you are creating a job by using the {ml} APIs or the advanced job
wizard in {kib}, you can accomplish this split by using the
`partition_field_name` property.
--
.. For the **Key Fields (Influencers)**, select `host`. Note that the `service` field
is also automatically selected because you used it to split the data. These key
fields are also known as _influencers_.
When you identify a field as an influencer, you are indicating that you think
it contains information about someone or something that influences or
contributes to anomalies.
+
--
[TIP]
========================
Picking an influencer is strongly recommended for the following reasons:
* It allows you to more easily assign blame for the anomaly
* It simplifies and aggregates the results
The best influencer is the person or thing that you want to blame for the
anomaly. In many cases, users or client IP addresses make excellent influencers.
Influencers can be any field in your data; they do not need to be fields that
are specified in your detectors, though they often are.
As a best practice, do not pick too many influencers. For example, you generally
do not need more than three. If you pick many influencers, the results can be
overwhelming and there is a small overhead to the analysis.
========================
//TBD: Is this something you can determine later from looking at results and
//update your job with if necessary? Is it all post-processing or does it affect
//the ongoing modeling?
--
. Click **Use full server-metrics* data**. Two graphs are generated for each
`service` value, which represent the high mean `response` values and
sum `total` values over time. For example:
+
--
[role="screenshot"]
image::images/ml-gs-job2-split.jpg["Kibana charts for data split by service"]
--
. Provide a name for the job, for example `response_requests_by_app`. The job
name must be unique in your cluster. You can also optionally provide a
description of the job.
. Click **Create Job**.
When the job is created, you can choose to view the results, continue the job in
real-time, and create a watch. In this tutorial, we will proceed to view the
results.
TIP: The `create_multi_metric.sh` script creates a similar job and {dfeed} by
using the {ml} APIs. You can download that script by clicking
here: https://download.elastic.co/demos/machine_learning/gettingstarted/create_multi_metric.sh[create_multi_metric.sh]
For API reference information, see {ref}/ml-apis.html[Machine Learning APIs].
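For reference, the analysis configuration for a job like this is conceptually
similar to the following sketch when created through the APIs. This is an
approximation for illustration, not an exact reproduction of what the wizard
generates; the job name and settings shown here are assumptions based on the
choices made in this tutorial:

[source,js]
--------------------------------------------------
PUT _xpack/ml/anomaly_detectors/response_requests_by_app
{
  "analysis_config": {
    "bucket_span": "10m",
    "detectors": [
      { "function": "high_mean", "field_name": "response", "partition_field_name": "service" },
      { "function": "sum", "field_name": "total", "partition_field_name": "service" }
    ],
    "influencers": [ "service", "host" ]
  },
  "data_description": {
    "time_field": "@timestamp"
  }
}
--------------------------------------------------
// NOTCONSOLE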
[[ml-gs-job2-analyze]]
=== Exploring Multi-metric Job Results
The {xpackml} features analyze the input stream of data, model its behavior, and
perform analysis based on the two detectors you defined in your job. When an
event occurs outside of the model, that event is identified as an anomaly.
You can use the **Anomaly Explorer** in {kib} to view the analysis results:
[role="screenshot"]
image::images/ml-gs-job2-explorer.jpg["Job results in the Anomaly Explorer"]
You can explore the overall anomaly time line, which shows the maximum anomaly
score for each section in the specified time period. You can change the time
period by using the time picker in the {kib} toolbar. Note that the sections in
this time line do not necessarily correspond to the bucket span. If you change
the time period, the sections change size too. The smallest possible size for
these sections is a bucket. If you specify a large time period, the sections can
span many buckets.
On the left is a list of the top influencers for all of the detected anomalies
in that same time period. The list includes maximum anomaly scores, which in
this case are aggregated for each influencer, for each bucket, across all
detectors. There is also a total sum of the anomaly scores for each influencer.
You can use this list to help you narrow down the contributing factors and focus
on the most anomalous entities.
If your job contains influencers, you can also explore swim lanes that
correspond to the values of an influencer. In this example, the swim lanes
correspond to the values for the `service` field that you used to split the data.
Each lane represents a unique application or service name. Since you specified
the `host` field as an influencer, you can also optionally view the results in
swim lanes for each host name:
[role="screenshot"]
image::images/ml-gs-job2-explorer-host.jpg["Job results sorted by host"]
By default, the swim lanes are ordered by their maximum anomaly score values.
You can click on the sections in the swim lane to see details about the
anomalies that occurred in that time interval.
NOTE: The anomaly scores that you see in each section of the **Anomaly Explorer**
might differ slightly. This disparity occurs because for each job we generate
bucket results, influencer results, and record results. Anomaly scores are
generated for each type of result. The anomaly timeline uses the bucket-level
anomaly scores. The list of top influencers uses the influencer-level anomaly
scores. The list of anomalies uses the record-level anomaly scores. For more
information about these different result types, see
{ref}/ml-results-resource.html[Results Resources].
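If you want to examine the record-level results outside {kib}, you can retrieve
them with the get records API. The following is a minimal sketch; the job name
matches the example in this tutorial and the `record_score` threshold of 75 is an
arbitrary value:

[source,js]
--------------------------------------------------
GET _xpack/ml/anomaly_detectors/response_requests_by_app/results/records
{
  "sort": "record_score",
  "desc": true,
  "record_score": 75
}
--------------------------------------------------
// NOTCONSOLE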
Click on a section in the swim lanes to obtain more information about the
anomalies in that time period. For example, click on the red section in the swim
lane for `server_2`:
[role="screenshot"]
image::images/ml-gs-job2-explorer-anomaly.jpg["Job results for an anomaly"]
You can see exact times when anomalies occurred and which detectors or metrics
caught the anomaly. Also note that because you split the data by the `service`
field, you see separate charts for each applicable service. In particular, you
see charts for each service for which there is data on the specified host in the
specified time interval.
Below the charts, there is a table that provides more information, such as the
typical and actual values and the influencers that contributed to the anomaly.
[role="screenshot"]
image::images/ml-gs-job2-explorer-table.jpg["Job results table"]
Notice that there are anomalies for both detectors, that is to say for both the
`high_mean(response)` and the `sum(total)` metrics in this time interval. The
table aggregates the anomalies to show the highest severity anomaly per detector
and entity, which is the by, over, or partition field value that is displayed
in the **found for** column. To view all the anomalies without any aggregation,
set the **Interval** to `Show all`.
By investigating multiple metrics in a single job, you might see relationships
between events in your data that would otherwise be overlooked.

@ -0,0 +1,55 @@
[[ml-gs-next]]
=== Next Steps
By completing this tutorial, you've learned how you can detect anomalous
behavior in a simple set of sample data. You created single and multi-metric
jobs in {kib}, which creates and opens jobs and creates and starts {dfeeds} for
you under the covers. You examined the results of the {ml} analysis in the
**Single Metric Viewer** and **Anomaly Explorer** in {kib}. You also
extrapolated the future behavior of a job by creating a forecast.
If you want to learn about advanced job options, you might be interested in
the following video tutorial:
https://www.elastic.co/videos/machine-learning-lab-3-detect-outliers-in-a-population[Machine Learning Lab 3 - Detect Outliers in a Population].
If you intend to use {ml} APIs in your applications, a good next step might be
to learn about the APIs by retrieving information about these sample jobs.
For example, the following APIs retrieve information about the jobs and {dfeeds}.
[source,js]
--------------------------------------------------
GET _xpack/ml/anomaly_detectors
GET _xpack/ml/datafeeds
--------------------------------------------------
// CONSOLE
For more information about the {ml} APIs, see <<ml-api-quickref>>.
Ultimately, the next step is to start applying {ml} to your own data.
As mentioned in <<ml-gs-data>>, there are three things to consider when you're
thinking about where {ml} will be most impactful:
. It must be time series data.
. It should be information that contains key performance indicators for the
health, security, or success of your business or system. The better you know the
data, the quicker you will be able to create jobs that generate useful
insights.
. Ideally, the data is located in {es} and you can therefore create a {dfeed}
that retrieves data in real time. If your data is outside of {es}, you
cannot use {kib} to create your jobs and you cannot use {dfeeds}. Machine
learning analysis is still possible, however, by using APIs to create and manage
jobs and to post data to them.
Once you have decided which data to analyze, you can start considering which
analysis functions you want to use. For more information, see <<ml-functions>>.
In general, it is a good idea to start with single metric jobs for your
key performance indicators. After you examine these simple analysis results,
you will have a better idea of what the influencers might be. You can create
multi-metric jobs and split the data or create more complex analysis functions
as necessary. For examples of more complicated configuration options, see
<<ml-configuring>>.
If you encounter problems, we're here to help. See <<xpack-help>> and
<<ml-troubleshooting>>.

@ -0,0 +1,331 @@
[[ml-gs-jobs]]
=== Creating Single Metric Jobs
At this point in the tutorial, the goal is to detect anomalies in the
total requests received by your applications and services. The sample data
contains a single key performance indicator (KPI) to track this, which is the total
requests over time. It is therefore logical to start by creating a single metric
job for this KPI.
TIP: If you are using aggregated data, you can create an advanced job
and configure it to use a `summary_count_field_name`. The {ml} algorithms will
make the best possible use of summarized data in this case. For simplicity, in
this tutorial we will not make use of that advanced functionality. For more
information, see <<ml-configuring-aggregation>>.
A single metric job contains a single _detector_. A detector defines the type of
analysis that will occur (for example, `max`, `average`, or `rare` analytical
functions) and the fields that will be analyzed.
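For example, the detector that this tutorial builds through the wizard is
conceptually equivalent to the following configuration fragment, shown here only
to illustrate the structure:

[source,js]
--------------------------------------------------
{
  "function" : "sum",
  "field_name" : "total"
}
--------------------------------------------------
// NOTCONSOLE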
To create a single metric job in {kib}:
. Open {kib} in your web browser and log in. If you are running {kib} locally,
go to `http://localhost:5601/`.
. Click **Machine Learning** in the side navigation.
. Click **Create new job**.
. Select the index pattern that you created for the sample data. For example,
`server-metrics*`.
. In the **Use a wizard** section, click **Single metric**.
. Configure the job by providing the following information: +
+
--
[role="screenshot"]
image::images/ml-gs-single-job.jpg["Create a new job from the server-metrics index"]
--
.. For the **Aggregation**, select `Sum`. This value specifies the analysis
function that is used.
+
--
Some of the analytical functions look for single anomalous data points. For
example, `max` identifies the maximum value that is seen within a bucket.
Others perform some aggregation over the length of the bucket. For example,
`mean` calculates the mean of all the data points seen within the bucket.
Similarly, `count` calculates the total number of data points within the bucket.
In this tutorial, you are using the `sum` function, which calculates the sum of
the specified field's values within the bucket. For descriptions of all the
functions, see <<ml-functions>>.
--
.. For the **Field**, select `total`. This value specifies the field that
the detector uses in the function.
+
--
NOTE: Some functions such as `count` and `rare` do not require fields.
--
.. For the **Bucket span**, enter `10m`. This value specifies the size of the
interval that the analysis is aggregated into.
+
--
The {xpackml} features use the concept of a bucket to divide up the time series
into batches for processing. For example, if you are monitoring
the total number of requests in the system,
using a bucket span of 1 hour would mean that at the end of each hour, it
calculates the sum of the requests for the last hour and computes the
anomalousness of that value compared to previous hours.
The bucket span has two purposes: it dictates over what time span to look for
anomalous features in data, and also determines how quickly anomalies can be
detected. Choosing a shorter bucket span enables anomalies to be detected more
quickly. However, there is a risk of being too sensitive to natural variations
or noise in the input data. Choosing too long a bucket span can mean that
interesting anomalies are averaged away. There is also the possibility that the
aggregation might smooth out some anomalies based on when the bucket starts
in time.
The bucket span has a significant impact on the analysis. When you're trying to
determine what value to use, take into account the granularity at which you
want to perform the analysis, the frequency of the input data, the duration of
typical anomalies, and the frequency at which alerting is required.
--
. Determine whether you want to process all of the data or only part of it. If
you want to analyze all of the existing data, click
**Use full server-metrics* data**. If you want to see what happens when you
stop and start {dfeeds} and process additional data over time, click the time
picker in the {kib} toolbar. Since the sample data spans a period of time
between March 23, 2017 and April 22, 2017, click **Absolute**. Set the start
time to March 23, 2017 and the end time to April 1, 2017, for example. Once
you've got the time range set up, click the **Go** button. +
+
--
[role="screenshot"]
image::images/ml-gs-job1-time.jpg["Setting the time range for the {dfeed}"]
--
+
--
A graph is generated, which represents the total number of requests over time.
Note that the **Estimate bucket span** option is no longer greyed out in the
**Bucket span** field. This is an experimental feature that you can use to help
determine an appropriate bucket span for your data. For the purposes of this
tutorial, we will leave the bucket span at 10 minutes.
--
. Provide a name for the job, for example `total-requests`. The job name must
be unique in your cluster. You can also optionally provide a description of the
job and create a job group.
. Click **Create Job**. +
+
--
[role="screenshot"]
image::images/ml-gs-job1.jpg["A graph of the total number of requests over time"]
--
As the job is created, the graph is updated to give a visual representation of
the progress of {ml} as the data is processed. This view is only available whilst the
job is running.
When the job is created, you can choose to view the results, continue the job
in real-time, and create a watch. In this tutorial, we will look at how to
manage jobs and {dfeeds} before we view the results.
TIP: The `create_single_metric.sh` script creates a similar job and {dfeed} by
using the {ml} APIs. You can download that script by clicking
here: https://download.elastic.co/demos/machine_learning/gettingstarted/create_single_metric.sh[create_single_metric.sh]
For API reference information, see {ref}/ml-apis.html[Machine Learning APIs].
[[ml-gs-job1-manage]]
=== Managing Jobs
After you create a job, you can see its status in the **Job Management** tab: +
[role="screenshot"]
image::images/ml-gs-job1-manage1.jpg["Status information for the total-requests job"]
The following information is provided for each job:
Job ID::
The unique identifier for the job.
Description::
The optional description of the job.
Processed records::
The number of records that have been processed by the job.
Memory status::
The status of the mathematical models. When you create jobs by using the APIs or
by using the advanced options in {kib}, you can specify a `model_memory_limit`.
That value is the maximum amount of memory resources that the mathematical
models can use. Once that limit is approached, data pruning becomes more
aggressive. Upon exceeding that limit, new entities are not modeled. For more
information about this setting, see
{ref}/ml-job-resource.html#ml-apilimits[Analysis Limits]. The memory status
field reflects whether you have reached or exceeded the model memory limit. It
can have one of the following values: +
`ok`::: The models stayed below the configured value.
`soft_limit`::: The models used more than 60% of the configured memory limit
and older unused models will be pruned to free up space.
`hard_limit`::: The models used more space than the configured memory limit.
As a result, not all incoming data was processed.
Job state::
The status of the job, which can be one of the following values: +
`opened`::: The job is available to receive and process data.
`closed`::: The job finished successfully with its model state persisted.
The job must be opened before it can accept further data.
`closing`::: The job close action is in progress and has not yet completed.
A closing job cannot accept further data.
`failed`::: The job did not finish successfully due to an error.
This situation can occur due to invalid input data.
If the job had irrevocably failed, it must be force closed and then deleted.
If the {dfeed} can be corrected, the job can be closed and then re-opened.
{dfeed-cap} state::
The status of the {dfeed}, which can be one of the following values: +
started::: The {dfeed} is actively receiving data.
stopped::: The {dfeed} is stopped and will not receive data until it is
re-started.
Latest timestamp::
The timestamp of the last processed record.
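The same status information is also available through the get job statistics API.
A minimal sketch, assuming the `total-requests` job from this tutorial:

[source,js]
--------------------------------------------------
GET _xpack/ml/anomaly_detectors/total-requests/_stats
--------------------------------------------------
// NOTCONSOLE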
If you click the arrow beside the name of a job, you can show or hide additional
information, such as the settings, configuration information, or messages for
the job.
You can also click one of the **Actions** buttons to start the {dfeed}, edit
the job or {dfeed}, and clone or delete the job, for example.
[float]
[[ml-gs-job1-datafeed]]
==== Managing {dfeeds-cap}
A {dfeed} can be started and stopped multiple times throughout its lifecycle.
If you want to retrieve more data from {es} and the {dfeed} is stopped, you must
restart it.
For example, if you did not use the full data when you created the job, you can
now process the remaining data by restarting the {dfeed}:
. In the **Machine Learning** / **Job Management** tab, click the following
button to start the {dfeed}: image:images/ml-start-feed.jpg["Start {dfeed}"]
. Choose a start time and end time. For example,
click **Continue from 2017-04-01 23:59:00** and select **2017-04-30** as the
search end time. Then click **Start**. The date picker defaults to the latest
timestamp of processed data. Be careful not to leave any gaps in the analysis,
otherwise you might miss anomalies. +
+
--
[role="screenshot"]
image::images/ml-gs-job1-datafeed.jpg["Restarting a {dfeed}"]
--
The {dfeed} state changes to `started`, the job state changes to `opened`,
and the number of processed records increases as the new data is analyzed. The
latest timestamp information also increases.
TIP: If your data is being loaded continuously, you can continue running the job
in real time. For this, start your {dfeed} and select **No end time**.
If you want to stop the {dfeed} at this point, you can click the following
button: image:images/ml-stop-feed.jpg["Stop {dfeed}"]
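The same start and stop operations are available through the {dfeed} APIs. A
minimal sketch, assuming the {dfeed} for the `total-requests` job is named
`datafeed-total-requests`; the start and end times are arbitrary examples:

[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-total-requests/_start
{
  "start": "2017-04-01T23:59:00Z",
  "end": "2017-04-30T00:00:00Z"
}

POST _xpack/ml/datafeeds/datafeed-total-requests/_stop
--------------------------------------------------
// NOTCONSOLE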
Now that you have processed all the data, let's start exploring the job results.
[[ml-gs-job1-analyze]]
=== Exploring Single Metric Job Results
The {xpackml} features analyze the input stream of data, model its behavior,
and perform analysis based on the detectors you defined in your job. When an
event occurs outside of the model, that event is identified as an anomaly.
Result records for each anomaly are stored in `.ml-anomalies-*` indices in {es}.
By default, the name of the index where {ml} results are stored is labelled
`shared`, which corresponds to the `.ml-anomalies-shared` index.
You can use the **Anomaly Explorer** or the **Single Metric Viewer** in {kib} to
view the analysis results.
Anomaly Explorer::
This view contains swim lanes showing the maximum anomaly score over time.
There is an overall swim lane that shows the overall score for the job, and
also swim lanes for each influencer. By selecting a block in a swim lane, the
anomaly details are displayed alongside the original source data (where
applicable).
Single Metric Viewer::
This view contains a chart that represents the actual and expected values over
time. It is only available for jobs that analyze a single time series and that
have `model_plot_config` enabled (see the example after this list). As in the
**Anomaly Explorer**, anomalous data points are shown in different colors
depending on their score.
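Model plot adds some storage and processing overhead, so it is not always
enabled. If you created a job without it, the following sketch shows one way to
turn it on afterwards with the update job API, assuming the `total-requests`
job from this tutorial:

[source,js]
----
POST _xpack/ml/anomaly_detectors/total-requests/_update
{
  "model_plot_config": {
    "enabled": true
  }
}
----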
By default when you view the results for a single metric job, the
**Single Metric Viewer** opens:
[role="screenshot"]
image::images/ml-gs-job1-analysis.jpg["Single Metric Viewer for total-requests job"]
The blue line in the chart represents the actual data values. The shaded blue
area represents the bounds for the expected values. The area between the upper
and lower bounds contains the most likely values for the model. If a value falls
outside this area, it can be considered anomalous.
If you slide the time selector from the beginning of the data to the end of the
data, you can see how the model improves as it processes more data. At the
beginning, the expected range of values is pretty broad and the model is not
capturing the periodicity in the data. But it quickly learns and begins to
reflect the daily variation.
Any data points outside the range that was predicted by the model are marked
as anomalies. When you have high volumes of real-life data, many anomalies
might be found. These vary in probability from very likely to highly unlikely,
that is, from not particularly anomalous to highly anomalous. There can be none,
one or two, tens, or sometimes hundreds of anomalies found within each bucket,
and many thousands found per job. To provide a sensible view of the results, an
_anomaly score_ is calculated for each bucket time interval. The anomaly score
is a value from 0 to 100, which indicates the significance of the observed
anomaly compared to previously seen anomalies. Highly anomalous values are shown
in red and low-scored values are shown in blue. An interval with a high anomaly
score is significant and requires investigation.
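You can also retrieve these scores with the get buckets API. The following
sketch returns only the buckets for the `total-requests` job that have an
anomaly score of 75 or higher, most anomalous first (the job name is assumed
from earlier in this tutorial):

[source,js]
----
GET _xpack/ml/anomaly_detectors/total-requests/results/buckets
{
  "anomaly_score": 75,
  "sort": "anomaly_score",
  "desc": true
}
----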
Slide the time selector to a section of the time series that contains a red
anomaly data point. If you hover over the point, you can see more information
about that data point. You can also see details in the **Anomalies** section
of the viewer. For example:
[role="screenshot"]
image::images/ml-gs-job1-anomalies.jpg["Single Metric Viewer Anomalies for total-requests job"]
For each anomaly you can see key details such as the time, the actual and
expected ("typical") values, and their probability.
By default, the table contains all anomalies that have a severity of "warning"
or higher in the selected section of the timeline. If you are only interested in
critical anomalies, for example, you can change the severity threshold for this
table.
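The same details are available from the get records API. This sketch retrieves
the records for the `total-requests` job with a severity of critical (a record
score of 75 or higher), most anomalous first; each record includes fields such
as `timestamp`, `actual`, `typical`, and `probability`:

[source,js]
----
GET _xpack/ml/anomaly_detectors/total-requests/results/records
{
  "record_score": 75,
  "sort": "record_score",
  "desc": true
}
----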
The anomalies table also automatically calculates an interval for the data in
the table. If the time difference between the earliest and latest records in the
table is less than two days, the data is aggregated by hour to show the details
of the highest severity anomaly for each detector. Otherwise, it is
aggregated by day. You can change the interval for the table, for example, to
show all anomalies.
You can see the same information in a different format by using the
**Anomaly Explorer**:
[role="screenshot"]
image::images/ml-gs-job1-explorer.jpg["Anomaly Explorer for total-requests job"]
Click one of the red sections in the swim lane to see details about the anomalies
that occurred in that time interval. For example:
[role="screenshot"]
image::images/ml-gs-job1-explorer-anomaly.jpg["Anomaly Explorer details for total-requests job"]
After you have identified anomalies, often the next step is to try to determine
the context of those situations. For example, are there other factors that are
contributing to the problem? Are the anomalies confined to particular
applications or servers? You can begin to troubleshoot these situations by
layering additional jobs or creating multi-metric jobs.

View File

@ -0,0 +1,99 @@
[[ml-gs-wizards]]
=== Creating Jobs in {kib}
++++
<titleabbrev>Creating Jobs</titleabbrev>
++++
Machine learning jobs contain the configuration information and metadata
necessary to perform an analytical task. They also contain the results of the
analytical task.
[NOTE]
--
This tutorial uses {kib} to create jobs and view results, but you can
alternatively use the APIs to accomplish most tasks; a minimal example follows
this note. For API reference information, see {ref}/ml-apis.html[Machine Learning APIs].
The {xpackml} features in {kib} use pop-ups. You must configure your
web browser so that it does not block pop-up windows, or create an
exception for your {kib} URL.
--
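For example, the single metric job that this tutorial creates with the wizard
could alternatively be created with the put job API. This is a minimal sketch,
assuming a job named `total-requests` that sums the `total` field in the sample
data and that the sample documents use `@timestamp` as their time field:

[source,js]
----
PUT _xpack/ml/anomaly_detectors/total-requests
{
  "description": "Sum of total requests in the sample server metrics data",
  "analysis_config": {
    "bucket_span": "10m",
    "detectors": [
      {
        "detector_description": "Sum of total",
        "function": "sum",
        "field_name": "total"
      }
    ]
  },
  "data_description": {
    "time_field": "@timestamp"
  }
}
----

You would then create a {dfeed} for the job with the put {dfeed} API, point it
at the `server-metrics*` indices, and start it to feed data to the job.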
{kib} provides wizards that help you create typical {ml} jobs. For example, you
can use wizards to create single metric, multi-metric, population, and advanced
jobs.
To see the job creation wizards:
. Open {kib} in your web browser and log in. If you are running {kib} locally,
go to `http://localhost:5601/`.
. Click **Machine Learning** in the side navigation.
. Click **Create new job**.
. Click the `server-metrics*` index pattern.
You can then choose from a list of job wizards. For example:
[role="screenshot"]
image::images/ml-create-job.jpg["Job creation wizards in {kib}"]
If you are not certain which wizard to use, there is also a **Data Visualizer**
that can help you explore the fields in your data.
To learn more about the sample data:
. Click **Data Visualizer**. +
+
--
[role="screenshot"]
image::images/ml-data-visualizer.jpg["Data Visualizer in {kib}"]
--
. Select a time period that you're interested in exploring by using the time
picker in the {kib} toolbar. Alternatively, click
**Use full server-metrics* data** to view data over the full time range. In this
sample data, the documents relate to March and April 2017.
. Optional: Change the number of documents per shard that are used in the
visualizations. There is a relatively small number of documents in the sample
data, so you can choose a value of `all`. For larger data sets, keep in mind
that using a large sample size increases query run times and increases the load
on the cluster.
[role="screenshot"]
image::images/ml-data-metrics.jpg["Data Visualizer output for metrics in {kib}"]
The fields in the indices are listed in two sections. The first section contains
the numeric ("metric") fields. The second section contains non-metric fields
(such as `keyword`, `text`, `date`, `boolean`, `ip`, and `geo_point` data types).
For metric fields, the **Data Visualizer** indicates how many documents contain
the field in the selected time period. It also provides information about the
minimum, median, and maximum values, the number of distinct values, and their
distribution. You can use the distribution chart to get a better idea of how
the values in the data are clustered. Alternatively, you can view the top values
for metric fields. For example:
[role="screenshot"]
image::images/ml-data-topmetrics.jpg["Data Visualizer output for top values in {kib}"]
For date fields, the **Data Visualizer** provides the earliest and latest field
values and the number and percentage of documents that contain the field
during the selected time period. For example:
[role="screenshot"]
image::images/ml-data-dates.jpg["Data Visualizer output for date fields in {kib}"]
For keyword fields, the **Data Visualizer** provides the number of distinct
values, a list of the top values, and the number and percentage of documents
that contain the field during the selected time period. For example:
[role="screenshot"]
image::images/ml-data-keywords.jpg["Data Visualizer output for keyword fields in {kib}"]
In this tutorial, you will create single and multi-metric jobs that use the
`total`, `response`, `service`, and `host` fields. Although there is an option
to create an advanced job directly from the **Data Visualizer**, we will use
the single and multi-metric job creation wizards instead.

View File

@ -0,0 +1,80 @@
[[ml-getting-started]]
== Getting Started with Machine Learning
++++
<titleabbrev>Getting Started</titleabbrev>
++++
Ready to get some hands-on experience with the {xpackml} features? This
tutorial shows you how to:
* Load a sample data set into {es}
* Create single and multi-metric {ml} jobs in {kib}
* Use the results to identify possible anomalies in the data
At the end of this tutorial, you should have a good idea of what {ml} is and
will hopefully be inspired to use it to detect anomalies in your own data.
You might also be interested in these video tutorials, which use the same sample
data:
* https://www.elastic.co/videos/machine-learning-tutorial-creating-a-single-metric-job[Machine Learning for the Elastic Stack: Creating a single metric job]
* https://www.elastic.co/videos/machine-learning-tutorial-creating-a-multi-metric-job[Machine Learning for the Elastic Stack: Creating a multi-metric job]
[float]
[[ml-gs-sysoverview]]
=== System Overview
To follow the steps in this tutorial, you will need the following
components of the Elastic Stack:
* {es} {version}, which stores the data and the analysis results
* {kib} {version}, which provides a helpful user interface for creating and
viewing jobs
See the https://www.elastic.co/support/matrix[Elastic Support Matrix] for
information about supported operating systems.
See {stack-ref}/installing-elastic-stack.html[Installing the Elastic Stack] for
information about installing each of the components.
NOTE: To get started, you can install {es} and {kib} on a
single VM or even on your laptop (requires 64-bit OS).
As you add more data and your traffic grows,
you'll want to replace the single {es} instance with a cluster.
By default, when you install {es} and {kib}, {xpack} is installed and the
{ml} features are enabled. You cannot use {ml} with the free basic license, but
you can try all of the {xpack} features with a <<license-management,trial license>>.
If you have multiple nodes in your cluster, you can optionally dedicate nodes to
specific purposes. If you want to control which nodes are
_machine learning nodes_ or limit which nodes run resource-intensive
activity related to jobs, see <<xpack-settings>>.
[float]
[[ml-gs-users]]
==== Users, Roles, and Privileges
The {xpackml} features implement cluster privileges and built-in roles to
make it easier to control which users have authority to view and manage the jobs,
{dfeeds}, and results.
By default, you can perform all of the steps in this tutorial by using the
built-in `elastic` superuser. However, you must set the password for this user
before it can do anything. For information about how to set that password, see
<<security-getting-started>>.
If you are performing these steps in a production environment, take extra care
because `elastic` has the `superuser` role and you could inadvertently make
significant changes to the system. You can alternatively assign the
`machine_learning_admin` and `kibana_user` roles to a user ID of your choice.
For more information, see <<built-in-roles>> and <<privileges-list-cluster>>.
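If you create a dedicated user for this purpose, you can assign those roles
with the {xpack} security user API. This is a minimal sketch; the user name and
password are placeholders that you should replace with your own values:

[source,js]
----
POST /_xpack/security/user/ml_tutorial_user
{
  "password" : "replace-with-a-strong-password",
  "roles" : [ "machine_learning_admin", "kibana_user" ],
  "full_name" : "Machine learning tutorial user"
}
----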
include::getting-started-data.asciidoc[]
include::getting-started-wizards.asciidoc[]
include::getting-started-single.asciidoc[]
include::getting-started-multi.asciidoc[]
include::getting-started-forecast.asciidoc[]
include::getting-started-next.asciidoc[]

View File

@ -0,0 +1,27 @@
[[xpack-ml]]
= Machine Learning in the Elastic Stack
[partintro]
--
Machine learning is tightly integrated with the Elastic Stack. Data is pulled
from {es} for analysis and anomaly results are displayed in {kib} dashboards.
* <<ml-overview>>
* <<ml-getting-started>>
* <<ml-configuring>>
* <<stopping-ml>>
* <<ml-troubleshooting, Troubleshooting Machine Learning>>
* <<ml-api-quickref>>
* <<ml-functions>>
--
include::overview.asciidoc[]
include::getting-started.asciidoc[]
include::configuring.asciidoc[]
include::stopping-ml.asciidoc[]
// include::ml-scenarios.asciidoc[]
include::api-quickref.asciidoc[]
//include::troubleshooting.asciidoc[] Referenced from x-pack/docs/public/xpack-troubleshooting.asciidoc
include::functions.asciidoc[]

View File

@ -0,0 +1,33 @@
[[ml-jobs]]
=== Machine Learning Jobs
++++
<titleabbrev>Jobs</titleabbrev>
++++
Machine learning jobs contain the configuration information and metadata
necessary to perform an analytics task.
Each job has one or more _detectors_. A detector applies an analytical function
to specific fields in your data. For more information about the types of
analysis you can perform, see <<ml-functions>>.
A job can also contain properties that affect which types of entities or events
are considered anomalous. For example, you can specify whether entities are
analyzed relative to their own previous behavior or relative to other entities
in a population. There are also multiple options for splitting the data into
categories and partitions. Some of these more advanced job configurations
are described in the following section: <<ml-configuring>>.
For a description of all the job properties, see
{ref}/ml-job-resource.html[Job Resources].
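For example, the following sketch creates a job whose single detector models
the mean of a metric for each entity and partitions the analysis by a second
field. The job name and field names are hypothetical placeholders; substitute
fields from your own data:

[source,js]
----
PUT _xpack/ml/anomaly_detectors/example-split-job
{
  "analysis_config": {
    "bucket_span": "15m",
    "detectors": [
      {
        "function": "mean",
        "field_name": "response_time",
        "by_field_name": "service",
        "partition_field_name": "host"
      }
    ]
  },
  "data_description": {
    "time_field": "@timestamp"
  }
}
----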
In {kib}, there are wizards that help you create specific types of jobs, such
as _single metric_, _multi-metric_, and _population_ jobs. A single metric job
is just a job with a single detector and limited job properties. To have access
to all of the job properties in {kib}, you must choose the _advanced_ job wizard.
If you want to try creating single and multi-metric jobs in {kib} with sample
data, see <<ml-getting-started>>.
You can also optionally assign jobs to one or more _job groups_. You can use
job groups to view the results from multiple jobs more easily and to expedite
administrative tasks by opening or closing multiple jobs at once.
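For example, the following sketch adds an existing job to a hypothetical
`production` group with the update job API and then closes every job in that
group with a single request; the job and group names are placeholders:

[source,js]
----
# Assign the job to a group
POST _xpack/ml/anomaly_detectors/example-job/_update
{
  "groups": [ "production" ]
}

# Close all jobs in the group
POST _xpack/ml/anomaly_detectors/production/_close
----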

Some files were not shown because too many files have changed in this diff