[7.x] [ML] adding docs + hlrc for data frame analysis feature_processors (#61149) (#61493)

* [ML] adding docs + hlrc for data frame analysis feature_processors (#61149)

Adds HLRC and some docs for the new feature_processors field in Data frame analytics.

Co-authored-by: Przemysław Witek <przemyslaw.witek@elastic.co>
Co-authored-by: Lisa Cawley <lcawley@elastic.co>
This commit is contained in:
Benjamin Trent 2020-08-24 12:56:21 -04:00 committed by GitHub
parent d05649bfae
commit 1ae2923632
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 254 additions and 116 deletions

View File

@ -18,6 +18,8 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis {
static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective");
static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes");
static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");
@SuppressWarnings("unchecked")
private static final ConstructingObjectParser<Classification, Void> PARSER =
new ConstructingObjectParser<>(
NAME.getPreferredName(),
@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis {
(Double) a[8],
(Integer) a[9],
(Long) a[10],
(ClassAssignmentObjective) a[11]));
(ClassAssignmentObjective) a[11],
(List<PreProcessor>) a[12]));
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE);
@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis {
PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
PARSER.declareString(
ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
(p, c, n) -> p.namedObject(PreProcessor.class, n, c),
(classification) -> {},
FEATURE_PROCESSORS);
}
private final String dependentVariable;
@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis {
private final ClassAssignmentObjective classAssignmentObjective;
private final Integer numTopClasses;
private final Long randomizeSeed;
private final List<PreProcessor> featureProcessors;
private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
@Nullable Integer maxTrees, @Nullable Double featureBagFraction,
@Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
@Nullable Double trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed,
@Nullable ClassAssignmentObjective classAssignmentObjective) {
@Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List<PreProcessor> featureProcessors) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
this.lambda = lambda;
this.gamma = gamma;
@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis {
this.classAssignmentObjective = classAssignmentObjective;
this.numTopClasses = numTopClasses;
this.randomizeSeed = randomizeSeed;
this.featureProcessors = featureProcessors;
}
@Override
@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis {
return numTopClasses;
}
/**
 * Returns the feature processors held by this classification analysis.
 *
 * @return the list supplied at construction time, or {@code null} if none was supplied
 */
public List<PreProcessor> getFeatureProcessors() {
return featureProcessors;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis {
if (numTopClasses != null) {
builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses);
}
if (featureProcessors != null) {
NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
}
builder.endObject();
return builder;
}
@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis {
@Override
public int hashCode() {
return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective);
predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors);
}
@Override
@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis {
&& Objects.equals(trainingPercent, that.trainingPercent)
&& Objects.equals(randomizeSeed, that.randomizeSeed)
&& Objects.equals(numTopClasses, that.numTopClasses)
&& Objects.equals(classAssignmentObjective, that.classAssignmentObjective);
&& Objects.equals(classAssignmentObjective, that.classAssignmentObjective)
&& Objects.equals(featureProcessors, that.featureProcessors);
}
@Override
@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis {
private Integer numTopClasses;
private Long randomizeSeed;
private ClassAssignmentObjective classAssignmentObjective;
private List<PreProcessor> featureProcessors;
private Builder(String dependentVariable) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis {
return this;
}
/**
 * Sets the feature processors that {@link #build()} hands to the new {@link Classification}.
 *
 * @param featureProcessors the processors to use; the reference is stored as given (no copy is made)
 * @return this builder, to allow call chaining
 */
public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
this.featureProcessors = featureProcessors;
return this;
}
public Classification build() {
return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed,
classAssignmentObjective);
classAssignmentObjective, featureProcessors);
}
}
}

View File

@ -18,6 +18,8 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis {
static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
static final ParseField LOSS_FUNCTION = new ParseField("loss_function");
static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter");
static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");
@SuppressWarnings("unchecked")
private static final ConstructingObjectParser<Regression, Void> PARSER =
new ConstructingObjectParser<>(
NAME.getPreferredName(),
@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis {
(Double) a[8],
(Long) a[9],
(LossFunction) a[10],
(Double) a[11]
(Double) a[11],
(List<PreProcessor>) a[12]
));
static {
@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis {
PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION);
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
(p, c, n) -> p.namedObject(PreProcessor.class, n, c),
(regression) -> {},
FEATURE_PROCESSORS);
}
private final String dependentVariable;
@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis {
private final Long randomizeSeed;
private final LossFunction lossFunction;
private final Double lossFunctionParameter;
private final List<PreProcessor> featureProcessors;
private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
@Nullable Integer maxTrees, @Nullable Double featureBagFraction,
@Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
@Nullable Double trainingPercent, @Nullable Long randomizeSeed, @Nullable LossFunction lossFunction,
@Nullable Double lossFunctionParameter) {
@Nullable Double lossFunctionParameter, @Nullable List<PreProcessor> featureProcessors) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
this.lambda = lambda;
this.gamma = gamma;
@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis {
this.randomizeSeed = randomizeSeed;
this.lossFunction = lossFunction;
this.lossFunctionParameter = lossFunctionParameter;
this.featureProcessors = featureProcessors;
}
@Override
@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis {
return lossFunctionParameter;
}
/**
 * Returns the feature processors held by this regression analysis.
 *
 * @return the list supplied at construction time, or {@code null} if none was supplied
 */
public List<PreProcessor> getFeatureProcessors() {
return featureProcessors;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis {
if (lossFunctionParameter != null) {
builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter);
}
if (featureProcessors != null) {
NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
}
builder.endObject();
return builder;
}
@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis {
@Override
public int hashCode() {
return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors);
}
@Override
@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis {
&& Objects.equals(trainingPercent, that.trainingPercent)
&& Objects.equals(randomizeSeed, that.randomizeSeed)
&& Objects.equals(lossFunction, that.lossFunction)
&& Objects.equals(lossFunctionParameter, that.lossFunctionParameter);
&& Objects.equals(lossFunctionParameter, that.lossFunctionParameter)
&& Objects.equals(featureProcessors, that.featureProcessors);
}
@Override
@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis {
private Long randomizeSeed;
private LossFunction lossFunction;
private Double lossFunctionParameter;
private List<PreProcessor> featureProcessors;
private Builder(String dependentVariable) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis {
return this;
}
/**
 * Sets the feature processors that {@link #build()} hands to the new {@link Regression}.
 *
 * @param featureProcessors the processors to use; the reference is stored as given (no copy is made)
 * @return this builder, to allow call chaining
 */
public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
this.featureProcessors = featureProcessors;
return this;
}
public Regression build() {
return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter,
featureProcessors);
}
}

View File

@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor {
return Objects.hash(field, hotMap, custom);
}
public Builder builder(String field) {
public static Builder builder(String field) {
return new Builder(field);
}

View File

@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests;
import org.elasticsearch.client.ml.inference.TrainedModelInput;
import org.elasticsearch.client.ml.inference.TrainedModelStats;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig;
import org.elasticsearch.client.ml.inference.trainedmodel.TargetType;
import org.elasticsearch.client.ml.job.config.AnalysisConfig;
@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
.setRandomizeSeed(1234L) // <10>
.setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11>
.setNumTopClasses(1) // <12>
.setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
.addOneHot("cat", "cat_column")
.build()))
.build();
// end::put-data-frame-analytics-classification
@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
.setRandomizeSeed(1234L) // <10>
.setLossFunction(Regression.LossFunction.MSE) // <11>
.setLossFunctionParameter(1.0) // <12>
.setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
.addOneHot("cat", "cat_column")
.build()))
.build();
// end::put-data-frame-analytics-regression

View File

@ -18,10 +18,20 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ClassificationTests extends AbstractXContentTestCase<Classification> {
@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
.setRandomizeSeed(randomBoolean() ? null : randomLong())
.setClassAssignmentObjective(randomBoolean() ? null : randomFrom(Classification.ClassAssignmentObjective.values()))
.setNumTopClasses(randomBoolean() ? null : randomIntBetween(0, 10))
.setFeatureProcessors(randomBoolean() ? null :
Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom()))
.limit(randomIntBetween(1, 10))
.collect(Collectors.toList()))
.build();
}
/**
 * Returns a predicate that matches every field path starting with {@code feature_processors}.
 * NOTE(review): presumably this stops the base test case from inserting random unknown fields
 * inside the feature processor objects -- confirm against AbstractXContentTestCase.
 */
@Override
protected Predicate<String> getRandomFieldsExcludeFilter() {
return field -> field.startsWith("feature_processors");
}
@Override
protected Classification createTestInstance() {
return randomClassification();
@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
protected boolean supportsUnknownFields() {
return true;
}
/**
 * Builds the registry used by this test from the named XContent parsers supplied by
 * {@link MlInferenceNamedXContentProvider}.
 * NOTE(review): presumably required so the pre-processors under {@code feature_processors}
 * can be resolved by name while parsing -- confirm against the provider's entries.
 */
@Override
protected NamedXContentRegistry xContentRegistry() {
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}

View File

@ -20,6 +20,7 @@
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.Version;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase<Data
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new SearchModule(Settings.EMPTY, false, Collections.emptyList()).getNamedXContents());
namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers());
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}

View File

@ -18,10 +18,20 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class RegressionTests extends AbstractXContentTestCase<Regression> {
@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
.setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true))
.setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values()))
.setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true))
.setFeatureProcessors(randomBoolean() ? null :
Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom()))
.limit(randomIntBetween(1, 10))
.collect(Collectors.toList()))
.build();
}
/**
 * Returns a predicate that matches every field path starting with {@code feature_processors}.
 * NOTE(review): presumably this stops the base test case from inserting random unknown fields
 * inside the feature processor objects -- confirm against AbstractXContentTestCase.
 */
@Override
protected Predicate<String> getRandomFieldsExcludeFilter() {
return field -> field.startsWith("feature_processors");
}
@Override
protected Regression createTestInstance() {
return randomRegression();
@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
protected boolean supportsUnknownFields() {
return true;
}
/**
 * Builds the registry used by this test from the named XContent parsers supplied by
 * {@link MlInferenceNamedXContentProvider}.
 * NOTE(review): presumably required so the pre-processors under {@code feature_processors}
 * can be resolved by name while parsing -- confirm against the provider's entries.
 */
@Override
protected NamedXContentRegistry xContentRegistry() {
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}

View File

@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification]
<10> The seed to be used by the random generator that picks which rows are used in training.
<11> The optimization objective to target when assigning class labels. Defaults to maximize_minimum_recall.
<12> The number of top classes to be reported in the results. Defaults to 2.
<13> Custom feature processors that will create new features for analysis from the included document
fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.
===== Regression
@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression]
<10> The seed to be used by the random generator that picks which rows are used in training.
<11> The loss function used for regression. Defaults to `mse`.
<12> An optional parameter to the loss function.
<13> Custom feature processors that will create new features for analysis from the included document
fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.
==== Analyzed fields

View File

@ -25,7 +25,7 @@ If the {es} {security-features} are enabled, you must have the following built-i
* `machine_learning_admin`
* source indices: `read`, `view_index_metadata`
* destination index: `read`, `create_index`, `manage` and `index`
For more information, see <<built-in-roles>>, <<security-privileges>>, and
{ml-docs-setup-privileges}.
@ -33,20 +33,20 @@ For more information, see <<built-in-roles>>, <<security-privileges>>, and
NOTE: The {dfanalytics-job} remembers which roles the user who created it had at
the time of creation. When you start the job, it performs the analysis using
those same roles. If you provide
<<http-clients-secondary-authorization,secondary authorization headers>>,
<<http-clients-secondary-authorization,secondary authorization headers>>,
those credentials are used instead.
[[ml-put-dfanalytics-desc]]
== {api-description-title}
This API creates a {dfanalytics-job} that performs an analysis on the source
This API creates a {dfanalytics-job} that performs an analysis on the source
indices and stores the outcome in a destination index.
If the destination index does not exist, it is created automatically when you
start the job. See <<start-dfanalytics>>.
If you supply only a subset of the {regression} or {classification} parameters,
{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
determines a value for each of the undefined parameters.
[[ml-put-dfanalytics-path-params]]
@ -61,9 +61,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def
== {api-request-body-title}
`allow_lazy_start`::
(Optional, boolean)
Specifies whether this job can start when there is insufficient {ml} node
capacity for it to be immediately assigned to a node. The default is `false`; if
(Optional, boolean)
Specifies whether this job can start when there is insufficient {ml} node
capacity for it to be immediately assigned to a node. The default is `false`; if
a {ml} node with capacity to run the job cannot immediately be found, the
<<start-dfanalytics>> API returns an error. However, this is also subject to the
cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. See <<advanced-ml-settings>>.
@ -86,7 +86,7 @@ one of the following types of analysis: {classification}, {oldetection}, or
The configuration information necessary to perform
{ml-docs}/dfa-classification.html[{classification}].
+
TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
automatically by hyperparameter optimization to give the minimum validation
error. It is highly recommended to use the default values unless you fully
understand the function of these parameters.
@ -108,23 +108,27 @@ categorical (`ip` or `keyword`), or boolean. There must be no more than 30
different values in this field.
`eta`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
`feature_bag_fraction`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]
`feature_processors`::::
(Optional, list)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
`gamma`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]
`lambda`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]
`max_trees`::::
(Optional, integer)
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]
`num_top_classes`::::
@ -136,11 +140,11 @@ categories, the API reports all category probabilities. Defaults to 2.
`num_top_feature_importance_values`::::
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
By default, it is zero and no {feat-imp} calculation occurs.
`prediction_field_name`::::
(Optional, string)
(Optional, string)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name]
`randomize_seed`::::
@ -164,23 +168,23 @@ The configuration information necessary to perform
`compute_feature_influence`::::
(Optional, boolean)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence]
`feature_influence_threshold`::::
`feature_influence_threshold`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold]
`method`::::
(Optional, string)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method]
`n_neighbors`::::
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors]
`outlier_fraction`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction]
`standardization_enabled`::::
(Optional, boolean)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
@ -192,7 +196,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
The configuration information necessary to perform
{ml-docs}/dfa-regression.html[{regression}].
+
TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
automatically by hyperparameter optimization to give minimum validation error.
It is highly recommended to use the default values unless you fully understand
the function of these parameters.
@ -215,20 +219,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]
`feature_processors`::::
(Optional, list)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
`gamma`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]
`lambda`::::
(Optional, double)
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]
`loss_function`::::
(Optional, string)
The loss function used during {regression}. Available options are `mse` (mean
squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber
loss). Defaults to `mse`. Refer to
{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
The loss function used during {regression}. Available options are `mse` (mean
squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber
loss). Defaults to `mse`. Refer to
{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
to learn more.
`loss_function_parameter`::::
@ -236,13 +244,13 @@ to learn more.
A positive number that is used as a parameter to the `loss_function`.
`max_trees`::::
(Optional, integer)
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]
`num_top_feature_importance_values`::::
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
By default, it is zero and no {feat-imp} calculation occurs.
`prediction_field_name`::::
@ -264,31 +272,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent]
//Begin analyzed_fields
`analyzed_fields`::
(Optional, object)
Specify `includes` and/or `excludes` patterns to select which fields will be
included in the analysis. The patterns specified in `excludes` are applied last,
therefore `excludes` takes precedence. In other words, if the same field is
specified in both `includes` and `excludes`, then the field will not be included
Specify `includes` and/or `excludes` patterns to select which fields will be
included in the analysis. The patterns specified in `excludes` are applied last,
therefore `excludes` takes precedence. In other words, if the same field is
specified in both `includes` and `excludes`, then the field will not be included
in the analysis.
+
--
[[dfa-supported-fields]]
The supported fields for each type of analysis are as follows:
* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
don't support missing values therefore fields that have data types other than
numeric or boolean are ignored. Documents where included fields contain missing
values, null values, or an array are also ignored. Therefore the `dest` index
* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
don't support missing values therefore fields that have data types other than
numeric or boolean are ignored. Documents where included fields contain missing
values, null values, or an array are also ignored. Therefore the `dest` index
may contain documents that don't have an {olscore}.
* {regression-cap} supports fields that are numeric, `boolean`, `text`,
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also
ignored. Documents in the `dest` index that dont contain a results field are
* {regression-cap} supports fields that are numeric, `boolean`, `text`,
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also
ignored. Documents in the `dest` index that don't contain a results field are
not included in the {reganalysis}.
* {classification-cap} supports fields that are numeric, `boolean`, `text`,
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also ignored.
where included fields contain an array with two or more values are also ignored.
Documents in the `dest` index that don't contain a results field are not
included in the {classanalysis}. {classanalysis-cap} can be improved by mapping
ordinal variable values to a single number. For example, in case of age ranges,
@ -310,7 +318,7 @@ analysis. You do not need to add fields with unsupported data types to
`includes`:::
(Optional, array)
An array of strings that defines the fields that will be included in the
An array of strings that defines the fields that will be included in the
analysis.
//End analyzed_fields
====
@ -330,16 +338,16 @@ The default value is `1`. Using more threads may decrease the time
necessary to complete the analysis at the cost of using more CPU.
Note that the process may use additional threads for operational
functionality other than the analysis itself.
`model_memory_limit`::
(Optional, string)
The approximate maximum amount of memory resources that are permitted for
analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
setting, an error occurs when you try to create {dfanalytics-jobs} that have
`model_memory_limit` values greater than that setting. For more information, see
The approximate maximum amount of memory resources that are permitted for
analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
setting, an error occurs when you try to create {dfanalytics-jobs} that have
`model_memory_limit` values greater than that setting. For more information, see
<<ml-settings>>.
`source`::
(object)
The configuration of how to source the analysis data. It requires an `index`.
@ -353,7 +361,7 @@ Optionally, `query` and `_source` may be specified.
It can be a single index or index pattern as well as an array of indices or
patterns.
+
WARNING: If your source indices contain documents with the same IDs, only the
WARNING: If your source indices contain documents with the same IDs, only the
document that is indexed last appears in the destination index.
`query`:::
@ -374,7 +382,7 @@ included in the analysis.
`includes`::::
(array) An array of strings that defines the fields that will be included in the
destination.
`excludes`::::
(array) An array of strings that defines the fields that will be excluded from
the destination.
@ -390,8 +398,8 @@ the destination.
[[ml-put-dfanalytics-example-preprocess]]
=== Preprocessing actions example
The following example shows how to limit the scope of the analysis to certain
fields, specify excluded fields in the destination index, and use a query to
The following example shows how to limit the scope of the analysis to certain
fields, specify excluded fields in the destination index, and use a query to
filter your data before analysis.
[source,console]
@ -404,7 +412,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
],
"query": { <2>
"range": {
"DistanceKilometers": {
"DistanceKilometers": {
"gt": 0
}
}
@ -429,7 +437,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
},
"analyzed_fields": { <5>
"includes": [],
"excludes": [
"excludes": [
"FlightNum"
]
},
@ -439,29 +447,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
// TEST[skip:setup kibana sample data]
<1> Source index to analyze.
<2> This query filters out entire documents that will not be present in the
<2> This query filters out entire documents that will not be present in the
destination index.
<3> The `_source` object defines fields in the dataset that will be included or
excluded in the destination index.
<4> Defines the destination index that contains the results of the analysis and
the fields of the source index specified in the `_source` object. Also defines
<3> The `_source` object defines fields in the dataset that will be included or
excluded in the destination index.
<4> Defines the destination index that contains the results of the analysis and
the fields of the source index specified in the `_source` object. Also defines
the name of the `results_field`.
<5> Specifies fields to be included in or excluded from the analysis. This does
not affect whether the fields will be present in the destination index, only
<5> Specifies fields to be included in or excluded from the analysis. This does
not affect whether the fields will be present in the destination index, only
affects whether they are used in the analysis.
In this example, we can see that all the fields of the source index are included
in the destination index except `FlightDelay` and `FlightDelayType` because
these are defined as excluded fields by the `excludes` parameter of the
`_source` object. The `FlightNum` field is included in the destination index,
however it is not included in the analysis because it is explicitly specified as
In this example, we can see that all the fields of the source index are included
in the destination index except `FlightDelay` and `FlightDelayType` because
these are defined as excluded fields by the `excludes` parameter of the
`_source` object. The `FlightNum` field is included in the destination index,
however it is not included in the analysis because it is explicitly specified as
an excluded field by the `excludes` parameter of the `analyzed_fields` object.
[[ml-put-dfanalytics-example-od]]
=== {oldetection-cap} example
The following example creates the `loganalytics` {dfanalytics-job}, the analysis
The following example creates the `loganalytics` {dfanalytics-job}, the analysis
type is `outlier_detection`:
[source,console]
@ -525,7 +533,7 @@ The API returns the following result:
[[ml-put-dfanalytics-example-r]]
=== {regression-cap} examples
The following example creates the `house_price_regression_analysis`
The following example creates the `house_price_regression_analysis`
{dfanalytics-job}, the analysis type is `regression`:
[source,console]
@ -538,7 +546,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis
"dest": {
"index": "house_price_predictions"
},
"analysis":
"analysis":
{
"regression": {
"dependent_variable": "price"
@ -614,7 +622,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3
[[ml-put-dfanalytics-example-c]]
=== {classification-cap} example
The following example creates the `loan_classification` {dfanalytics-job}, the
The following example creates the `loan_classification` {dfanalytics-job}, the
analysis type is `classification`:
[source,console]

View File

@ -453,10 +453,10 @@ Defaults to `true`.
end::delayed-data-check-config[]
tag::dependent-variable[]
Defines which field of the document is to be predicted.
This parameter is supplied by field name and must match one of the fields in
the index being used to train. If this field is missing from a document, then
that document will not be used for training, but a prediction with the trained
Defines which field of the document is to be predicted.
This parameter is supplied by field name and must match one of the fields in
the index being used to train. If this field is missing from a document, then
that document will not be used for training, but a prediction with the trained
model will be generated for it. It is also known as the continuous target variable.
end::dependent-variable[]
@ -513,10 +513,18 @@ The value of the downsample factor.
end::dfas-downsample-factor[]
tag::dfas-eta-growth[]
Specifies the rate at which the `eta` increases for each new tree that is added to the
Specifies the rate at which the `eta` increases for each new tree that is added to the
forest. For example, a rate of `1.05` increases `eta` by 5%.
end::dfas-eta-growth[]
tag::dfas-feature-processors[]
A collection of feature preprocessors that modify one or more included fields.
The analysis uses the resulting one or more features instead of the
original document field. Multiple `feature_processors` entries can refer to the
same document fields.
Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs.
end::dfas-feature-processors[]
tag::dfas-iteration[]
The number of iterations on the analysis.
end::dfas-iteration[]
@ -529,9 +537,9 @@ training stops.
end::dfas-max-attempts[]
tag::dfas-max-optimization-rounds[]
A multiplier responsible for determining the maximum number of
hyperparameter optimization steps in the Bayesian optimization procedure.
The maximum number of steps is determined based on the number of undefined hyperparameters
A multiplier responsible for determining the maximum number of
hyperparameter optimization steps in the Bayesian optimization procedure.
The maximum number of steps is determined based on the number of undefined hyperparameters
times the maximum optimization rounds per hyperparameter.
end::dfas-max-optimization-rounds[]
@ -595,10 +603,10 @@ functions that are tolerant to gaps in data such as `mean`, `non_null_sum` or
end::empty-bucket-count[]
tag::eta[]
Advanced configuration option. The shrinkage applied to the weights. Smaller
Advanced configuration option. The shrinkage applied to the weights. Smaller
values result in larger forests which have a better generalization error.
However, the smaller the value the longer the training will take. For more
information about shrinkage, see
information about shrinkage, see
{wikipedia}/Gradient_boosting#Shrinkage[this wiki article]. By
default, this value is calculated during hyperparameter optimization.
end::eta[]
@ -624,13 +632,13 @@ this value to determine the number of unique categories that were missed.
end::failed-category-count[]
tag::feature-bag-fraction[]
Advanced configuration option. Defines the fraction of features that will be
Advanced configuration option. Defines the fraction of features that will be
used when selecting a random bag for each candidate split. By default, this
value is calculated during hyperparameter optimization.
end::feature-bag-fraction[]
tag::feature-influence-threshold[]
The minimum {olscore} that a document needs to have to calculate its feature
The minimum {olscore} that a document needs to have to calculate its feature
influence score. Value range: 0-1 (`0.1` by default).
end::feature-influence-threshold[]
@ -675,10 +683,10 @@ The analysis function that is used. For example, `count`, `rare`, `mean`, `min`,
end::function[]
tag::gamma[]
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies a linear penalty associated with the size of
individual trees in the forest. The higher the value the more training will
prefer smaller trees. The smaller this parameter the larger individual trees
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies a linear penalty associated with the size of
individual trees in the forest. The higher the value the more training will
prefer smaller trees. The smaller this parameter the larger individual trees
will be and the longer training will take. By default, this value is calculated
during hyperparameter optimization.
end::gamma[]
@ -798,8 +806,8 @@ information for all {anomaly-jobs}.
end::job-id-anomaly-detection-default[]
tag::job-id-anomaly-detection-define[]
Identifier for the {anomaly-job}. This identifier can contain lowercase
alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
Identifier for the {anomaly-job}. This identifier can contain lowercase
alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
and end with alphanumeric characters.
end::job-id-anomaly-detection-define[]
@ -843,12 +851,12 @@ For more information, see <<ml-jobstats>>.
end::jobs-stats-anomaly-detection[]
tag::lambda[]
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies an L2 regularization term which applies to
leaf weights of the individual trees in the forest. The higher the value the
more training will attempt to keep leaf weights small. This makes the prediction
function smoother at the expense of potentially not being able to capture
relevant relationships between the features and the {depvar}. The smaller this
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies an L2 regularization term which applies to
leaf weights of the individual trees in the forest. The higher the value the
more training will attempt to keep leaf weights small. This makes the prediction
function smoother at the expense of potentially not being able to capture
relevant relationships between the features and the {depvar}. The smaller this
parameter the larger individual trees will be and the longer training will take.
By default, this value is calculated during hyperparameter optimization.
end::lambda[]
@ -1098,8 +1106,8 @@ For open jobs only, the elapsed time for which the job has been open.
end::open-time[]
tag::outlier-fraction[]
The proportion of the data set that is assumed to be outlying prior to
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
The proportion of the data set that is assumed to be outlying prior to
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
outliers and 95% are inliers.
end::outlier-fraction[]
@ -1185,7 +1193,7 @@ tag::randomize-seed[]
Defines the seed for the random generator that is used to pick
which documents will be used for training. By default it is randomly generated.
Set it to a specific value to ensure the same documents are used for training
assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
the same.
end::randomize-seed[]
@ -1264,8 +1272,8 @@ end::sparse-bucket-count[]
tag::standardization-enabled[]
If `true`, the following operation is performed on the columns before computing
{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
more information about this concept, see
{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
more information about this concept, see
https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization)[Wikipedia].
end::standardization-enabled[]
@ -1340,12 +1348,12 @@ when the mode is set to `manual`. For example: `3h`.
end::time-span[]
tag::timeout-start[]
Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
to 20 seconds.
end::timeout-start[]
tag::timeout-stop[]
Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
to 20 seconds.
end::timeout-stop[]