[7.x] [ML] adding docs + hlrc for data frame analysis feature_processors () ()

* [ML] adding docs + hlrc for data frame analysis feature_processors ()

Adds HLRC support and documentation for the new `feature_processors` field in data frame analytics.
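For example, with this change a classification analysis can declare a one-hot encoding up front. A minimal sketch (field, value, and column names are placeholders; the entry point is the existing HLRC `Classification.builder`):

[source,java]
----
import java.util.Arrays;

import org.elasticsearch.client.ml.dataframe.Classification;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

// Build a classification analysis whose "categorical_feature" field is one-hot encoded
// before training, in addition to the automatic categorical encoding.
Classification classification = Classification.builder("my_dependent_variable")
    .setFeatureProcessors(Arrays.asList(
        OneHotEncoding.builder("categorical_feature")
            .addOneHot("cat", "cat_column")
            .build()))
    .build();
----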

Co-authored-by: Przemysław Witek <przemyslaw.witek@elastic.co>
Co-authored-by: Lisa Cawley <lcawley@elastic.co>
Benjamin Trent 2020-08-24 12:56:21 -04:00 committed by GitHub
parent d05649bfae
commit 1ae2923632
10 changed files with 254 additions and 116 deletions
client/rest-high-level/src
main/java/org/elasticsearch/client/ml
test/java/org/elasticsearch/client
docs
java-rest/high-level/ml
reference/ml

@@ -18,6 +18,8 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
@@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
@@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis {
static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective");
static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes");
static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");
@SuppressWarnings("unchecked")
private static final ConstructingObjectParser<Classification, Void> PARSER =
new ConstructingObjectParser<>(
NAME.getPreferredName(),
@@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis {
(Double) a[8],
(Integer) a[9],
(Long) a[10],
(ClassAssignmentObjective) a[11],
(List<PreProcessor>) a[12]));
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE);
@@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis {
PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
PARSER.declareString(
ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
(p, c, n) -> p.namedObject(PreProcessor.class, n, c),
(classification) -> {},
FEATURE_PROCESSORS);
}
private final String dependentVariable;
@@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis {
private final ClassAssignmentObjective classAssignmentObjective;
private final Integer numTopClasses;
private final Long randomizeSeed;
private final List<PreProcessor> featureProcessors;
private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
@Nullable Integer maxTrees, @Nullable Double featureBagFraction,
@Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
@Nullable Double trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed,
@Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List<PreProcessor> featureProcessors) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
this.lambda = lambda;
this.gamma = gamma;
@@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis {
this.classAssignmentObjective = classAssignmentObjective;
this.numTopClasses = numTopClasses;
this.randomizeSeed = randomizeSeed;
this.featureProcessors = featureProcessors;
}
@Override
@@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis {
return numTopClasses;
}
public List<PreProcessor> getFeatureProcessors() {
return featureProcessors;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis {
if (numTopClasses != null) {
builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses);
}
if (featureProcessors != null) {
NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
}
builder.endObject();
return builder;
}
@@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis {
@Override
public int hashCode() {
return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors);
}
@Override
@@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis {
&& Objects.equals(trainingPercent, that.trainingPercent)
&& Objects.equals(randomizeSeed, that.randomizeSeed)
&& Objects.equals(numTopClasses, that.numTopClasses)
&& Objects.equals(classAssignmentObjective, that.classAssignmentObjective)
&& Objects.equals(featureProcessors, that.featureProcessors);
}
@Override
@@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis {
private Integer numTopClasses;
private Long randomizeSeed;
private ClassAssignmentObjective classAssignmentObjective;
private List<PreProcessor> featureProcessors;
private Builder(String dependentVariable) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
@@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis {
return this;
}
public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
this.featureProcessors = featureProcessors;
return this;
}
public Classification build() {
return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed,
classAssignmentObjective, featureProcessors);
}
}
}

@@ -18,6 +18,8 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
@@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
@@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis {
static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
static final ParseField LOSS_FUNCTION = new ParseField("loss_function");
static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter");
static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");
@SuppressWarnings("unchecked")
private static final ConstructingObjectParser<Regression, Void> PARSER =
new ConstructingObjectParser<>(
NAME.getPreferredName(),
@@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis {
(Double) a[8],
(Long) a[9],
(LossFunction) a[10],
(Double) a[11],
(List<PreProcessor>) a[12]
));
static {
@@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis {
PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION);
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
(p, c, n) -> p.namedObject(PreProcessor.class, n, c),
(regression) -> {},
FEATURE_PROCESSORS);
}
private final String dependentVariable;
@@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis {
private final Long randomizeSeed;
private final LossFunction lossFunction;
private final Double lossFunctionParameter;
private final List<PreProcessor> featureProcessors;
private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
@Nullable Integer maxTrees, @Nullable Double featureBagFraction,
@Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
@Nullable Double trainingPercent, @Nullable Long randomizeSeed, @Nullable LossFunction lossFunction,
@Nullable Double lossFunctionParameter, @Nullable List<PreProcessor> featureProcessors) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
this.lambda = lambda;
this.gamma = gamma;
@@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis {
this.randomizeSeed = randomizeSeed;
this.lossFunction = lossFunction;
this.lossFunctionParameter = lossFunctionParameter;
this.featureProcessors = featureProcessors;
}
@Override
@@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis {
return lossFunctionParameter;
}
public List<PreProcessor> getFeatureProcessors() {
return featureProcessors;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis {
if (lossFunctionParameter != null) {
builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter);
}
if (featureProcessors != null) {
NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
}
builder.endObject();
return builder;
}
@@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis {
@Override
public int hashCode() {
return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors);
}
@Override
@@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis {
&& Objects.equals(trainingPercent, that.trainingPercent)
&& Objects.equals(randomizeSeed, that.randomizeSeed)
&& Objects.equals(lossFunction, that.lossFunction)
&& Objects.equals(lossFunctionParameter, that.lossFunctionParameter)
&& Objects.equals(featureProcessors, that.featureProcessors);
}
@Override
@@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis {
private Long randomizeSeed;
private LossFunction lossFunction;
private Double lossFunctionParameter;
private List<PreProcessor> featureProcessors;
private Builder(String dependentVariable) {
this.dependentVariable = Objects.requireNonNull(dependentVariable);
@@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis {
return this;
}
public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
this.featureProcessors = featureProcessors;
return this;
}
public Regression build() {
return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter,
featureProcessors);
}
}

@@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor {
return Objects.hash(field, hotMap, custom);
}
public static Builder builder(String field) {
return new Builder(field);
}
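Since `builder` is now static, an `OneHotEncoding` can be created without first constructing an instance. A minimal usage sketch (the field, value, and output column names below are placeholders):

[source,java]
----
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

// Encode the "animal" field: documents whose value is "cat" get a 1 in the "animal_cat" column.
OneHotEncoding encoding = OneHotEncoding.builder("animal")
    .addOneHot("cat", "animal_cat")
    .build();
----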

@@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests;
import org.elasticsearch.client.ml.inference.TrainedModelInput;
import org.elasticsearch.client.ml.inference.TrainedModelStats;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig;
import org.elasticsearch.client.ml.inference.trainedmodel.TargetType;
import org.elasticsearch.client.ml.job.config.AnalysisConfig;
@@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
.setRandomizeSeed(1234L) // <10>
.setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11>
.setNumTopClasses(1) // <12>
.setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
.addOneHot("cat", "cat_column")
.build()))
.build();
// end::put-data-frame-analytics-classification
@@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
.setRandomizeSeed(1234L) // <10>
.setLossFunction(Regression.LossFunction.MSE) // <11>
.setLossFunctionParameter(1.0) // <12>
.setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
.addOneHot("cat", "cat_column")
.build()))
.build();
// end::put-data-frame-analytics-regression
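The configured analysis then goes into a data frame analytics config and is submitted through the high-level client exactly as before; only the analysis object changes. A rough sketch of that wiring (job ID and index names are placeholders, `client` is assumed to be an existing `RestHighLevelClient`, and `classification` is the analysis built above):

[source,java]
----
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.ml.PutDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.PutDataFrameAnalyticsResponse;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsDest;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsSource;

// Wrap the analysis (including its feature_processors) in a job configuration.
DataFrameAnalyticsConfig config = DataFrameAnalyticsConfig.builder()
    .setId("loan_classification")
    .setSource(DataFrameAnalyticsSource.builder().setIndex("loan-applicants").build())
    .setDest(DataFrameAnalyticsDest.builder().setIndex("loan-predictions").build())
    .setAnalysis(classification)
    .build();

// Create the data frame analytics job.
PutDataFrameAnalyticsResponse response = client.machineLearning()
    .putDataFrameAnalytics(new PutDataFrameAnalyticsRequest(config), RequestOptions.DEFAULT);
----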

@@ -18,10 +18,20 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ClassificationTests extends AbstractXContentTestCase<Classification> {
@@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
.setRandomizeSeed(randomBoolean() ? null : randomLong())
.setClassAssignmentObjective(randomBoolean() ? null : randomFrom(Classification.ClassAssignmentObjective.values()))
.setNumTopClasses(randomBoolean() ? null : randomIntBetween(0, 10))
.setFeatureProcessors(randomBoolean() ? null :
Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom()))
.limit(randomIntBetween(1, 10))
.collect(Collectors.toList()))
.build();
}
@Override
protected Predicate<String> getRandomFieldsExcludeFilter() {
return field -> field.startsWith("feature_processors");
}
@Override
protected Classification createTestInstance() {
return randomClassification();
@@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
protected boolean supportsUnknownFields() {
return true;
}
@Override
protected NamedXContentRegistry xContentRegistry() {
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}
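The registry override matters because `feature_processors` entries are named objects: parsing a `Classification` that contains them back from JSON only works if the ML inference named-XContent parsers are registered. A rough sketch of such a round trip, under that assumption (the JSON string and variable names are purely illustrative):

[source,java]
----
import org.elasticsearch.client.ml.dataframe.Classification;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;

// Register the preprocessor parsers; without them the named "feature_processors" objects cannot be resolved.
NamedXContentRegistry registry =
    new NamedXContentRegistry(new MlInferenceNamedXContentProvider().getNamedXContentParsers());

String json = "{\"dependent_variable\":\"label\","
    + "\"feature_processors\":[{\"one_hot_encoding\":{\"field\":\"animal\",\"hot_map\":{\"cat\":\"animal_cat\"}}}]}";

try (XContentParser parser = XContentType.JSON.xContent()
         .createParser(registry, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, json)) {
    Classification parsed = Classification.fromXContent(parser);
}
----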

@@ -20,6 +20,7 @@
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.Version;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
@@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase<Data
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new SearchModule(Settings.EMPTY, false, Collections.emptyList()).getNamedXContents());
namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers());
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}

@@ -18,10 +18,20 @@
*/
package org.elasticsearch.client.ml.dataframe;
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class RegressionTests extends AbstractXContentTestCase<Regression> {
@@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
.setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true))
.setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values()))
.setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true))
.setFeatureProcessors(randomBoolean() ? null :
Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
OneHotEncodingTests.createRandom(),
TargetMeanEncodingTests.createRandom()))
.limit(randomIntBetween(1, 10))
.collect(Collectors.toList()))
.build();
}
@Override
protected Predicate<String> getRandomFieldsExcludeFilter() {
return field -> field.startsWith("feature_processors");
}
@Override
protected Regression createTestInstance() {
return randomRegression();
@@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
protected boolean supportsUnknownFields() {
return true;
}
@Override
protected NamedXContentRegistry xContentRegistry() {
List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
return new NamedXContentRegistry(namedXContent);
}
}

@@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification]
<10> The seed to be used by the random generator that picks which rows are used in training.
<11> The optimization objective to target when assigning class labels. Defaults to maximize_minimum_recall.
<12> The number of top classes to be reported in the results. Defaults to 2.
<13> Custom feature processors that will create new features for analysis from the included document
fields. Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.
===== Regression
@@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression]
<10> The seed to be used by the random generator that picks which rows are used in training.
<11> The loss function used for regression. Defaults to `mse`.
<12> An optional parameter to the loss function.
<13> Custom feature processors that will create new features for analysis from the included document
fields. Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.
==== Analyzed fields

@@ -25,7 +25,7 @@ If the {es} {security-features} are enabled, you must have the following built-i
* `machine_learning_admin`
* source indices: `read`, `view_index_metadata`
* destination index: `read`, `create_index`, `manage` and `index`
For more information, see <<built-in-roles>>, <<security-privileges>>, and
{ml-docs-setup-privileges}.
@@ -33,20 +33,20 @@ For more information, see <<built-in-roles>>, <<security-privileges>>, and
NOTE: The {dfanalytics-job} remembers which roles the user who created it had at
the time of creation. When you start the job, it performs the analysis using
those same roles. If you provide
<<http-clients-secondary-authorization,secondary authorization headers>>,
those credentials are used instead.
[[ml-put-dfanalytics-desc]]
== {api-description-title}
This API creates a {dfanalytics-job} that performs an analysis on the source
indices and stores the outcome in a destination index.
If the destination index does not exist, it is created automatically when you
start the job. See <<start-dfanalytics>>.
If you supply only a subset of the {regression} or {classification} parameters,
{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
determines a value for each of the undefined parameters.
[[ml-put-dfanalytics-path-params]]
@@ -61,9 +61,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def
== {api-request-body-title}
`allow_lazy_start`::
(Optional, boolean)
Specifies whether this job can start when there is insufficient {ml} node
capacity for it to be immediately assigned to a node. The default is `false`; if
a {ml} node with capacity to run the job cannot immediately be found, the
<<start-dfanalytics>> API returns an error. However, this is also subject to the
cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. See <<advanced-ml-settings>>.
@@ -86,7 +86,7 @@ one of the following types of analysis: {classification}, {oldetection}, or
The configuration information necessary to perform
{ml-docs}/dfa-classification.html[{classification}].
+
TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
automatically by hyperparameter optimization to give the minimum validation
error. It is highly recommended to use the default values unless you fully
understand the function of these parameters.
@@ -108,23 +108,27 @@ categorical (`ip` or `keyword`), or boolean. There must be no more than 30
different values in this field.
`eta`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
`feature_bag_fraction`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]
`feature_processors`::::
(Optional, list)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
`gamma`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]
`lambda`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]
`max_trees`::::
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]
`num_top_classes`::::
@@ -136,11 +140,11 @@ categories, the API reports all category probabilities. Defaults to 2.
`num_top_feature_importance_values`::::
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
By default, it is zero and no {feat-imp} calculation occurs.
`prediction_field_name`::::
(Optional, string)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name]
`randomize_seed`::::
@@ -164,23 +168,23 @@ The configuration information necessary to perform
`compute_feature_influence`::::
(Optional, boolean)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence]
`feature_influence_threshold`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold]
`method`::::
(Optional, string)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method]
`n_neighbors`::::
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors]
`outlier_fraction`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction]
`standardization_enabled`::::
(Optional, boolean)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
@@ -192,7 +196,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
The configuration information necessary to perform
{ml-docs}/dfa-regression.html[{regression}].
+
TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
automatically by hyperparameter optimization to give minimum validation error.
It is highly recommended to use the default values unless you fully understand
the function of these parameters.
@@ -215,20 +219,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]
`feature_processors`::::
(Optional, list)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
`gamma`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]
`lambda`::::
(Optional, double)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]
`loss_function`::::
(Optional, string)
The loss function used during {regression}. Available options are `mse` (mean
squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber
loss). Defaults to `mse`. Refer to
{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
to learn more.
`loss_function_parameter`::::
@@ -236,13 +244,13 @@ to learn more.
A positive number that is used as a parameter to the `loss_function`.
`max_trees`::::
(Optional, integer)
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]
`num_top_feature_importance_values`::::
(Optional, integer)
Advanced configuration option. Specifies the maximum number of
{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
By default, it is zero and no {feat-imp} calculation occurs.
`prediction_field_name`::::
@@ -264,31 +272,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent]
//Begin analyzed_fields
`analyzed_fields`::
(Optional, object)
Specify `includes` and/or `excludes` patterns to select which fields will be
included in the analysis. The patterns specified in `excludes` are applied last,
therefore `excludes` takes precedence. In other words, if the same field is
specified in both `includes` and `excludes`, then the field will not be included
in the analysis.
+
--
[[dfa-supported-fields]]
The supported fields for each type of analysis are as follows:
* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
don't support missing values therefore fields that have data types other than
numeric or boolean are ignored. Documents where included fields contain missing
values, null values, or an array are also ignored. Therefore the `dest` index
may contain documents that don't have an {olscore}.
* {regression-cap} supports fields that are numeric, `boolean`, `text`,
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also
ignored. Documents in the `dest` index that don't contain a results field are
not included in the {reganalysis}.
* {classification-cap} supports fields that are numeric, `boolean`, `text`,
`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
supported are included in the analysis, other fields are ignored. Documents
where included fields contain an array with two or more values are also ignored.
Documents in the `dest` index that don't contain a results field are not
included in the {classanalysis}. {classanalysis-cap} can be improved by mapping
ordinal variable values to a single number. For example, in case of age ranges,
@@ -310,7 +318,7 @@ analysis. You do not need to add fields with unsupported data types to
`includes`:::
(Optional, array)
An array of strings that defines the fields that will be included in the
analysis.
//End analyzed_fields
====
@@ -330,16 +338,16 @@ The default value is `1`. Using more threads may decrease the time
necessary to complete the analysis at the cost of using more CPU.
Note that the process may use additional threads for operational
functionality other than the analysis itself.
`model_memory_limit`::
(Optional, string)
The approximate maximum amount of memory resources that are permitted for
analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
setting, an error occurs when you try to create {dfanalytics-jobs} that have
`model_memory_limit` values greater than that setting. For more information, see
<<ml-settings>>.
`source`::
(object)
The configuration of how to source the analysis data. It requires an `index`.
@@ -353,7 +361,7 @@ Optionally, `query` and `_source` may be specified.
It can be a single index or index pattern as well as an array of indices or
patterns.
+
WARNING: If your source indices contain documents with the same IDs, only the
document that is indexed last appears in the destination index.
`query`:::
@@ -374,7 +382,7 @@ included in the analysis.
`includes`::::
(array) An array of strings that defines the fields that will be included in the
destination.
`excludes`::::
(array) An array of strings that defines the fields that will be excluded from
the destination.
@@ -390,8 +398,8 @@ the destination.
[[ml-put-dfanalytics-example-preprocess]]
=== Preprocessing actions example
The following example shows how to limit the scope of the analysis to certain
fields, specify excluded fields in the destination index, and use a query to
filter your data before analysis.
[source,console]
@@ -404,7 +412,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
],
"query": { <2>
"range": {
"DistanceKilometers": {
"gt": 0
}
}
@@ -429,7 +437,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
},
"analyzed_fields": { <5>
"includes": [],
"excludes": [
"FlightNum"
]
},
@@ -439,29 +447,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
// TEST[skip:setup kibana sample data]
<1> Source index to analyze.
<2> This query filters out entire documents that will not be present in the
destination index.
<3> The `_source` object defines fields in the dataset that will be included or
excluded in the destination index.
<4> Defines the destination index that contains the results of the analysis and
the fields of the source index specified in the `_source` object. Also defines
the name of the `results_field`.
<5> Specifies fields to be included in or excluded from the analysis. This does
not affect whether the fields will be present in the destination index, only
affects whether they are used in the analysis.
In this example, we can see that all the fields of the source index are included
in the destination index except `FlightDelay` and `FlightDelayType` because
these are defined as excluded fields by the `excludes` parameter of the
`_source` object. The `FlightNum` field is included in the destination index,
however it is not included in the analysis because it is explicitly specified as
an excluded field by the `excludes` parameter of the `analyzed_fields` object.
[[ml-put-dfanalytics-example-od]]
=== {oldetection-cap} example
The following example creates the `loganalytics` {dfanalytics-job}, the analysis
type is `outlier_detection`:
[source,console]
@@ -525,7 +533,7 @@ The API returns the following result:
[[ml-put-dfanalytics-example-r]]
=== {regression-cap} examples
The following example creates the `house_price_regression_analysis`
{dfanalytics-job}, the analysis type is `regression`:
[source,console]
@@ -538,7 +546,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis
"dest": {
"index": "house_price_predictions"
},
"analysis":
{
"regression": {
"dependent_variable": "price"
@@ -614,7 +622,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3
[[ml-put-dfanalytics-example-c]]
=== {classification-cap} example
The following example creates the `loan_classification` {dfanalytics-job}, the
analysis type is `classification`:
[source,console]

@@ -453,10 +453,10 @@ Defaults to `true`.
end::delayed-data-check-config[]
tag::dependent-variable[]
Defines which field of the document is to be predicted.
This parameter is supplied by field name and must match one of the fields in
the index being used to train. If this field is missing from a document, then
that document will not be used for training, but a prediction with the trained
model will be generated for it. It is also known as continuous target variable.
end::dependent-variable[]
@@ -513,10 +513,18 @@ The value of the downsample factor.
end::dfas-downsample-factor[]
tag::dfas-eta-growth[]
Specifies the rate at which the `eta` increases for each new tree that is added to the
forest. For example, a rate of `1.05` increases `eta` by 5%.
end::dfas-eta-growth[]
tag::dfas-feature-processors[]
A collection of feature preprocessors that modify one or more included fields.
The analysis uses the resulting features instead of the original document
fields. Multiple `feature_processors` entries can refer to the same document
fields.
Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs.
end::dfas-feature-processors[]
tag::dfas-iteration[]
The number of iterations on the analysis.
end::dfas-iteration[]
@@ -529,9 +537,9 @@ training stops.
end::dfas-max-attempts[]
tag::dfas-max-optimization-rounds[]
A multiplier responsible for determining the maximum number of
hyperparameter optimization steps in the Bayesian optimization procedure.
The maximum number of steps is determined based on the number of undefined hyperparameters
times the maximum optimization rounds per hyperparameter.
end::dfas-max-optimization-rounds[]
@@ -595,10 +603,10 @@ functions that are tolerant to gaps in data such as `mean`, `non_null_sum` or
end::empty-bucket-count[]
tag::eta[]
Advanced configuration option. The shrinkage applied to the weights. Smaller
values result in larger forests which have a better generalization error.
However, the smaller the value the longer the training will take. For more
information about shrinkage, see
{wikipedia}/Gradient_boosting#Shrinkage[this wiki article]. By
default, this value is calculated during hyperparameter optimization.
end::eta[]
@@ -624,13 +632,13 @@ this value to determine the number of unique categories that were missed.
end::failed-category-count[]
tag::feature-bag-fraction[]
Advanced configuration option. Defines the fraction of features that will be
used when selecting a random bag for each candidate split. By default, this
value is calculated during hyperparameter optimization.
end::feature-bag-fraction[]
tag::feature-influence-threshold[]
The minimum {olscore} that a document needs to have to calculate its feature
influence score. Value range: 0-1 (`0.1` by default).
end::feature-influence-threshold[]
@@ -675,10 +683,10 @@ The analysis function that is used. For example, `count`, `rare`, `mean`, `min`,
end::function[]
tag::gamma[]
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies a linear penalty associated with the size of
individual trees in the forest. The higher the value the more training will
prefer smaller trees. The smaller this parameter the larger individual trees
will be and the longer training will take. By default, this value is calculated
during hyperparameter optimization.
end::gamma[]
@@ -798,8 +806,8 @@ information for all {anomaly-jobs}.
end::job-id-anomaly-detection-default[]
tag::job-id-anomaly-detection-define[]
Identifier for the {anomaly-job}. This identifier can contain lowercase
alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
and end with alphanumeric characters.
end::job-id-anomaly-detection-define[]
@@ -843,12 +851,12 @@ For more information, see <<ml-jobstats>>.
end::jobs-stats-anomaly-detection[]
tag::lambda[]
Advanced configuration option. Regularization parameter to prevent overfitting
on the training data set. Multiplies an L2 regularisation term which applies to
leaf weights of the individual trees in the forest. The higher the value the
more training will attempt to keep leaf weights small. This makes the prediction
function smoother at the expense of potentially not being able to capture
relevant relationships between the features and the {depvar}. The smaller this
parameter the larger individual trees will be and the longer training will take.
By default, this value is calculated during hyperparameter optimization.
end::lambda[]
@@ -1098,8 +1106,8 @@ For open jobs only, the elapsed time for which the job has been open.
end::open-time[]
tag::outlier-fraction[]
The proportion of the data set that is assumed to be outlying prior to
{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
outliers and 95% are inliers.
end::outlier-fraction[]
@@ -1185,7 +1193,7 @@ tag::randomize-seed[]
Defines the seed to the random generator that is used to pick
which documents will be used for training. By default it is randomly generated.
Set it to a specific value to ensure the same documents are used for training
assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
the same.
end::randomize-seed[]
@@ -1264,8 +1272,8 @@ end::sparse-bucket-count[]
tag::standardization-enabled[]
If `true`, the following operation is performed on the columns before computing
{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
more information about this concept, see
https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization)[Wikipedia].
end::standardization-enabled[]
@@ -1340,12 +1348,12 @@ when the mode is set to `manual`. For example: `3h`.
end::time-span[]
tag::timeout-start[]
Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
to 20 seconds.
end::timeout-start[]
tag::timeout-stop[]
Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
to 20 seconds.
end::timeout-stop[]