[ML] adding docs + hlrc for data frame analysis feature_processors (#61149)

Adds HLRC and some docs for the new feature_processors field in Data frame analytics.

Co-authored-by: Przemysław Witek <przemyslaw.witek@elastic.co>
Co-authored-by: Lisa Cawley <lcawley@elastic.co>
commit 1ae2923632 (parent d05649bfae)
Classification.java (org.elasticsearch.client.ml.dataframe)

@@ -18,6 +18,8 @@
  */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
+import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
@@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;

 import java.io.IOException;
+import java.util.List;
 import java.util.Locale;
 import java.util.Objects;

@@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis {
     static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective");
     static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes");
     static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
+    static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");

+    @SuppressWarnings("unchecked")
     private static final ConstructingObjectParser<Classification, Void> PARSER =
         new ConstructingObjectParser<>(
             NAME.getPreferredName(),
@@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis {
                 (Double) a[8],
                 (Integer) a[9],
                 (Long) a[10],
-                (ClassAssignmentObjective) a[11]));
+                (ClassAssignmentObjective) a[11],
+                (List<PreProcessor>) a[12]));

     static {
         PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE);
@@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis {
         PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
         PARSER.declareString(
             ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE);
+        PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
+            (p, c, n) -> p.namedObject(PreProcessor.class, n, c),
+            (classification) -> {},
+            FEATURE_PROCESSORS);
     }

     private final String dependentVariable;
@@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis {
     private final ClassAssignmentObjective classAssignmentObjective;
     private final Integer numTopClasses;
     private final Long randomizeSeed;
+    private final List<PreProcessor> featureProcessors;

     private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
                            @Nullable Integer maxTrees, @Nullable Double featureBagFraction,
                            @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
                            @Nullable Double trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed,
-                           @Nullable ClassAssignmentObjective classAssignmentObjective) {
+                           @Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List<PreProcessor> featureProcessors) {
         this.dependentVariable = Objects.requireNonNull(dependentVariable);
         this.lambda = lambda;
         this.gamma = gamma;
@@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis {
         this.classAssignmentObjective = classAssignmentObjective;
         this.numTopClasses = numTopClasses;
         this.randomizeSeed = randomizeSeed;
+        this.featureProcessors = featureProcessors;
     }

     @Override
@@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis {
         return numTopClasses;
     }

+    public List<PreProcessor> getFeatureProcessors() {
+        return featureProcessors;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         builder.startObject();
@@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis {
         if (numTopClasses != null) {
             builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses);
         }
+        if (featureProcessors != null) {
+            NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
+        }
         builder.endObject();
         return builder;
     }
@@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis {
     @Override
     public int hashCode() {
         return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
-            predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective);
+            predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors);
     }

     @Override
@@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis {
             && Objects.equals(trainingPercent, that.trainingPercent)
             && Objects.equals(randomizeSeed, that.randomizeSeed)
             && Objects.equals(numTopClasses, that.numTopClasses)
-            && Objects.equals(classAssignmentObjective, that.classAssignmentObjective);
+            && Objects.equals(classAssignmentObjective, that.classAssignmentObjective)
+            && Objects.equals(featureProcessors, that.featureProcessors);
     }

     @Override
@@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis {
         private Integer numTopClasses;
         private Long randomizeSeed;
         private ClassAssignmentObjective classAssignmentObjective;
+        private List<PreProcessor> featureProcessors;

         private Builder(String dependentVariable) {
             this.dependentVariable = Objects.requireNonNull(dependentVariable);
@@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis {
             return this;
         }

+        public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
+            this.featureProcessors = featureProcessors;
+            return this;
+        }
+
         public Classification build() {
             return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
                 numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed,
-                classAssignmentObjective);
+                classAssignmentObjective, featureProcessors);
         }
     }
 }
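Taken together, the additions above give `Classification` a `feature_processors` round trip: the parser reads named `PreProcessor` objects and the builder exposes `setFeatureProcessors`. A minimal usage sketch follows, assuming the static `Classification.builder(String)` factory that the private `Builder` constructor implies; the field and column names are illustrative placeholders.

[source,java]
----
import java.util.Arrays;

import org.elasticsearch.client.ml.dataframe.Classification;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

public class ClassificationFeatureProcessorsSketch {
    // Configures a classification analysis that one-hot encodes the
    // "categorical_feature" field into a "cat_column" feature before
    // training; all hyperparameters are left unset so hyperparameter
    // optimization chooses them.
    static Classification withFeatureProcessors() {
        return Classification.builder("my_dependent_variable")
            .setFeatureProcessors(Arrays.asList(
                OneHotEncoding.builder("categorical_feature")
                    .addOneHot("cat", "cat_column")
                    .build()))
            .build();
    }
}
----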
Regression.java (org.elasticsearch.client.ml.dataframe)

@@ -18,6 +18,8 @@
  */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
+import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
@@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;

 import java.io.IOException;
+import java.util.List;
 import java.util.Locale;
 import java.util.Objects;

@@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis {
     static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
     static final ParseField LOSS_FUNCTION = new ParseField("loss_function");
     static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter");
+    static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");

+    @SuppressWarnings("unchecked")
     private static final ConstructingObjectParser<Regression, Void> PARSER =
         new ConstructingObjectParser<>(
             NAME.getPreferredName(),
@@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis {
                 (Double) a[8],
                 (Long) a[9],
                 (LossFunction) a[10],
-                (Double) a[11]
+                (Double) a[11],
+                (List<PreProcessor>) a[12]
             ));

     static {
@@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis {
         PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
         PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION);
         PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER);
+        PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
+            (p, c, n) -> p.namedObject(PreProcessor.class, n, c),
+            (regression) -> {},
+            FEATURE_PROCESSORS);
     }

     private final String dependentVariable;
@@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis {
     private final Long randomizeSeed;
     private final LossFunction lossFunction;
     private final Double lossFunctionParameter;
+    private final List<PreProcessor> featureProcessors;

     private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
                        @Nullable Integer maxTrees, @Nullable Double featureBagFraction,
                        @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
                        @Nullable Double trainingPercent, @Nullable Long randomizeSeed, @Nullable LossFunction lossFunction,
-                       @Nullable Double lossFunctionParameter) {
+                       @Nullable Double lossFunctionParameter, @Nullable List<PreProcessor> featureProcessors) {
         this.dependentVariable = Objects.requireNonNull(dependentVariable);
         this.lambda = lambda;
         this.gamma = gamma;
@@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis {
         this.randomizeSeed = randomizeSeed;
         this.lossFunction = lossFunction;
         this.lossFunctionParameter = lossFunctionParameter;
+        this.featureProcessors = featureProcessors;
     }

     @Override
@@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis {
         return lossFunctionParameter;
     }

+    public List<PreProcessor> getFeatureProcessors() {
+        return featureProcessors;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         builder.startObject();
@@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis {
         if (lossFunctionParameter != null) {
             builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter);
         }
+        if (featureProcessors != null) {
+            NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
+        }
         builder.endObject();
         return builder;
     }
@@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis {
     @Override
     public int hashCode() {
         return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
-            predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
+            predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors);
     }

     @Override
@@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis {
             && Objects.equals(trainingPercent, that.trainingPercent)
             && Objects.equals(randomizeSeed, that.randomizeSeed)
             && Objects.equals(lossFunction, that.lossFunction)
-            && Objects.equals(lossFunctionParameter, that.lossFunctionParameter);
+            && Objects.equals(lossFunctionParameter, that.lossFunctionParameter)
+            && Objects.equals(featureProcessors, that.featureProcessors);
     }

     @Override
@@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis {
         private Long randomizeSeed;
         private LossFunction lossFunction;
         private Double lossFunctionParameter;
+        private List<PreProcessor> featureProcessors;

         private Builder(String dependentVariable) {
             this.dependentVariable = Objects.requireNonNull(dependentVariable);
@@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis {
             return this;
         }

+        public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
+            this.featureProcessors = featureProcessors;
+            return this;
+        }
+
         public Regression build() {
             return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
-                numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
+                numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter,
+                featureProcessors);
         }
     }
 }
OneHotEncoding.java (org.elasticsearch.client.ml.inference.preprocessing)

@@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor {
         return Objects.hash(field, hotMap, custom);
     }

-    public Builder builder(String field) {
+    public static Builder builder(String field) {
         return new Builder(field);
     }

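This one-word fix matters for the change as a whole: `builder` previously required an existing `OneHotEncoding` instance, so the fluent chain used in the documentation test below could not compile. With the `static` modifier, a preprocessor can be built from scratch, as in this minimal sketch (field and column names illustrative):

[source,java]
----
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

public class OneHotEncodingBuilderSketch {
    // Maps the value "cat" of "categorical_feature" to a new "cat_column"
    // feature; no OneHotEncoding instance is needed to obtain the builder.
    static OneHotEncoding example() {
        return OneHotEncoding.builder("categorical_feature")
            .addOneHot("cat", "cat_column")
            .build();
    }
}
----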
MlClientDocumentationIT.java

@@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
 import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests;
 import org.elasticsearch.client.ml.inference.TrainedModelInput;
 import org.elasticsearch.client.ml.inference.TrainedModelStats;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
 import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig;
 import org.elasticsearch.client.ml.inference.trainedmodel.TargetType;
 import org.elasticsearch.client.ml.job.config.AnalysisConfig;
@@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
             .setRandomizeSeed(1234L) // <10>
             .setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11>
             .setNumTopClasses(1) // <12>
+            .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
+                .addOneHot("cat", "cat_column")
+                .build()))
             .build();
         // end::put-data-frame-analytics-classification

@@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
             .setRandomizeSeed(1234L) // <10>
             .setLossFunction(Regression.LossFunction.MSE) // <11>
             .setLossFunctionParameter(1.0) // <12>
+            .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
+                .addOneHot("cat", "cat_column")
+                .build()))
             .build();
         // end::put-data-frame-analytics-regression

ClassificationTests.java

@@ -18,10 +18,20 @@
  */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
+import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.test.AbstractXContentTestCase;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public class ClassificationTests extends AbstractXContentTestCase<Classification> {

@@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
             .setRandomizeSeed(randomBoolean() ? null : randomLong())
             .setClassAssignmentObjective(randomBoolean() ? null : randomFrom(Classification.ClassAssignmentObjective.values()))
             .setNumTopClasses(randomBoolean() ? null : randomIntBetween(0, 10))
+            .setFeatureProcessors(randomBoolean() ? null :
+                Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
+                    OneHotEncodingTests.createRandom(),
+                    TargetMeanEncodingTests.createRandom()))
+                    .limit(randomIntBetween(1, 10))
+                    .collect(Collectors.toList()))
             .build();
     }

+    @Override
+    protected Predicate<String> getRandomFieldsExcludeFilter() {
+        return field -> field.startsWith("feature_processors");
+    }
+
     @Override
     protected Classification createTestInstance() {
         return randomClassification();
@@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
     protected boolean supportsUnknownFields() {
         return true;
     }
+
+    @Override
+    protected NamedXContentRegistry xContentRegistry() {
+        List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
+        return new NamedXContentRegistry(namedXContent);
+    }
 }
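The `xContentRegistry()` override is what makes the round-trip test work: `feature_processors` entries are named XContent objects, so the parser has to resolve names such as `one_hot_encoding` through a registry. A minimal sketch of building such a registry outside the test framework, using only calls that appear in this diff:

[source,java]
----
import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;

public class InferenceRegistrySketch {
    // Returns a registry that can resolve the preprocessor names used by
    // feature_processors (one_hot_encoding, frequency_encoding,
    // target_mean_encoding) during parsing.
    static NamedXContentRegistry inferenceRegistry() {
        return new NamedXContentRegistry(
            new MlInferenceNamedXContentProvider().getNamedXContentParsers());
    }
}
----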
DataFrameAnalyticsConfigTests.java

@@ -20,6 +20,7 @@
 package org.elasticsearch.client.ml.dataframe;

 import org.elasticsearch.Version;
+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
@@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase<Data
         List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
         namedXContent.addAll(new SearchModule(Settings.EMPTY, false, Collections.emptyList()).getNamedXContents());
         namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers());
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
         return new NamedXContentRegistry(namedXContent);
     }
 }
RegressionTests.java

@@ -18,10 +18,20 @@
  */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
+import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.test.AbstractXContentTestCase;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public class RegressionTests extends AbstractXContentTestCase<Regression> {

@@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
             .setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true))
             .setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values()))
             .setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true))
+            .setFeatureProcessors(randomBoolean() ? null :
+                Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
+                    OneHotEncodingTests.createRandom(),
+                    TargetMeanEncodingTests.createRandom()))
+                    .limit(randomIntBetween(1, 10))
+                    .collect(Collectors.toList()))
             .build();
     }

+    @Override
+    protected Predicate<String> getRandomFieldsExcludeFilter() {
+        return field -> field.startsWith("feature_processors");
+    }
+
     @Override
     protected Regression createTestInstance() {
         return randomRegression();
@@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
     protected boolean supportsUnknownFields() {
         return true;
     }
+
+    @Override
+    protected NamedXContentRegistry xContentRegistry() {
+        List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
+        return new NamedXContentRegistry(namedXContent);
+    }
 }
put-data-frame-analytics.asciidoc (HLRC documentation)

@@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification]
 <10> The seed to be used by the random generator that picks which rows are used in training.
 <11> The optimization objective to target when assigning class labels. Defaults to maximize_minimum_recall.
 <12> The number of top classes to be reported in the results. Defaults to 2.
+<13> Custom feature processors that will create new features for analysis from the included document
+fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.

 ===== Regression

@@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression]
 <10> The seed to be used by the random generator that picks which rows are used in training.
 <11> The loss function used for regression. Defaults to `mse`.
 <12> An optional parameter to the loss function.
+<13> Custom feature processors that will create new features for analysis from the included document
+fields. Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.

 ==== Analyzed fields

put-dfanalytics.asciidoc (REST API documentation; many hunks here are whitespace-only cleanups, shown as matching -/+ pairs)

@@ -25,7 +25,7 @@ If the {es} {security-features} are enabled, you must have the following built-i
 * `machine_learning_admin`
 * source indices: `read`, `view_index_metadata`
 * destination index: `read`, `create_index`, `manage` and `index`
-
+
 For more information, see <<built-in-roles>>, <<security-privileges>>, and
 {ml-docs-setup-privileges}.

@@ -33,20 +33,20 @@ For more information, see <<built-in-roles>>, <<security-privileges>>, and
 NOTE: The {dfanalytics-job} remembers which roles the user who created it had at
 the time of creation. When you start the job, it performs the analysis using
 those same roles. If you provide
-<<http-clients-secondary-authorization,secondary authorization headers>>,
+<<http-clients-secondary-authorization,secondary authorization headers>>,
 those credentials are used instead.

 [[ml-put-dfanalytics-desc]]
 == {api-description-title}

-This API creates a {dfanalytics-job} that performs an analysis on the source
+This API creates a {dfanalytics-job} that performs an analysis on the source
 indices and stores the outcome in a destination index.

 If the destination index does not exist, it is created automatically when you
 start the job. See <<start-dfanalytics>>.

 If you supply only a subset of the {regression} or {classification} parameters,
-{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
+{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
 determines a value for each of the undefined parameters.

 [[ml-put-dfanalytics-path-params]]
@@ -61,9 +61,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def
 == {api-request-body-title}

 `allow_lazy_start`::
-(Optional, boolean)
-Specifies whether this job can start when there is insufficient {ml} node
-capacity for it to be immediately assigned to a node. The default is `false`; if
+(Optional, boolean)
+Specifies whether this job can start when there is insufficient {ml} node
+capacity for it to be immediately assigned to a node. The default is `false`; if
 a {ml} node with capacity to run the job cannot immediately be found, the
 <<start-dfanalytics>> API returns an error. However, this is also subject to the
 cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. See <<advanced-ml-settings>>.
@@ -86,7 +86,7 @@ one of the following types of analysis: {classification}, {oldetection}, or
 The configuration information necessary to perform
 {ml-docs}/dfa-classification.html[{classification}].
 +
-TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
+TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
 automatically by hyperparameter optimization to give the minimum validation
 error. It is highly recommended to use the default values unless you fully
 understand the function of these parameters.
@@ -108,23 +108,27 @@ categorical (`ip` or `keyword`), or boolean. There must be no more than 30
 different values in this field.

 `eta`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]

 `feature_bag_fraction`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]

+`feature_processors`::::
+(Optional, list)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
+
 `gamma`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]

 `lambda`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]

 `max_trees`::::
-(Optional, integer)
+(Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]

 `num_top_classes`::::
@@ -136,11 +140,11 @@ categories, the API reports all category probabilities. Defaults to 2.
 `num_top_feature_importance_values`::::
 (Optional, integer)
 Advanced configuration option. Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
 By default, it is zero and no {feat-imp} calculation occurs.

 `prediction_field_name`::::
-(Optional, string)
+(Optional, string)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name]

 `randomize_seed`::::
@@ -164,23 +168,23 @@ The configuration information necessary to perform
 `compute_feature_influence`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence]

-`feature_influence_threshold`::::
+`feature_influence_threshold`::::
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold]

 `method`::::
 (Optional, string)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method]
-
+
 `n_neighbors`::::
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors]
-
+
 `outlier_fraction`::::
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction]
-
+
 `standardization_enabled`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
@@ -192,7 +196,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
 The configuration information necessary to perform
 {ml-docs}/dfa-regression.html[{regression}].
 +
-TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
+TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
 automatically by hyperparameter optimization to give minimum validation error.
 It is highly recommended to use the default values unless you fully understand
 the function of these parameters.
@@ -215,20 +219,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]

+`feature_processors`::::
+(Optional, list)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
+
 `gamma`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]

 `lambda`::::
-(Optional, double)
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]

 `loss_function`::::
 (Optional, string)
-The loss function used during {regression}. Available options are `mse` (mean
-squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber
-loss). Defaults to `mse`. Refer to
-{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
+The loss function used during {regression}. Available options are `mse` (mean
+squared error), `msle` (mean squared logarithmic error), `huber` (Pseudo-Huber
+loss). Defaults to `mse`. Refer to
+{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
 to learn more.

 `loss_function_parameter`::::
@@ -236,13 +244,13 @@ to learn more.
 A positive number that is used as a parameter to the `loss_function`.

 `max_trees`::::
-(Optional, integer)
+(Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]

 `num_top_feature_importance_values`::::
 (Optional, integer)
 Advanced configuration option. Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
 By default, it is zero and no {feat-imp} calculation occurs.

 `prediction_field_name`::::
@@ -264,31 +272,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent]
 //Begin analyzed_fields
 `analyzed_fields`::
 (Optional, object)
-Specify `includes` and/or `excludes` patterns to select which fields will be
-included in the analysis. The patterns specified in `excludes` are applied last,
-therefore `excludes` takes precedence. In other words, if the same field is
-specified in both `includes` and `excludes`, then the field will not be included
+Specify `includes` and/or `excludes` patterns to select which fields will be
+included in the analysis. The patterns specified in `excludes` are applied last,
+therefore `excludes` takes precedence. In other words, if the same field is
+specified in both `includes` and `excludes`, then the field will not be included
 in the analysis.
 +
 --
 [[dfa-supported-fields]]
 The supported fields for each type of analysis are as follows:

-* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
-don't support missing values therefore fields that have data types other than
-numeric or boolean are ignored. Documents where included fields contain missing
-values, null values, or an array are also ignored. Therefore the `dest` index
+* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
+don't support missing values therefore fields that have data types other than
+numeric or boolean are ignored. Documents where included fields contain missing
+values, null values, or an array are also ignored. Therefore the `dest` index
 may contain documents that don't have an {olscore}.
-* {regression-cap} supports fields that are numeric, `boolean`, `text`,
-`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
-supported are included in the analysis, other fields are ignored. Documents
-where included fields contain an array with two or more values are also
-ignored. Documents in the `dest` index that don’t contain a results field are
+* {regression-cap} supports fields that are numeric, `boolean`, `text`,
+`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
+supported are included in the analysis, other fields are ignored. Documents
+where included fields contain an array with two or more values are also
+ignored. Documents in the `dest` index that don’t contain a results field are
 not included in the {reganalysis}.
 * {classification-cap} supports fields that are numeric, `boolean`, `text`,
-`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
+`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
 supported are included in the analysis, other fields are ignored. Documents
-where included fields contain an array with two or more values are also ignored.
+where included fields contain an array with two or more values are also ignored.
 Documents in the `dest` index that don’t contain a results field are not
 included in the {classanalysis}. {classanalysis-cap} can be improved by mapping
 ordinal variable values to a single number. For example, in case of age ranges,
@@ -310,7 +318,7 @@ analysis. You do not need to add fields with unsupported data types to

 `includes`:::
 (Optional, array)
-An array of strings that defines the fields that will be included in the
+An array of strings that defines the fields that will be included in the
 analysis.
 //End analyzed_fields
 ====
@@ -330,16 +338,16 @@ The default value is `1`. Using more threads may decrease the time
 necessary to complete the analysis at the cost of using more CPU.
 Note that the process may use additional threads for operational
 functionality other than the analysis itself.
-
+
 `model_memory_limit`::
 (Optional, string)
-The approximate maximum amount of memory resources that are permitted for
-analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
-your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
-setting, an error occurs when you try to create {dfanalytics-jobs} that have
-`model_memory_limit` values greater than that setting. For more information, see
+The approximate maximum amount of memory resources that are permitted for
+analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
+your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
+setting, an error occurs when you try to create {dfanalytics-jobs} that have
+`model_memory_limit` values greater than that setting. For more information, see
 <<ml-settings>>.
-
+
 `source`::
 (object)
 The configuration of how to source the analysis data. It requires an `index`.
@@ -353,7 +361,7 @@ Optionally, `query` and `_source` may be specified.
 It can be a single index or index pattern as well as an array of indices or
 patterns.
 +
-WARNING: If your source indices contain documents with the same IDs, only the
+WARNING: If your source indices contain documents with the same IDs, only the
 document that is indexed last appears in the destination index.

 `query`:::
@@ -374,7 +382,7 @@ included in the analysis.
 `includes`::::
 (array) An array of strings that defines the fields that will be included in the
 destination.
-
+
 `excludes`::::
 (array) An array of strings that defines the fields that will be excluded from
 the destination.
@@ -390,8 +398,8 @@ the destination.
 [[ml-put-dfanalytics-example-preprocess]]
 === Preprocessing actions example

-The following example shows how to limit the scope of the analysis to certain
-fields, specify excluded fields in the destination index, and use a query to
+The following example shows how to limit the scope of the analysis to certain
+fields, specify excluded fields in the destination index, and use a query to
 filter your data before analysis.

 [source,console]
@@ -404,7 +412,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
   ],
   "query": { <2>
     "range": {
-      "DistanceKilometers": {
+      "DistanceKilometers": {
         "gt": 0
       }
     }
@@ -429,7 +437,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
   },
   "analyzed_fields": { <5>
     "includes": [],
-    "excludes": [
+    "excludes": [
       "FlightNum"
     ]
   },
@@ -439,29 +447,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
 // TEST[skip:setup kibana sample data]

 <1> Source index to analyze.
-<2> This query filters out entire documents that will not be present in the
+<2> This query filters out entire documents that will not be present in the
 destination index.
-<3> The `_source` object defines fields in the dataset that will be included or
-excluded in the destination index.
-<4> Defines the destination index that contains the results of the analysis and
-the fields of the source index specified in the `_source` object. Also defines
+<3> The `_source` object defines fields in the dataset that will be included or
+excluded in the destination index.
+<4> Defines the destination index that contains the results of the analysis and
+the fields of the source index specified in the `_source` object. Also defines
 the name of the `results_field`.
-<5> Specifies fields to be included in or excluded from the analysis. This does
-not affect whether the fields will be present in the destination index, only
+<5> Specifies fields to be included in or excluded from the analysis. This does
+not affect whether the fields will be present in the destination index, only
 affects whether they are used in the analysis.

-In this example, we can see that all the fields of the source index are included
-in the destination index except `FlightDelay` and `FlightDelayType` because
-these are defined as excluded fields by the `excludes` parameter of the
-`_source` object. The `FlightNum` field is included in the destination index,
-however it is not included in the analysis because it is explicitly specified as
+In this example, we can see that all the fields of the source index are included
+in the destination index except `FlightDelay` and `FlightDelayType` because
+these are defined as excluded fields by the `excludes` parameter of the
+`_source` object. The `FlightNum` field is included in the destination index,
+however it is not included in the analysis because it is explicitly specified as
 excluded field by the `excludes` parameter of the `analyzed_fields` object.


 [[ml-put-dfanalytics-example-od]]
 === {oldetection-cap} example

-The following example creates the `loganalytics` {dfanalytics-job}, the analysis
+The following example creates the `loganalytics` {dfanalytics-job}, the analysis
 type is `outlier_detection`:

 [source,console]
@@ -525,7 +533,7 @@ The API returns the following result:
 [[ml-put-dfanalytics-example-r]]
 === {regression-cap} examples

-The following example creates the `house_price_regression_analysis`
+The following example creates the `house_price_regression_analysis`
 {dfanalytics-job}, the analysis type is `regression`:

 [source,console]
@@ -538,7 +546,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis
   "dest": {
     "index": "house_price_predictions"
   },
-  "analysis":
+  "analysis":
   {
     "regression": {
       "dependent_variable": "price"
@@ -614,7 +622,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3
 [[ml-put-dfanalytics-example-c]]
 === {classification-cap} example

-The following example creates the `loan_classification` {dfanalytics-job}, the
+The following example creates the `loan_classification` {dfanalytics-job}, the
 analysis type is `classification`:

 [source,console]
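For comparison with the REST request bodies above, creating an equivalent job through the HLRC might look roughly like the following sketch. It assumes the standard HLRC request classes (`DataFrameAnalyticsConfig`, `DataFrameAnalyticsSource`, `DataFrameAnalyticsDest`, `PutDataFrameAnalyticsRequest`), which are not part of this diff; the index and field names are illustrative.

[source,java]
----
import java.io.IOException;
import java.util.Arrays;

import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.ml.PutDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.dataframe.Classification;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsDest;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsSource;
import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;

public class PutDfaWithFeatureProcessorsSketch {
    // Creates a classification job whose "categorical_feature" field is
    // one-hot encoded before analysis; index and field names are
    // illustrative placeholders.
    static void putJob(RestHighLevelClient client) throws IOException {
        DataFrameAnalyticsConfig config = DataFrameAnalyticsConfig.builder()
            .setId("loan_classification")
            .setSource(DataFrameAnalyticsSource.builder().setIndex("loan-applicants").build())
            .setDest(DataFrameAnalyticsDest.builder().setIndex("loan-classifications").build())
            .setAnalysis(Classification.builder("label")
                .setFeatureProcessors(Arrays.asList(
                    OneHotEncoding.builder("categorical_feature")
                        .addOneHot("cat", "cat_column")
                        .build()))
                .build())
            .build();
        client.machineLearning().putDataFrameAnalytics(
            new PutDataFrameAnalyticsRequest(config), RequestOptions.DEFAULT);
    }
}
----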
ml-shared.asciidoc (whitespace-only cleanups shown as matching -/+ pairs)

@@ -453,10 +453,10 @@ Defaults to `true`.
 end::delayed-data-check-config[]

 tag::dependent-variable[]
-Defines which field of the document is to be predicted.
-This parameter is supplied by field name and must match one of the fields in
-the index being used to train. If this field is missing from a document, then
-that document will not be used for training, but a prediction with the trained
+Defines which field of the document is to be predicted.
+This parameter is supplied by field name and must match one of the fields in
+the index being used to train. If this field is missing from a document, then
+that document will not be used for training, but a prediction with the trained
 model will be generated for it. It is also known as continuous target variable.
 end::dependent-variable[]

@@ -513,10 +513,18 @@ The value of the downsample factor.
 end::dfas-downsample-factor[]

 tag::dfas-eta-growth[]
-Specifies the rate at which the `eta` increases for each new tree that is added to the
+Specifies the rate at which the `eta` increases for each new tree that is added to the
 forest. For example, a rate of `1.05` increases `eta` by 5%.
 end::dfas-eta-growth[]

+tag::dfas-feature-processors[]
+A collection of feature preprocessors that modify one or more included fields.
+The analysis uses the resulting one or more features instead of the
+original document field. Multiple `feature_processors` entries can refer to the
+same document fields.
+Note, automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs.
+end::dfas-feature-processors[]
+
 tag::dfas-iteration[]
 The number of iterations on the analysis.
 end::dfas-iteration[]
@@ -529,9 +537,9 @@ training stops.
 end::dfas-max-attempts[]

 tag::dfas-max-optimization-rounds[]
-A multiplier responsible for determining the maximum number of
-hyperparameter optimization steps in the Bayesian optimization procedure.
-The maximum number of steps is determined based on the number of undefined hyperparameters
+A multiplier responsible for determining the maximum number of
+hyperparameter optimization steps in the Bayesian optimization procedure.
+The maximum number of steps is determined based on the number of undefined hyperparameters
 times the maximum optimization rounds per hyperparameter.
 end::dfas-max-optimization-rounds[]

@@ -595,10 +603,10 @@ functions that are tolerant to gaps in data such as `mean`, `non_null_sum` or
 end::empty-bucket-count[]

 tag::eta[]
-Advanced configuration option. The shrinkage applied to the weights. Smaller
+Advanced configuration option. The shrinkage applied to the weights. Smaller
 values result in larger forests which have a better generalization error.
 However, the smaller the value the longer the training will take. For more
-information, about shrinkage, see
+information, about shrinkage, see
 {wikipedia}/Gradient_boosting#Shrinkage[this wiki article]. By
 default, this value is calcuated during hyperparameter optimization.
 end::eta[]
@@ -624,13 +632,13 @@ this value to determine the number of unique categories that were missed.
 end::failed-category-count[]

 tag::feature-bag-fraction[]
-Advanced configuration option. Defines the fraction of features that will be
+Advanced configuration option. Defines the fraction of features that will be
 used when selecting a random bag for each candidate split. By default, this
 value is calculated during hyperparameter optimization.
 end::feature-bag-fraction[]

 tag::feature-influence-threshold[]
-The minimum {olscore} that a document needs to have to calculate its feature
+The minimum {olscore} that a document needs to have to calculate its feature
 influence score. Value range: 0-1 (`0.1` by default).
 end::feature-influence-threshold[]

@@ -675,10 +683,10 @@ The analysis function that is used. For example, `count`, `rare`, `mean`, `min`,
 end::function[]

 tag::gamma[]
-Advanced configuration option. Regularization parameter to prevent overfitting
-on the training data set. Multiplies a linear penalty associated with the size of
-individual trees in the forest. The higher the value the more training will
-prefer smaller trees. The smaller this parameter the larger individual trees
+Advanced configuration option. Regularization parameter to prevent overfitting
+on the training data set. Multiplies a linear penalty associated with the size of
+individual trees in the forest. The higher the value the more training will
+prefer smaller trees. The smaller this parameter the larger individual trees
 will be and the longer training will take. By default, this value is calculated
 during hyperparameter optimization.
 end::gamma[]
@@ -798,8 +806,8 @@ information for all {anomaly-jobs}.
 end::job-id-anomaly-detection-default[]

 tag::job-id-anomaly-detection-define[]
-Identifier for the {anomaly-job}. This identifier can contain lowercase
-alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
+Identifier for the {anomaly-job}. This identifier can contain lowercase
+alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
 and end with alphanumeric characters.
 end::job-id-anomaly-detection-define[]

@@ -843,12 +851,12 @@ For more information, see <<ml-jobstats>>.
 end::jobs-stats-anomaly-detection[]

 tag::lambda[]
-Advanced configuration option. Regularization parameter to prevent overfitting
-on the training data set. Multiplies an L2 regularisation term which applies to
-leaf weights of the individual trees in the forest. The higher the value the
-more training will attempt to keep leaf weights small. This makes the prediction
-function smoother at the expense of potentially not being able to capture
-relevant relationships between the features and the {depvar}. The smaller this
+Advanced configuration option. Regularization parameter to prevent overfitting
+on the training data set. Multiplies an L2 regularisation term which applies to
+leaf weights of the individual trees in the forest. The higher the value the
+more training will attempt to keep leaf weights small. This makes the prediction
+function smoother at the expense of potentially not being able to capture
+relevant relationships between the features and the {depvar}. The smaller this
 parameter the larger individual trees will be and the longer training will take.
 By default, this value is calculated during hyperparameter optimization.
 end::lambda[]
@@ -1098,8 +1106,8 @@ For open jobs only, the elapsed time for which the job has been open.
 end::open-time[]

 tag::outlier-fraction[]
-The proportion of the data set that is assumed to be outlying prior to
-{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
+The proportion of the data set that is assumed to be outlying prior to
+{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
 outliers and 95% are inliers.
 end::outlier-fraction[]

@@ -1185,7 +1193,7 @@ tag::randomize-seed[]
 Defines the seed to the random generator that is used to pick
 which documents will be used for training. By default it is randomly generated.
 Set it to a specific value to ensure the same documents are used for training
-assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
+assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
 the same.
 end::randomize-seed[]

@@ -1264,8 +1272,8 @@ end::sparse-bucket-count[]

 tag::standardization-enabled[]
 If `true`, the following operation is performed on the columns before computing
-{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
-more information about this concept, see
+{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
+more information about this concept, see
 https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization)[Wikipedia].
 end::standardization-enabled[]

@@ -1340,12 +1348,12 @@ when the mode is set to `manual`. For example: `3h`.
 end::time-span[]

 tag::timeout-start[]
-Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
+Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
 to 20 seconds.
 end::timeout-start[]

 tag::timeout-stop[]
-Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
+Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
 to 20 seconds.
 end::timeout-stop[]
