[7.x] [ML] adding docs + hlrc for data frame analysis feature_processors (#61149) (#61493)

* [ML] adding docs + hlrc for data frame analysis feature_processors (#61149) Adds HLRC and some docs for the new feature_processors field in Data frame analytics. Co-authored-by: Przemysław Witek <przemyslaw.witek@elastic.co> Co-authored-by: Lisa Cawley <lcawley@elastic.co>
2020-08-24 12:56:21 -04:00 · 2020-08-24 12:56:21 -04:00 · 1ae2923632
parent d05649bfae
commit 1ae2923632
10 changed files with 254 additions and 116 deletions
--- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java
+++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Classification.java
@ -18,6 +18,8 @@
 */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
+import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;

 import java.io.IOException;
+import java.util.List;
 import java.util.Locale;
 import java.util.Objects;

@ -53,7 +56,9 @@ public class Classification implements DataFrameAnalysis {
    static final ParseField CLASS_ASSIGNMENT_OBJECTIVE = new ParseField("class_assignment_objective");
    static final ParseField NUM_TOP_CLASSES = new ParseField("num_top_classes");
    static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
+    static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");

+    @SuppressWarnings("unchecked")
    private static final ConstructingObjectParser<Classification, Void> PARSER =
        new ConstructingObjectParser<>(
            NAME.getPreferredName(),
@ -70,7 +75,8 @@ public class Classification implements DataFrameAnalysis {
                (Double) a[8],
                (Integer) a[9],
                (Long) a[10],
-                (ClassAssignmentObjective) a[11]));
+                (ClassAssignmentObjective) a[11],
+                (List<PreProcessor>) a[12]));

    static {
        PARSER.declareString(ConstructingObjectParser.constructorArg(), DEPENDENT_VARIABLE);
@ -86,6 +92,10 @@ public class Classification implements DataFrameAnalysis {
        PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
        PARSER.declareString(
            ConstructingObjectParser.optionalConstructorArg(), ClassAssignmentObjective::fromString, CLASS_ASSIGNMENT_OBJECTIVE);
+        PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
+            (p, c, n) -> p.namedObject(PreProcessor.class, n, c),
+            (classification) -> {},
+            FEATURE_PROCESSORS);
    }

    private final String dependentVariable;
@ -100,12 +110,13 @@ public class Classification implements DataFrameAnalysis {
    private final ClassAssignmentObjective classAssignmentObjective;
    private final Integer numTopClasses;
    private final Long randomizeSeed;
+    private final List<PreProcessor> featureProcessors;

    private Classification(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
                           @Nullable Integer maxTrees, @Nullable Double featureBagFraction,
                           @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
                           @Nullable Double trainingPercent, @Nullable Integer numTopClasses, @Nullable Long randomizeSeed,
-                           @Nullable ClassAssignmentObjective classAssignmentObjective) {
+                           @Nullable ClassAssignmentObjective classAssignmentObjective, @Nullable List<PreProcessor> featureProcessors) {
        this.dependentVariable = Objects.requireNonNull(dependentVariable);
        this.lambda = lambda;
        this.gamma = gamma;
@ -118,6 +129,7 @@ public class Classification implements DataFrameAnalysis {
        this.classAssignmentObjective = classAssignmentObjective;
        this.numTopClasses = numTopClasses;
        this.randomizeSeed = randomizeSeed;
+        this.featureProcessors = featureProcessors;
    }

    @Override
@ -173,6 +185,10 @@ public class Classification implements DataFrameAnalysis {
        return numTopClasses;
    }

+    public List<PreProcessor> getFeatureProcessors() {
+        return featureProcessors;
+    }
+
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject();
@ -210,6 +226,9 @@ public class Classification implements DataFrameAnalysis {
        if (numTopClasses != null) {
            builder.field(NUM_TOP_CLASSES.getPreferredName(), numTopClasses);
        }
+        if (featureProcessors != null) {
+            NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
+        }
        builder.endObject();
        return builder;
    }
@ -217,7 +236,7 @@ public class Classification implements DataFrameAnalysis {
    @Override
    public int hashCode() {
        return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
-            predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective);
+            predictionFieldName, trainingPercent, randomizeSeed, numTopClasses, classAssignmentObjective, featureProcessors);
    }

    @Override
@ -236,7 +255,8 @@ public class Classification implements DataFrameAnalysis {
            && Objects.equals(trainingPercent, that.trainingPercent)
            && Objects.equals(randomizeSeed, that.randomizeSeed)
            && Objects.equals(numTopClasses, that.numTopClasses)
-            && Objects.equals(classAssignmentObjective, that.classAssignmentObjective);
+            && Objects.equals(classAssignmentObjective, that.classAssignmentObjective)
+            && Objects.equals(featureProcessors, that.featureProcessors);
    }

    @Override
@ -270,6 +290,7 @@ public class Classification implements DataFrameAnalysis {
        private Integer numTopClasses;
        private Long randomizeSeed;
        private ClassAssignmentObjective classAssignmentObjective;
+        private List<PreProcessor> featureProcessors;

        private Builder(String dependentVariable) {
            this.dependentVariable = Objects.requireNonNull(dependentVariable);
@ -330,10 +351,15 @@ public class Classification implements DataFrameAnalysis {
            return this;
        }

+        public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
+            this.featureProcessors = featureProcessors;
+            return this;
+        }
+
        public Classification build() {
            return new Classification(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
                numTopFeatureImportanceValues, predictionFieldName, trainingPercent, numTopClasses, randomizeSeed,
-                classAssignmentObjective);
+                classAssignmentObjective, featureProcessors);
        }
    }
 }
--- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java
+++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/dataframe/Regression.java
@ -18,6 +18,8 @@
 */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.NamedXContentObjectHelper;
+import org.elasticsearch.client.ml.inference.preprocessing.PreProcessor;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
@ -26,6 +28,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;

 import java.io.IOException;
+import java.util.List;
 import java.util.Locale;
 import java.util.Objects;

@ -55,7 +58,9 @@ public class Regression implements DataFrameAnalysis {
    static final ParseField RANDOMIZE_SEED = new ParseField("randomize_seed");
    static final ParseField LOSS_FUNCTION = new ParseField("loss_function");
    static final ParseField LOSS_FUNCTION_PARAMETER = new ParseField("loss_function_parameter");
+    static final ParseField FEATURE_PROCESSORS = new ParseField("feature_processors");

+    @SuppressWarnings("unchecked")
    private static final ConstructingObjectParser<Regression, Void> PARSER =
        new ConstructingObjectParser<>(
            NAME.getPreferredName(),
@ -72,7 +77,8 @@ public class Regression implements DataFrameAnalysis {
                (Double) a[8],
                (Long) a[9],
                (LossFunction) a[10],
-                (Double) a[11]
+                (Double) a[11],
+                (List<PreProcessor>) a[12]
            ));

    static {
@ -88,6 +94,10 @@ public class Regression implements DataFrameAnalysis {
        PARSER.declareLong(ConstructingObjectParser.optionalConstructorArg(), RANDOMIZE_SEED);
        PARSER.declareString(optionalConstructorArg(), LossFunction::fromString, LOSS_FUNCTION);
        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), LOSS_FUNCTION_PARAMETER);
+        PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(),
+            (p, c, n) -> p.namedObject(PreProcessor.class, n, c),
+            (regression) -> {},
+            FEATURE_PROCESSORS);
    }

    private final String dependentVariable;
@ -102,12 +112,13 @@ public class Regression implements DataFrameAnalysis {
    private final Long randomizeSeed;
    private final LossFunction lossFunction;
    private final Double lossFunctionParameter;
+    private final List<PreProcessor> featureProcessors;

    private Regression(String dependentVariable, @Nullable Double lambda, @Nullable Double gamma, @Nullable Double eta,
                       @Nullable Integer maxTrees, @Nullable Double featureBagFraction,
                       @Nullable Integer numTopFeatureImportanceValues, @Nullable String predictionFieldName,
                       @Nullable Double trainingPercent, @Nullable Long randomizeSeed, @Nullable LossFunction lossFunction,
-                       @Nullable Double lossFunctionParameter) {
+                       @Nullable Double lossFunctionParameter, @Nullable List<PreProcessor> featureProcessors) {
        this.dependentVariable = Objects.requireNonNull(dependentVariable);
        this.lambda = lambda;
        this.gamma = gamma;
@ -120,6 +131,7 @@ public class Regression implements DataFrameAnalysis {
        this.randomizeSeed = randomizeSeed;
        this.lossFunction = lossFunction;
        this.lossFunctionParameter = lossFunctionParameter;
+        this.featureProcessors = featureProcessors;
    }

    @Override
@ -175,6 +187,10 @@ public class Regression implements DataFrameAnalysis {
        return lossFunctionParameter;
    }

+    public List<PreProcessor> getFeatureProcessors() {
+        return featureProcessors;
+    }
+
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject();
@ -212,6 +228,9 @@ public class Regression implements DataFrameAnalysis {
        if (lossFunctionParameter != null) {
            builder.field(LOSS_FUNCTION_PARAMETER.getPreferredName(), lossFunctionParameter);
        }
+        if (featureProcessors != null) {
+            NamedXContentObjectHelper.writeNamedObjects(builder, params, true, FEATURE_PROCESSORS.getPreferredName(), featureProcessors);
+        }
        builder.endObject();
        return builder;
    }
@ -219,7 +238,7 @@ public class Regression implements DataFrameAnalysis {
    @Override
    public int hashCode() {
        return Objects.hash(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction, numTopFeatureImportanceValues,
-            predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
+            predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter, featureProcessors);
    }

    @Override
@ -238,7 +257,8 @@ public class Regression implements DataFrameAnalysis {
            && Objects.equals(trainingPercent, that.trainingPercent)
            && Objects.equals(randomizeSeed, that.randomizeSeed)
            && Objects.equals(lossFunction, that.lossFunction)
-            && Objects.equals(lossFunctionParameter, that.lossFunctionParameter);
+            && Objects.equals(lossFunctionParameter, that.lossFunctionParameter)
+            && Objects.equals(featureProcessors, that.featureProcessors);
    }

    @Override
@ -259,6 +279,7 @@ public class Regression implements DataFrameAnalysis {
        private Long randomizeSeed;
        private LossFunction lossFunction;
        private Double lossFunctionParameter;
+        private List<PreProcessor> featureProcessors;

        private Builder(String dependentVariable) {
            this.dependentVariable = Objects.requireNonNull(dependentVariable);
@ -319,9 +340,15 @@ public class Regression implements DataFrameAnalysis {
            return this;
        }

+        public Builder setFeatureProcessors(List<PreProcessor> featureProcessors) {
+            this.featureProcessors = featureProcessors;
+            return this;
+        }
+
        public Regression build() {
            return new Regression(dependentVariable, lambda, gamma, eta, maxTrees, featureBagFraction,
-                numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter);
+                numTopFeatureImportanceValues, predictionFieldName, trainingPercent, randomizeSeed, lossFunction, lossFunctionParameter,
+                featureProcessors);
        }
    }

--- a/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java
+++ b/client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java
@ -114,7 +114,7 @@ public class OneHotEncoding implements PreProcessor {
        return Objects.hash(field, hotMap, custom);
    }

-    public Builder builder(String field) {
+    public static Builder builder(String field) {
        return new Builder(field);
    }

--- a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java
+++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java
@ -179,6 +179,7 @@ import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
 import org.elasticsearch.client.ml.inference.TrainedModelDefinitionTests;
 import org.elasticsearch.client.ml.inference.TrainedModelInput;
 import org.elasticsearch.client.ml.inference.TrainedModelStats;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncoding;
 import org.elasticsearch.client.ml.inference.trainedmodel.RegressionConfig;
 import org.elasticsearch.client.ml.inference.trainedmodel.TargetType;
 import org.elasticsearch.client.ml.job.config.AnalysisConfig;
@ -3003,6 +3004,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
                .setRandomizeSeed(1234L) // <10>
                .setClassAssignmentObjective(Classification.ClassAssignmentObjective.MAXIMIZE_ACCURACY) // <11>
                .setNumTopClasses(1) // <12>
+                .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
+                    .addOneHot("cat", "cat_column")
+                    .build()))
                .build();
            // end::put-data-frame-analytics-classification

@ -3019,6 +3023,9 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
                .setRandomizeSeed(1234L) // <10>
                .setLossFunction(Regression.LossFunction.MSE) // <11>
                .setLossFunctionParameter(1.0) // <12>
+                .setFeatureProcessors(Arrays.asList(OneHotEncoding.builder("categorical_feature") // <13>
+                    .addOneHot("cat", "cat_column")
+                    .build()))
                .build();
            // end::put-data-frame-analytics-regression

--- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java
+++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/ClassificationTests.java
@ -18,10 +18,20 @@
 */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
+import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.test.AbstractXContentTestCase;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public class ClassificationTests extends AbstractXContentTestCase<Classification> {

@ -38,9 +48,20 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
            .setRandomizeSeed(randomBoolean() ? null : randomLong())
            .setClassAssignmentObjective(randomBoolean() ? null : randomFrom(Classification.ClassAssignmentObjective.values()))
            .setNumTopClasses(randomBoolean() ? null : randomIntBetween(0, 10))
+            .setFeatureProcessors(randomBoolean() ? null :
+                Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
+                    OneHotEncodingTests.createRandom(),
+                    TargetMeanEncodingTests.createRandom()))
+                    .limit(randomIntBetween(1, 10))
+                    .collect(Collectors.toList()))
            .build();
    }

+    @Override
+    protected Predicate<String> getRandomFieldsExcludeFilter() {
+        return field -> field.startsWith("feature_processors");
+    }
+
    @Override
    protected Classification createTestInstance() {
        return randomClassification();
@ -55,4 +76,11 @@ public class ClassificationTests extends AbstractXContentTestCase<Classification
    protected boolean supportsUnknownFields() {
        return true;
    }
+
+    @Override
+    protected NamedXContentRegistry xContentRegistry() {
+        List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
+        return new NamedXContentRegistry(namedXContent);
+    }
 }
--- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java
+++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/DataFrameAnalyticsConfigTests.java
@ -20,6 +20,7 @@
 package org.elasticsearch.client.ml.dataframe;

 import org.elasticsearch.Version;
+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
@ -101,6 +102,7 @@ public class DataFrameAnalyticsConfigTests extends AbstractXContentTestCase<Data
        List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
        namedXContent.addAll(new SearchModule(Settings.EMPTY, false, Collections.emptyList()).getNamedXContents());
        namedXContent.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedXContentParsers());
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
        return new NamedXContentRegistry(namedXContent);
    }
 }
--- a/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java
+++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/ml/dataframe/RegressionTests.java
@ -18,10 +18,20 @@
 */
 package org.elasticsearch.client.ml.dataframe;

+import org.elasticsearch.client.ml.inference.MlInferenceNamedXContentProvider;
+import org.elasticsearch.client.ml.inference.preprocessing.FrequencyEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.OneHotEncodingTests;
+import org.elasticsearch.client.ml.inference.preprocessing.TargetMeanEncodingTests;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.test.AbstractXContentTestCase;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;

 public class RegressionTests extends AbstractXContentTestCase<Regression> {

@ -37,9 +47,20 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
            .setTrainingPercent(randomBoolean() ? null : randomDoubleBetween(1.0, 100.0, true))
            .setLossFunction(randomBoolean() ? null : randomFrom(Regression.LossFunction.values()))
            .setLossFunctionParameter(randomBoolean() ? null : randomDoubleBetween(1.0, Double.MAX_VALUE, true))
+            .setFeatureProcessors(randomBoolean() ? null :
+                Stream.generate(() -> randomFrom(FrequencyEncodingTests.createRandom(),
+                    OneHotEncodingTests.createRandom(),
+                    TargetMeanEncodingTests.createRandom()))
+                    .limit(randomIntBetween(1, 10))
+                    .collect(Collectors.toList()))
            .build();
    }

+    @Override
+    protected Predicate<String> getRandomFieldsExcludeFilter() {
+        return field -> field.startsWith("feature_processors");
+    }
+
    @Override
    protected Regression createTestInstance() {
        return randomRegression();
@ -54,4 +75,11 @@ public class RegressionTests extends AbstractXContentTestCase<Regression> {
    protected boolean supportsUnknownFields() {
        return true;
    }
+
+    @Override
+    protected NamedXContentRegistry xContentRegistry() {
+        List<NamedXContentRegistry.Entry> namedXContent = new ArrayList<>();
+        namedXContent.addAll(new MlInferenceNamedXContentProvider().getNamedXContentParsers());
+        return new NamedXContentRegistry(namedXContent);
+    }
 }
--- a/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc
+++ b/docs/java-rest/high-level/ml/put-data-frame-analytics.asciidoc
@ -124,6 +124,8 @@ include-tagged::{doc-tests-file}[{api}-classification]
 <10> The seed to be used by the random generator that picks which rows are used in training.
 <11> The optimization objective to target when assigning class labels. Defaults to `maximize_minimum_recall`.
 <12> The number of top classes to be reported in the results. Defaults to 2.
+<13> Custom feature processors that will create new features for analysis from the included document
+     fields. Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.

 ===== Regression

@ -146,6 +148,8 @@ include-tagged::{doc-tests-file}[{api}-regression]
 <10> The seed to be used by the random generator that picks which rows are used in training.
 <11> The loss function used for regression. Defaults to `mse`.
 <12> An optional parameter to the loss function.
+<13> Custom feature processors that will create new features for analysis from the included document
+fields. Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs for all features.

 ==== Analyzed fields

--- a/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc
+++ b/docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc
@ -25,7 +25,7 @@ If the {es} {security-features} are enabled, you must have the following built-i
 * `machine_learning_admin`
 * source indices: `read`, `view_index_metadata`
 * destination index: `read`, `create_index`, `manage` and `index`
-  
+
 For more information, see <<built-in-roles>>, <<security-privileges>>, and
 {ml-docs-setup-privileges}.

@ -33,20 +33,20 @@ For more information, see <<built-in-roles>>, <<security-privileges>>, and
 NOTE: The {dfanalytics-job} remembers which roles the user who created it had at
 the time of creation. When you start the job, it performs the analysis using
 those same roles. If you provide
-<<http-clients-secondary-authorization,secondary authorization headers>>, 
+<<http-clients-secondary-authorization,secondary authorization headers>>,
 those credentials are used instead.

 [[ml-put-dfanalytics-desc]]
 == {api-description-title}

-This API creates a {dfanalytics-job} that performs an analysis on the source 
+This API creates a {dfanalytics-job} that performs an analysis on the source
 indices and stores the outcome in a destination index.

 If the destination index does not exist, it is created automatically when you
 start the job. See <<start-dfanalytics>>.

 If you supply only a subset of the {regression} or {classification} parameters,
-{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It 
+{ml-docs}/hyperparameters.html[hyperparameter optimization] occurs. It
 determines a value for each of the undefined parameters.

 [[ml-put-dfanalytics-path-params]]
@ -61,9 +61,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-data-frame-analytics-def
 == {api-request-body-title}

 `allow_lazy_start`::
-(Optional, boolean) 
-Specifies whether this job can start when there is insufficient {ml} node 
-capacity for it to be immediately assigned to a node. The default is `false`; if 
+(Optional, boolean)
+Specifies whether this job can start when there is insufficient {ml} node
+capacity for it to be immediately assigned to a node. The default is `false`; if
 a {ml} node with capacity to run the job cannot immediately be found, the
 <<start-dfanalytics>> API returns an error. However, this is also subject to the
 cluster-wide `xpack.ml.max_lazy_ml_nodes` setting. See <<advanced-ml-settings>>.
@ -86,7 +86,7 @@ one of the following types of analysis: {classification}, {oldetection}, or
 The configuration information necessary to perform
 {ml-docs}/dfa-classification.html[{classification}].
 +
-TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set 
+TIP: Advanced parameters are for fine-tuning {classanalysis}. They are set
 automatically by hyperparameter optimization to give the minimum validation
 error. It is highly recommended to use the default values unless you fully
 understand the function of these parameters.
@ -108,23 +108,27 @@ categorical (`ip` or `keyword`), or boolean. There must be no more than 30
 different values in this field.

 `eta`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]

 `feature_bag_fraction`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]

+`feature_processors`::::
+(Optional, list)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
+
 `gamma`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]

 `lambda`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]

 `max_trees`::::
-(Optional, integer) 
+(Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]

 `num_top_classes`::::
@ -136,11 +140,11 @@ categories, the API reports all category probabilities. Defaults to 2.
 `num_top_feature_importance_values`::::
 (Optional, integer)
 Advanced configuration option. Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. 
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
 By default, it is zero and no {feat-imp} calculation occurs.

 `prediction_field_name`::::
-(Optional, string) 
+(Optional, string)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=prediction-field-name]

 `randomize_seed`::::
@ -164,23 +168,23 @@ The configuration information necessary to perform
 `compute_feature_influence`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=compute-feature-influence]
-  
-`feature_influence_threshold`:::: 
+
+`feature_influence_threshold`::::
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold]

 `method`::::
 (Optional, string)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=method]
-  
+
 `n_neighbors`::::
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=n-neighbors]
-  
+
 `outlier_fraction`::::
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=outlier-fraction]
-  
+
 `standardization_enabled`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
@ -192,7 +196,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=standardization-enabled]
 The configuration information necessary to perform
 {ml-docs}/dfa-regression.html[{regression}].
 +
-TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set 
+TIP: Advanced parameters are for fine-tuning {reganalysis}. They are set
 automatically by hyperparameter optimization to give minimum validation error.
 It is highly recommended to use the default values unless you fully understand
 the function of these parameters.
@ -215,20 +219,24 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=eta]
 (Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction]

+`feature_processors`::::
+(Optional, list)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dfas-feature-processors]
+
 `gamma`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=gamma]

 `lambda`::::
-(Optional, double) 
+(Optional, double)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=lambda]

 `loss_function`::::
 (Optional, string)
-The loss function used during {regression}. Available options are `mse` (mean 
-squared error), `msle` (mean squared logarithmic error),  `huber` (Pseudo-Huber 
-loss). Defaults to `mse`. Refer to 
-{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses] 
+The loss function used during {regression}. Available options are `mse` (mean
+squared error), `msle` (mean squared logarithmic error), and `huber` (Pseudo-Huber
+loss). Defaults to `mse`. Refer to
+{ml-docs}/dfa-regression.html#dfa-regression-lossfunction[Loss functions for {regression} analyses]
 to learn more.

 `loss_function_parameter`::::
@ -236,13 +244,13 @@ to learn more.
 A positive number that is used as a parameter to the `loss_function`.

 `max_trees`::::
-(Optional, integer) 
+(Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=max-trees]

 `num_top_feature_importance_values`::::
 (Optional, integer)
 Advanced configuration option. Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return. 
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document to return.
 By default, it is zero and no {feat-imp} calculation occurs.

 `prediction_field_name`::::
@ -264,31 +272,31 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=training-percent]
 //Begin analyzed_fields
 `analyzed_fields`::
 (Optional, object)
-Specify `includes` and/or `excludes` patterns to select which fields will be 
-included in the analysis. The patterns specified in `excludes` are applied last, 
-therefore `excludes` takes precedence. In other words, if the same field is 
-specified in both `includes` and `excludes`, then the field will not be included 
+Specify `includes` and/or `excludes` patterns to select which fields will be
+included in the analysis. The patterns specified in `excludes` are applied last,
+therefore `excludes` takes precedence. In other words, if the same field is
+specified in both `includes` and `excludes`, then the field will not be included
 in the analysis.
 +
 --
 [[dfa-supported-fields]]
 The supported fields for each type of analysis are as follows:

-* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms 
-don't support missing values therefore fields that have data types other than 
-numeric or boolean are ignored. Documents where included fields contain missing 
-values, null values, or an array are also ignored. Therefore the `dest` index 
+* {oldetection-cap} requires numeric or boolean data to analyze. The algorithms
+don't support missing values therefore fields that have data types other than
+numeric or boolean are ignored. Documents where included fields contain missing
+values, null values, or an array are also ignored. Therefore the `dest` index
 may contain documents that don't have an {olscore}.
-* {regression-cap} supports fields that are numeric, `boolean`, `text`, 
-`keyword`, and `ip`. It is also tolerant of missing values. Fields that are 
-supported are included in the analysis, other fields are ignored. Documents 
-where included fields contain  an array with two or more values are also 
-ignored. Documents in the `dest` index  that don’t contain a results field are 
+* {regression-cap} supports fields that are numeric, `boolean`, `text`,
+`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
+supported are included in the analysis, other fields are ignored. Documents
+where included fields contain an array with two or more values are also
+ignored. Documents in the `dest` index that don’t contain a results field are
 not included in the {reganalysis}.
 * {classification-cap} supports fields that are numeric, `boolean`, `text`,
-`keyword`, and `ip`. It is also tolerant of missing values. Fields that are 
+`keyword`, and `ip`. It is also tolerant of missing values. Fields that are
 supported are included in the analysis, other fields are ignored. Documents
-where included fields contain an array with two or more values are also ignored. 
+where included fields contain an array with two or more values are also ignored.
 Documents in the `dest` index that don’t contain a results field are not
 included in the {classanalysis}. {classanalysis-cap} can be improved by mapping
 ordinal variable values to a single number. For example, in case of age ranges,
@ -310,7 +318,7 @@ analysis. You do not need to add fields with unsupported data types to

 `includes`:::
 (Optional, array)
-An array of strings that defines the fields that will be included in the 
+An array of strings that defines the fields that will be included in the
 analysis.
 //End analyzed_fields
 ====
@ -330,16 +338,16 @@ The default value is `1`. Using more threads may decrease the time
 necessary to complete the analysis at the cost of using more CPU.
 Note that the process may use additional threads for operational
 functionality other than the analysis itself.
-  
+
 `model_memory_limit`::
 (Optional, string)
-The approximate maximum amount of memory resources that are permitted for 
-analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If 
-your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit` 
-setting, an error occurs when you try to create {dfanalytics-jobs} that have 
-`model_memory_limit` values greater than that setting. For more information, see 
+The approximate maximum amount of memory resources that are permitted for
+analytical processing. The default value for {dfanalytics-jobs} is `1gb`. If
+your `elasticsearch.yml` file contains an `xpack.ml.max_model_memory_limit`
+setting, an error occurs when you try to create {dfanalytics-jobs} that have
+`model_memory_limit` values greater than that setting. For more information, see
 <<ml-settings>>.
-  
+
 `source`::
 (object)
 The configuration of how to source the analysis data. It requires an `index`.
@ -353,7 +361,7 @@ Optionally, `query` and `_source` may be specified.
 It can be a single index or index pattern as well as an array of indices or
 patterns.
 +
-WARNING: If your source indices contain documents with the same IDs, only the 
+WARNING: If your source indices contain documents with the same IDs, only the
 document that is indexed last appears in the destination index.

 `query`:::
@ -374,7 +382,7 @@ included in the analysis.
 `includes`::::
 (array) An array of strings that defines the fields that will be included in the
 destination.
-        
+
 `excludes`::::
 (array) An array of strings that defines the fields that will be excluded from
 the destination.
@ -390,8 +398,8 @@ the destination.
 [[ml-put-dfanalytics-example-preprocess]]
 === Preprocessing actions example

-The following example shows how to limit the scope of the analysis to certain 
-fields, specify excluded fields in the destination index, and use a query to 
+The following example shows how to limit the scope of the analysis to certain
+fields, specify excluded fields in the destination index, and use a query to
 filter your data before analysis.

 [source,console]
@ -404,7 +412,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
    ],
    "query": { <2>
      "range": {
-        "DistanceKilometers": { 
+        "DistanceKilometers": {
          "gt": 0
        }
      }
@ -429,7 +437,7 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
  },
  "analyzed_fields": { <5>
    "includes": [],
-    "excludes": [   
+    "excludes": [
      "FlightNum"
    ]
  },
@ -439,29 +447,29 @@ PUT _ml/data_frame/analytics/model-flight-delays-pre
 // TEST[skip:setup kibana sample data]

 <1> Source index to analyze.
-<2> This query filters out entire documents that will not be present in the 
+<2> This query filters out entire documents that will not be present in the
 destination index.
-<3> The `_source` object defines fields in the dataset that will be included or 
-excluded in the destination index. 
-<4> Defines the destination index that contains the results of the analysis and 
-the fields of the source index specified in the `_source` object. Also defines 
+<3> The `_source` object defines fields in the dataset that will be included or
+excluded in the destination index.
+<4> Defines the destination index that contains the results of the analysis and
+the fields of the source index specified in the `_source` object. Also defines
 the name of the `results_field`.
-<5> Specifies fields to be included in or excluded from the analysis. This does 
-not affect whether the fields will be present in the destination index, only 
+<5> Specifies fields to be included in or excluded from the analysis. This does
+not affect whether the fields will be present in the destination index, only
 affects whether they are used in the analysis.

-In this example, we can see that all the fields of the source index are included 
-in the destination index except `FlightDelay` and `FlightDelayType` because 
-these are defined as excluded fields by the `excludes` parameter of the 
-`_source` object. The `FlightNum` field is included in the destination index, 
-however it is not included in the analysis because it is explicitly specified as 
+In this example, we can see that all the fields of the source index are included
+in the destination index except `FlightDelay` and `FlightDelayType` because
+these are defined as excluded fields by the `excludes` parameter of the
+`_source` object. The `FlightNum` field is included in the destination index,
+however it is not included in the analysis because it is explicitly specified as
 an excluded field by the `excludes` parameter of the `analyzed_fields` object.


 [[ml-put-dfanalytics-example-od]]
 === {oldetection-cap} example

-The following example creates the `loganalytics` {dfanalytics-job}, the analysis 
+The following example creates the `loganalytics` {dfanalytics-job}, the analysis
 type is `outlier_detection`:

 [source,console]
@ -525,7 +533,7 @@ The API returns the following result:
 [[ml-put-dfanalytics-example-r]]
 === {regression-cap} examples

-The following example creates the `house_price_regression_analysis` 
+The following example creates the `house_price_regression_analysis`
 {dfanalytics-job}, the analysis type is `regression`:

 [source,console]
@ -538,7 +546,7 @@ PUT _ml/data_frame/analytics/house_price_regression_analysis
  "dest": {
    "index": "house_price_predictions"
  },
-  "analysis": 
+  "analysis":
    {
      "regression": {
        "dependent_variable": "price"
@ -614,7 +622,7 @@ PUT _ml/data_frame/analytics/student_performance_mathematics_0.3
 [[ml-put-dfanalytics-example-c]]
 === {classification-cap} example

-The following example creates the `loan_classification` {dfanalytics-job}, the 
+The following example creates the `loan_classification` {dfanalytics-job}, the
 analysis type is `classification`:

 [source,console]
--- a/docs/reference/ml/ml-shared.asciidoc
+++ b/docs/reference/ml/ml-shared.asciidoc
@ -453,10 +453,10 @@ Defaults to `true`.
 end::delayed-data-check-config[]

 tag::dependent-variable[]
-Defines which field of the document is to be predicted. 
-This parameter is supplied by field name and must match one of the fields in 
-the index being used to train. If this field is missing from a document, then 
-that document will not be used for training, but a prediction with the trained 
+Defines which field of the document is to be predicted.
+This parameter is supplied by field name and must match one of the fields in
+the index being used to train. If this field is missing from a document, then
+that document will not be used for training, but a prediction with the trained
 model will be generated for it. It is also known as continuous target variable.
 end::dependent-variable[]

@ -513,10 +513,18 @@ The value of the downsample factor.
 end::dfas-downsample-factor[]

 tag::dfas-eta-growth[]
-Specifies the rate at which the `eta` increases for each new tree that is added to the 
+Specifies the rate at which the `eta` increases for each new tree that is added to the
 forest. For example, a rate of `1.05` increases `eta` by 5%.
 end::dfas-eta-growth[]

+tag::dfas-feature-processors[]
+A collection of feature preprocessors that modify one or more included fields.
+The analysis uses the resulting one or more features instead of the
+original document field. Multiple `feature_processors` entries can refer to the
+same document fields.
+Note that automatic categorical {ml-docs}/ml-feature-encoding.html[feature encoding] still occurs.
+end::dfas-feature-processors[]
+
 tag::dfas-iteration[]
 The number of iterations on the analysis.
 end::dfas-iteration[]
@ -529,9 +537,9 @@ training stops.
 end::dfas-max-attempts[]

 tag::dfas-max-optimization-rounds[]
-A multiplier responsible for determining the maximum number of 
-hyperparameter optimization steps in the Bayesian optimization procedure. 
-The maximum number of steps is determined based on the number of undefined hyperparameters 
+A multiplier responsible for determining the maximum number of
+hyperparameter optimization steps in the Bayesian optimization procedure.
+The maximum number of steps is determined based on the number of undefined hyperparameters
 times the maximum optimization rounds per hyperparameter.
 end::dfas-max-optimization-rounds[]

@ -595,10 +603,10 @@ functions that are tolerant to gaps in data such as `mean`, `non_null_sum` or
 end::empty-bucket-count[]

 tag::eta[]
-Advanced configuration option. The shrinkage applied to the weights. Smaller 
+Advanced configuration option. The shrinkage applied to the weights. Smaller
 values result in larger forests which have a better generalization error.
 However, the smaller the value the longer the training will take. For more
-information, about shrinkage, see 
+information about shrinkage, see
 {wikipedia}/Gradient_boosting#Shrinkage[this wiki article]. By
 default, this value is calculated during hyperparameter optimization.
 end::eta[]
@ -624,13 +632,13 @@ this value to determine the number of unique categories that were missed.
 end::failed-category-count[]

 tag::feature-bag-fraction[]
-Advanced configuration option. Defines the fraction of features that will be 
+Advanced configuration option. Defines the fraction of features that will be
 used when selecting a random bag for each candidate split. By default, this
 value is calculated during hyperparameter optimization.
 end::feature-bag-fraction[]

 tag::feature-influence-threshold[]
-The minimum {olscore} that a document needs to have to calculate its feature 
+The minimum {olscore} that a document needs to have to calculate its feature
 influence score. Value range: 0-1 (`0.1` by default).
 end::feature-influence-threshold[]

@ -675,10 +683,10 @@ The analysis function that is used. For example, `count`, `rare`, `mean`, `min`,
 end::function[]

 tag::gamma[]
-Advanced configuration option. Regularization parameter to prevent overfitting 
-on the training data set. Multiplies a linear penalty associated with the size of 
-individual trees in the forest. The higher the value the more training will 
-prefer smaller trees. The smaller this parameter the larger individual trees 
+Advanced configuration option. Regularization parameter to prevent overfitting
+on the training data set. Multiplies a linear penalty associated with the size of
+individual trees in the forest. The higher the value the more training will
+prefer smaller trees. The smaller this parameter the larger individual trees
 will be and the longer training will take. By default, this value is calculated
 during hyperparameter optimization.
 end::gamma[]
@ -798,8 +806,8 @@ information for all {anomaly-jobs}.
 end::job-id-anomaly-detection-default[]

 tag::job-id-anomaly-detection-define[]
-Identifier for the {anomaly-job}. This identifier can contain lowercase 
-alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start 
+Identifier for the {anomaly-job}. This identifier can contain lowercase
+alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
 and end with alphanumeric characters.
 end::job-id-anomaly-detection-define[]

@ -843,12 +851,12 @@ For more information, see <<ml-jobstats>>.
 end::jobs-stats-anomaly-detection[]

 tag::lambda[]
-Advanced configuration option. Regularization parameter to prevent overfitting 
-on the training data set. Multiplies an L2 regularisation term which applies to 
-leaf weights of the individual trees in the forest. The higher the value the 
-more training will attempt to keep leaf weights small. This makes the prediction  
-function smoother at the expense of potentially not being able to capture 
-relevant relationships between the features and the {depvar}. The smaller this 
+Advanced configuration option. Regularization parameter to prevent overfitting
+on the training data set. Multiplies an L2 regularisation term which applies to
+leaf weights of the individual trees in the forest. The higher the value the
+more training will attempt to keep leaf weights small. This makes the prediction
+function smoother at the expense of potentially not being able to capture
+relevant relationships between the features and the {depvar}. The smaller this
 parameter the larger individual trees will be and the longer training will take.
 By default, this value is calculated during hyperparameter optimization.
 end::lambda[]
@ -1098,8 +1106,8 @@ For open jobs only, the elapsed time for which the job has been open.
 end::open-time[]

 tag::outlier-fraction[]
-The proportion of the data set that is assumed to be outlying prior to 
-{oldetection}. For example, 0.05 means it is assumed that 5% of values are real 
+The proportion of the data set that is assumed to be outlying prior to
+{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
 outliers and 95% are inliers.
 end::outlier-fraction[]

@ -1185,7 +1193,7 @@ tag::randomize-seed[]
 Defines the seed to the random generator that is used to pick
 which documents will be used for training. By default it is randomly generated.
 Set it to a specific value to ensure the same documents are used for training
-assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are 
+assuming other related parameters (e.g. `source`, `analyzed_fields`, etc.) are
 the same.
 end::randomize-seed[]

@ -1264,8 +1272,8 @@ end::sparse-bucket-count[]

 tag::standardization-enabled[]
 If `true`, the following operation is performed on the columns before computing
-{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For 
-more information about this concept, see 
+{olscores}: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For
+more information about this concept, see
 https://en.wikipedia.org/wiki/Feature_scaling#Standardization_(Z-score_Normalization)[Wikipedia].
 end::standardization-enabled[]

@ -1340,12 +1348,12 @@ when the mode is set to `manual`. For example: `3h`.
 end::time-span[]

 tag::timeout-start[]
-Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults 
+Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
 to 20 seconds.
 end::timeout-start[]

 tag::timeout-stop[]
-Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults 
+Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
 to 20 seconds.
 end::timeout-stop[]