[ML] Increase tokenization flexibility for categorization (elastic/x-pack-elasticsearch#3372)
By moving tokenization for categorization to Java we give users access to considerably more options for tokenizing their log messages prior to using ML to categorize them. Now all Elasticsearch analyzer functionality is available, which opens up the possibility to sensibly categorize non-English log messages.

Relates elastic/machine-learning-cpp#491
Original commit: elastic/x-pack-elasticsearch@5d61b67614

parent 992a7af126
commit a386b5727e
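As a rough illustration of what the change enables (a sketch only: the Builder methods come from the new CategorizationAnalyzerConfig class added below, the "thai"/"html_strip"/"lowercase" choices are arbitrary examples, and analysisConfigBuilder stands in for a job's AnalysisConfig.Builder):

    // Build a categorization analyzer using any Elasticsearch tokenizer/filters.
    CategorizationAnalyzerConfig analyzerConfig = new CategorizationAnalyzerConfig.Builder()
            .addCharFilter("html_strip")      // strip markup before tokenizing
            .setTokenizer("thai")             // non-English tokenization is now possible
            .addTokenFilter("lowercase")
            .build();

    // Attach it to the job's analysis_config (hypothetical builder variable).
    analysisConfigBuilder.setCategorizationAnalyzerConfig(analyzerConfig);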
|
@ -5,6 +5,7 @@
|
|||
*/
|
||||
package org.elasticsearch.xpack.ml.job.config;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
|
@ -12,6 +13,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
|
|||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.xpack.ml.MlParserType;
|
||||
|
@ -54,10 +56,11 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
/**
|
||||
* Serialisation names
|
||||
*/
|
||||
private static final ParseField ANALYSIS_CONFIG = new ParseField("analysis_config");
|
||||
public static final ParseField ANALYSIS_CONFIG = new ParseField("analysis_config");
|
||||
private static final ParseField BUCKET_SPAN = new ParseField("bucket_span");
|
||||
private static final ParseField CATEGORIZATION_FIELD_NAME = new ParseField("categorization_field_name");
|
||||
static final ParseField CATEGORIZATION_FILTERS = new ParseField("categorization_filters");
|
||||
private static final ParseField CATEGORIZATION_ANALYZER = CategorizationAnalyzerConfig.CATEGORIZATION_ANALYZER;
|
||||
private static final ParseField LATENCY = new ParseField("latency");
|
||||
private static final ParseField SUMMARY_COUNT_FIELD_NAME = new ParseField("summary_count_field_name");
|
||||
private static final ParseField DETECTORS = new ParseField("detectors");
|
||||
|
@ -97,6 +100,11 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
builder.setBucketSpan(TimeValue.parseTimeValue(val, BUCKET_SPAN.getPreferredName())), BUCKET_SPAN);
|
||||
parser.declareString(Builder::setCategorizationFieldName, CATEGORIZATION_FIELD_NAME);
|
||||
parser.declareStringArray(Builder::setCategorizationFilters, CATEGORIZATION_FILTERS);
|
||||
// This one is nasty - the syntax for analyzers takes either names or objects at many levels, hence it's not
// possible to simply declare whether the field is a string or an object, so a completely custom parser is required
|
||||
parser.declareField(Builder::setCategorizationAnalyzerConfig,
|
||||
(p, c) -> CategorizationAnalyzerConfig.buildFromXContentFragment(p, parserType),
|
||||
CATEGORIZATION_ANALYZER, ObjectParser.ValueType.OBJECT_OR_STRING);
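// For illustration, both of the following forms are accepted by the custom parser above (see the
// CategorizationAnalyzerConfig javadoc later in this commit; the concrete values are examples only):
//   "categorization_analyzer" : "standard"
//   "categorization_analyzer" : { "tokenizer" : "ml_classic", "token_filters" : [ "lowercase" ] }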
|
||||
parser.declareString((builder, val) ->
|
||||
builder.setLatency(TimeValue.parseTimeValue(val, LATENCY.getPreferredName())), LATENCY);
|
||||
parser.declareString(Builder::setSummaryCountFieldName, SUMMARY_COUNT_FIELD_NAME);
|
||||
|
@ -117,6 +125,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
private final TimeValue bucketSpan;
|
||||
private final String categorizationFieldName;
|
||||
private final List<String> categorizationFilters;
|
||||
private final CategorizationAnalyzerConfig categorizationAnalyzerConfig;
|
||||
private final TimeValue latency;
|
||||
private final String summaryCountFieldName;
|
||||
private final List<Detector> detectors;
|
||||
|
@ -128,13 +137,14 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
private final boolean usePerPartitionNormalization;
|
||||
|
||||
private AnalysisConfig(TimeValue bucketSpan, String categorizationFieldName, List<String> categorizationFilters,
|
||||
TimeValue latency, String summaryCountFieldName, List<Detector> detectors,
|
||||
List<String> influencers, Boolean overlappingBuckets, Long resultFinalizationWindow,
|
||||
CategorizationAnalyzerConfig categorizationAnalyzerConfig, TimeValue latency, String summaryCountFieldName,
|
||||
List<Detector> detectors, List<String> influencers, Boolean overlappingBuckets, Long resultFinalizationWindow,
|
||||
Boolean multivariateByFields, List<TimeValue> multipleBucketSpans, boolean usePerPartitionNormalization) {
|
||||
this.detectors = detectors;
|
||||
this.bucketSpan = bucketSpan;
|
||||
this.latency = latency;
|
||||
this.categorizationFieldName = categorizationFieldName;
|
||||
this.categorizationAnalyzerConfig = categorizationAnalyzerConfig;
|
||||
this.categorizationFilters = categorizationFilters;
|
||||
this.summaryCountFieldName = summaryCountFieldName;
|
||||
this.influencers = influencers;
|
||||
|
@ -149,6 +159,12 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
bucketSpan = new TimeValue(in);
|
||||
categorizationFieldName = in.readOptionalString();
|
||||
categorizationFilters = in.readBoolean() ? in.readList(StreamInput::readString) : null;
|
||||
// TODO: change to 6.2.0 after backporting
|
||||
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
categorizationAnalyzerConfig = in.readOptionalWriteable(CategorizationAnalyzerConfig::new);
|
||||
} else {
|
||||
categorizationAnalyzerConfig = null;
|
||||
}
|
||||
latency = in.readOptionalWriteable(TimeValue::new);
|
||||
summaryCountFieldName = in.readOptionalString();
|
||||
detectors = in.readList(Detector::new);
|
||||
|
@ -170,6 +186,10 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
} else {
|
||||
out.writeBoolean(false);
|
||||
}
|
||||
// TODO: change to 6.2.0 after backporting
|
||||
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
out.writeOptionalWriteable(categorizationAnalyzerConfig);
|
||||
}
|
||||
out.writeOptionalWriteable(latency);
|
||||
out.writeOptionalString(summaryCountFieldName);
|
||||
out.writeList(detectors);
|
||||
|
@ -203,6 +223,10 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
return categorizationFilters;
|
||||
}
|
||||
|
||||
public CategorizationAnalyzerConfig getCategorizationAnalyzerConfig() {
|
||||
return categorizationAnalyzerConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* The latency interval during which out-of-order records should be handled.
|
||||
*
|
||||
|
@ -364,6 +388,12 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
if (categorizationFilters != null) {
|
||||
builder.field(CATEGORIZATION_FILTERS.getPreferredName(), categorizationFilters);
|
||||
}
|
||||
if (categorizationAnalyzerConfig != null) {
|
||||
// This cannot be builder.field(CATEGORIZATION_ANALYZER.getPreferredName(), categorizationAnalyzerConfig, params);
|
||||
// because that always writes categorizationAnalyzerConfig as an object, and in the case of a global analyzer it
|
||||
// gets written as a single string.
|
||||
categorizationAnalyzerConfig.toXContent(builder, params);
|
||||
}
|
||||
if (latency != null) {
|
||||
builder.field(LATENCY.getPreferredName(), latency.getStringRep());
|
||||
}
|
||||
|
@ -406,6 +436,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
Objects.equals(bucketSpan, that.bucketSpan) &&
|
||||
Objects.equals(categorizationFieldName, that.categorizationFieldName) &&
|
||||
Objects.equals(categorizationFilters, that.categorizationFilters) &&
|
||||
Objects.equals(categorizationAnalyzerConfig, that.categorizationAnalyzerConfig) &&
|
||||
Objects.equals(summaryCountFieldName, that.summaryCountFieldName) &&
|
||||
Objects.equals(detectors, that.detectors) &&
|
||||
Objects.equals(influencers, that.influencers) &&
|
||||
|
@ -418,7 +449,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(
|
||||
bucketSpan, categorizationFieldName, categorizationFilters, latency,
|
||||
bucketSpan, categorizationFieldName, categorizationFilters, categorizationAnalyzerConfig, latency,
|
||||
summaryCountFieldName, detectors, influencers, overlappingBuckets, resultFinalizationWindow,
|
||||
multivariateByFields, multipleBucketSpans, usePerPartitionNormalization
|
||||
);
|
||||
|
@ -433,6 +464,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
private TimeValue latency;
|
||||
private String categorizationFieldName;
|
||||
private List<String> categorizationFilters;
|
||||
private CategorizationAnalyzerConfig categorizationAnalyzerConfig;
|
||||
private String summaryCountFieldName;
|
||||
private List<String> influencers = new ArrayList<>();
|
||||
private Boolean overlappingBuckets;
|
||||
|
@ -451,6 +483,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
this.latency = analysisConfig.latency;
|
||||
this.categorizationFieldName = analysisConfig.categorizationFieldName;
|
||||
this.categorizationFilters = analysisConfig.categorizationFilters;
|
||||
this.categorizationAnalyzerConfig = analysisConfig.categorizationAnalyzerConfig;
|
||||
this.summaryCountFieldName = analysisConfig.summaryCountFieldName;
|
||||
this.influencers = analysisConfig.influencers;
|
||||
this.overlappingBuckets = analysisConfig.overlappingBuckets;
|
||||
|
@ -492,6 +525,10 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
this.categorizationFilters = categorizationFilters;
|
||||
}
|
||||
|
||||
public void setCategorizationAnalyzerConfig(CategorizationAnalyzerConfig categorizationAnalyzerConfig) {
|
||||
this.categorizationAnalyzerConfig = categorizationAnalyzerConfig;
|
||||
}
|
||||
|
||||
public void setSummaryCountFieldName(String summaryCountFieldName) {
|
||||
this.summaryCountFieldName = summaryCountFieldName;
|
||||
}
|
||||
|
@ -544,6 +581,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
Detector.Builder.verifyFieldName(categorizationFieldName);
|
||||
|
||||
verifyMlCategoryIsUsedWhenCategorizationFieldNameIsSet();
|
||||
verifyCategorizationAnalyzer();
|
||||
verifyCategorizationFilters();
|
||||
checkFieldIsNotNegativeIfSpecified(RESULT_FINALIZATION_WINDOW.getPreferredName(), resultFinalizationWindow);
|
||||
verifyMultipleBucketSpans();
|
||||
|
@ -559,7 +597,7 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
|
||||
verifyNoInconsistentNestedFieldNames();
|
||||
|
||||
return new AnalysisConfig(bucketSpan, categorizationFieldName, categorizationFilters,
|
||||
return new AnalysisConfig(bucketSpan, categorizationFieldName, categorizationFilters, categorizationAnalyzerConfig,
|
||||
latency, summaryCountFieldName, detectors, influencers, overlappingBuckets,
|
||||
resultFinalizationWindow, multivariateByFields, multipleBucketSpans, usePerPartitionNormalization);
|
||||
}
|
||||
|
@ -622,17 +660,40 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
|
|||
}
|
||||
}
|
||||
|
||||
private void verifyCategorizationAnalyzer() {
|
||||
if (categorizationAnalyzerConfig == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
verifyCategorizationFieldNameSetIfAnalyzerIsSet();
|
||||
}
|
||||
|
||||
private void verifyCategorizationFieldNameSetIfAnalyzerIsSet() {
|
||||
if (categorizationFieldName == null) {
|
||||
throw ExceptionsHelper.badRequestException(Messages.getMessage(
|
||||
Messages.JOB_CONFIG_CATEGORIZATION_ANALYZER_REQUIRES_CATEGORIZATION_FIELD_NAME));
|
||||
}
|
||||
}
|
||||
|
||||
private void verifyCategorizationFilters() {
|
||||
if (categorizationFilters == null || categorizationFilters.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
verifyCategorizationAnalyzerNotSetIfFiltersAreSet();
|
||||
verifyCategorizationFieldNameSetIfFiltersAreSet();
|
||||
verifyCategorizationFiltersAreDistinct();
|
||||
verifyCategorizationFiltersContainNoneEmpty();
|
||||
verifyCategorizationFiltersAreValidRegex();
|
||||
}
|
||||
|
||||
private void verifyCategorizationAnalyzerNotSetIfFiltersAreSet() {
|
||||
if (categorizationAnalyzerConfig != null) {
|
||||
throw ExceptionsHelper.badRequestException(Messages.getMessage(
|
||||
Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_INCOMPATIBLE_WITH_CATEGORIZATION_ANALYZER));
|
||||
}
|
||||
}
|
||||
|
||||
private void verifyCategorizationFieldNameSetIfFiltersAreSet() {
|
||||
if (categorizationFieldName == null) {
|
||||
throw ExceptionsHelper.badRequestException(Messages.getMessage(
|
||||
|
|
|
@ -0,0 +1,621 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.job.config;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.UUIDs;
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.ToXContentFragment;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
|
||||
import org.elasticsearch.xpack.ml.MlParserType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
|
||||
/**
 * Configuration for the categorization analyzer.
 *
 * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}.
 * To summarise, the first option is to specify the name of an out-of-the-box analyzer:
 * <code>
 * "categorization_analyzer" : "standard"
 * </code>
 *
 * The second option is to specify a custom analyzer by combining the <code>char_filters</code>, <code>tokenizer</code>
 * and <code>token_filters</code> fields. In turn, each of these can be specified as the name of an out-of-the-box
 * one or as an object defining a custom one. For example:
 * <code>
 * "char_filters" : [
 *     "html_strip",
 *     { "type" : "pattern_replace", "pattern": "SQL: .*" }
 * ],
 * "tokenizer" : "thai",
 * "token_filters" : [
 *     "lowercase",
 *     { "type" : "pattern_replace", "pattern": "^[0-9].*" }
 * ]
 * </code>
 *
 * Unfortunately there is no easy way to reuse a subset of the <code>_analyze</code> action implementation, so much
 * of the code in this file is copied from {@link TransportAnalyzeAction}. However, the logic required here is
 * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
 * TODO: consider refactoring ES core to allow more reuse.
 */
|
||||
public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable {
|
||||
|
||||
public static final ParseField CATEGORIZATION_ANALYZER = new ParseField("categorization_analyzer");
|
||||
private static final ParseField TOKENIZER = RestAnalyzeAction.Fields.TOKENIZER;
|
||||
private static final ParseField TOKEN_FILTERS = RestAnalyzeAction.Fields.TOKEN_FILTERS;
|
||||
private static final ParseField CHAR_FILTERS = RestAnalyzeAction.Fields.CHAR_FILTERS;
|
||||
|
||||
/**
|
||||
* This method is only used in the unit tests - in production code this config is always parsed as a fragment.
|
||||
*/
|
||||
static CategorizationAnalyzerConfig buildFromXContentObject(XContentParser parser, MlParserType parserType) throws IOException {
|
||||
|
||||
if (parser.nextToken() != XContentParser.Token.START_OBJECT) {
|
||||
throw new IllegalArgumentException("Expected start object but got [" + parser.currentToken() + "]");
|
||||
}
|
||||
if (parser.nextToken() != XContentParser.Token.FIELD_NAME || CATEGORIZATION_ANALYZER.match(parser.currentName()) == false) {
|
||||
throw new IllegalArgumentException("Expected [" + CATEGORIZATION_ANALYZER + "] field but got [" + parser.currentToken() + "]");
|
||||
}
|
||||
parser.nextToken();
|
||||
CategorizationAnalyzerConfig categorizationAnalyzerConfig = buildFromXContentFragment(parser, parserType);
|
||||
parser.nextToken();
|
||||
return categorizationAnalyzerConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a <code>categorization_analyzer</code> from configuration or cluster state. A custom parser is needed
|
||||
* due to the complexity of the format, with many elements able to be specified as either the name of a built-in
|
||||
* element or an object containing a custom definition.
|
||||
*
|
||||
* The parser is strict when parsing config and lenient when parsing cluster state.
|
||||
*/
|
||||
static CategorizationAnalyzerConfig buildFromXContentFragment(XContentParser parser, MlParserType parserType) throws IOException {
|
||||
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
|
||||
|
||||
XContentParser.Token token = parser.currentToken();
|
||||
if (token == XContentParser.Token.VALUE_STRING) {
|
||||
builder.setAnalyzer(parser.text());
|
||||
} else if (token != XContentParser.Token.START_OBJECT) {
|
||||
throw new IllegalArgumentException("[" + CATEGORIZATION_ANALYZER + "] should be analyzer's name or settings [" + token + "]");
|
||||
} else {
|
||||
String currentFieldName = null;
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
|
||||
if (token == XContentParser.Token.FIELD_NAME) {
|
||||
currentFieldName = parser.currentName();
|
||||
} else if (CHAR_FILTERS.match(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
|
||||
if (token == XContentParser.Token.VALUE_STRING) {
|
||||
builder.addCharFilter(parser.text());
|
||||
} else if (token == XContentParser.Token.START_OBJECT) {
|
||||
builder.addCharFilter(parser.map());
|
||||
} else {
|
||||
throw new IllegalArgumentException("[" + currentFieldName + "] in [" + CATEGORIZATION_ANALYZER +
|
||||
"] array element should contain char_filter's name or settings [" + token + "]");
|
||||
}
|
||||
}
|
||||
} else if (TOKENIZER.match(currentFieldName)) {
|
||||
if (token == XContentParser.Token.VALUE_STRING) {
|
||||
builder.setTokenizer(parser.text());
|
||||
} else if (token == XContentParser.Token.START_OBJECT) {
|
||||
builder.setTokenizer(parser.map());
|
||||
} else {
|
||||
throw new IllegalArgumentException("[" + currentFieldName + "] in [" + CATEGORIZATION_ANALYZER +
|
||||
"] should be tokenizer's name or settings [" + token + "]");
|
||||
}
|
||||
} else if (TOKEN_FILTERS.match(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
|
||||
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
|
||||
if (token == XContentParser.Token.VALUE_STRING) {
|
||||
builder.addTokenFilter(parser.text());
|
||||
} else if (token == XContentParser.Token.START_OBJECT) {
|
||||
builder.addTokenFilter(parser.map());
|
||||
} else {
|
||||
throw new IllegalArgumentException("[" + currentFieldName + "] in [" + CATEGORIZATION_ANALYZER +
|
||||
"] array element should contain token_filter's name or settings [" + token + "]");
|
||||
}
|
||||
}
|
||||
// Be lenient when parsing cluster state - assume unknown fields are from future versions
|
||||
} else if (parserType == MlParserType.CONFIG) {
|
||||
throw new IllegalArgumentException("Parameter [" + currentFieldName + "] in [" + CATEGORIZATION_ANALYZER +
|
||||
"] is unknown or of the wrong type [" + token + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a <code>categorization_analyzer</code> that mimics what the tokenizer and filters built into the ML C++
|
||||
* code do. This is the default analyzer for categorization to ensure that people upgrading from previous versions
|
||||
* get the same behaviour from their categorization jobs before and after upgrade.
|
||||
* @param categorizationFilters Categorization filters (if any) from the <code>analysis_config</code>.
|
||||
* @return The default categorization analyzer.
|
||||
*/
|
||||
public static CategorizationAnalyzerConfig buildDefaultCategorizationAnalyzer(List<String> categorizationFilters) {
|
||||
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
|
||||
|
||||
if (categorizationFilters != null) {
|
||||
for (String categorizationFilter : categorizationFilters) {
|
||||
Map<String, Object> charFilter = new HashMap<>();
|
||||
charFilter.put("type", "pattern_replace");
|
||||
charFilter.put("pattern", categorizationFilter);
|
||||
builder.addCharFilter(charFilter);
|
||||
}
|
||||
}
|
||||
|
||||
builder.setTokenizer("ml_classic");
|
||||
|
||||
Map<String, Object> tokenFilter = new HashMap<>();
|
||||
tokenFilter.put("type", "stop");
|
||||
tokenFilter.put("stopwords", Arrays.asList(
|
||||
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
|
||||
"Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
|
||||
"January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
|
||||
"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
|
||||
"GMT", "UTC"));
|
||||
builder.addTokenFilter(tokenFilter);
|
||||
|
||||
return builder.build();
|
||||
}
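// For reference, the default analyzer built above corresponds roughly to the following configuration
// (a sketch using the field names from the class javadoc; one pattern_replace char filter is added per
// categorization filter, if any were supplied):
//   "categorization_analyzer" : {
//     "char_filters" : [ { "type" : "pattern_replace", "pattern" : "<each categorization filter>" } ],
//     "tokenizer" : "ml_classic",
//     "token_filters" : [ { "type" : "stop", "stopwords" : [ "Monday", ..., "GMT", "UTC" ] } ]
//   }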
|
||||
|
||||
/**
|
||||
* Simple store of either a name of a built-in analyzer element or a custom definition.
|
||||
*/
|
||||
public static class NameOrDefinition implements ToXContentFragment, Writeable {
|
||||
|
||||
// Exactly one of these two members is not null
|
||||
public final String name;
|
||||
public final Settings definition;
|
||||
|
||||
NameOrDefinition(String name) {
|
||||
this.name = Objects.requireNonNull(name);
|
||||
this.definition = null;
|
||||
}
|
||||
|
||||
NameOrDefinition(ParseField field, Map<String, Object> definition) {
|
||||
this.name = null;
|
||||
Objects.requireNonNull(definition);
|
||||
try {
|
||||
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
|
||||
builder.map(definition);
|
||||
this.definition = Settings.builder().loadFromSource(builder.string(), builder.contentType()).build();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("Failed to parse [" + definition + "] in [" + field.getPreferredName() + "]", e);
|
||||
}
|
||||
}
|
||||
|
||||
NameOrDefinition(StreamInput in) throws IOException {
|
||||
name = in.readOptionalString();
|
||||
if (in.readBoolean()) {
|
||||
definition = Settings.readSettingsFromStream(in);
|
||||
} else {
|
||||
definition = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeOptionalString(name);
|
||||
boolean isNotNullDefinition = this.definition != null;
|
||||
out.writeBoolean(isNotNullDefinition);
|
||||
if (isNotNullDefinition) {
|
||||
Settings.writeSettingsToStream(definition, out);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
if (definition == null) {
|
||||
builder.value(name);
|
||||
} else {
|
||||
builder.startObject();
|
||||
definition.toXContent(builder, params);
|
||||
builder.endObject();
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
NameOrDefinition that = (NameOrDefinition) o;
|
||||
return Objects.equals(name, that.name) &&
|
||||
Objects.equals(definition, that.definition);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(name, definition);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (definition == null) {
|
||||
return name;
|
||||
} else {
|
||||
return definition.toDelimitedString(';');
|
||||
}
|
||||
}
|
||||
}
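// Sketch of the two flavours a NameOrDefinition can hold, mirroring the constructors above
// (the concrete values are illustrative):
//   new NameOrDefinition("html_strip")                       // name of a built-in element
//   new NameOrDefinition(CHAR_FILTERS, charFilterMap)        // custom definition, where charFilterMap is a
//                                                            // Map such as { "type" : "pattern_replace", "pattern" : "SQL: .*" }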
|
||||
|
||||
private final String analyzer;
|
||||
private final List<NameOrDefinition> charFilters;
|
||||
private final NameOrDefinition tokenizer;
|
||||
private final List<NameOrDefinition> tokenFilters;
|
||||
|
||||
private CategorizationAnalyzerConfig(String analyzer, List<NameOrDefinition> charFilters, NameOrDefinition tokenizer,
|
||||
List<NameOrDefinition> tokenFilters) {
|
||||
this.analyzer = analyzer;
|
||||
this.charFilters = Objects.requireNonNull(charFilters);
|
||||
this.tokenizer = tokenizer;
|
||||
this.tokenFilters = Objects.requireNonNull(tokenFilters);
|
||||
}
|
||||
|
||||
public CategorizationAnalyzerConfig(StreamInput in) throws IOException {
|
||||
analyzer = in.readOptionalString();
|
||||
charFilters = in.readList(NameOrDefinition::new);
|
||||
tokenizer = in.readOptionalWriteable(NameOrDefinition::new);
|
||||
tokenFilters = in.readList(NameOrDefinition::new);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeOptionalString(analyzer);
|
||||
out.writeList(charFilters);
|
||||
out.writeOptionalWriteable(tokenizer);
|
||||
out.writeList(tokenFilters);
|
||||
}
|
||||
|
||||
public String getAnalyzer() {
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
public List<NameOrDefinition> getCharFilters() {
|
||||
return charFilters;
|
||||
}
|
||||
|
||||
public NameOrDefinition getTokenizer() {
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
public List<NameOrDefinition> getTokenFilters() {
|
||||
return tokenFilters;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
if (analyzer != null) {
|
||||
builder.field(CATEGORIZATION_ANALYZER.getPreferredName(), analyzer);
|
||||
} else {
|
||||
builder.startObject(CATEGORIZATION_ANALYZER.getPreferredName());
|
||||
if (charFilters.isEmpty() == false) {
|
||||
builder.startArray(CHAR_FILTERS.getPreferredName());
|
||||
for (NameOrDefinition charFilter : charFilters) {
|
||||
charFilter.toXContent(builder, params);
|
||||
}
|
||||
builder.endArray();
|
||||
}
|
||||
if (tokenizer != null) {
|
||||
builder.field(TOKENIZER.getPreferredName(), tokenizer);
|
||||
}
|
||||
if (tokenFilters.isEmpty() == false) {
|
||||
builder.startArray(TOKEN_FILTERS.getPreferredName());
|
||||
for (NameOrDefinition tokenFilter : tokenFilters) {
|
||||
tokenFilter.toXContent(builder, params);
|
||||
}
|
||||
builder.endArray();
|
||||
}
|
||||
builder.endObject();
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer.
|
||||
* In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of
|
||||
* a newly created custom analyzer the caller is responsible for closing it.
|
||||
* @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
|
||||
* for closing it.
|
||||
*/
|
||||
public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
|
||||
if (analyzer != null) {
|
||||
Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);
|
||||
if (globalAnalyzer == null) {
|
||||
throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]");
|
||||
}
|
||||
return new Tuple<>(globalAnalyzer, Boolean.FALSE);
|
||||
} else {
|
||||
List<CharFilterFactory> charFilterFactoryList =
|
||||
parseCharFilterFactories(analysisRegistry, environment);
|
||||
|
||||
Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry,
|
||||
environment);
|
||||
|
||||
List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry,
|
||||
environment, tokenizerFactory, charFilterFactoryList);
|
||||
|
||||
return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
|
||||
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
|
||||
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
|
||||
}
|
||||
}
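// Illustrative caller-side sketch of the closing contract described in the javadoc above
// (a similar pattern is used by Builder.verify() further down; the variable names are hypothetical):
//   Tuple<Analyzer, Boolean> analyzerAndOwned = config.toAnalyzer(analysisRegistry, environment);
//   try {
//       // ... tokenize the categorization field with analyzerAndOwned.v1() ...
//   } finally {
//       if (analyzerAndOwned.v2()) {
//           analyzerAndOwned.v1().close();
//       }
//   }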
|
||||
|
||||
|
||||
/**
|
||||
* Get char filter factories for each configured char filter. Each configuration
|
||||
* element can be the name of an out-of-the-box char filter, or a custom definition.
|
||||
*/
|
||||
private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry,
|
||||
Environment environment) throws IOException {
|
||||
final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
|
||||
for (NameOrDefinition charFilter : charFilters) {
|
||||
final CharFilterFactory charFilterFactory;
|
||||
if (charFilter.name != null) {
|
||||
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
|
||||
analysisRegistry.getCharFilterProvider(charFilter.name);
|
||||
if (charFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
|
||||
}
|
||||
charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
|
||||
} else {
|
||||
String charFilterTypeName = charFilter.definition.get("type");
|
||||
if (charFilterTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
|
||||
analysisRegistry.getCharFilterProvider(charFilterTypeName);
|
||||
if (charFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
|
||||
}
|
||||
Settings settings = augmentSettings(charFilter.definition);
|
||||
// Need to set anonymous "name" of char_filter
|
||||
charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
|
||||
"_anonymous_charfilter", settings);
|
||||
}
|
||||
if (charFilterFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
|
||||
}
|
||||
charFilterFactoryList.add(charFilterFactory);
|
||||
}
|
||||
return charFilterFactoryList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the tokenizer factory for the configured tokenizer. The configuration
|
||||
* can be the name of an out-of-the-box tokenizer, or a custom definition.
|
||||
*/
|
||||
private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry,
|
||||
Environment environment) throws IOException {
|
||||
final String name;
|
||||
final TokenizerFactory tokenizerFactory;
|
||||
if (tokenizer.name != null) {
|
||||
name = tokenizer.name;
|
||||
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
|
||||
if (tokenizerFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
|
||||
}
|
||||
tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
|
||||
} else {
|
||||
String tokenizerTypeName = tokenizer.definition.get("type");
|
||||
if (tokenizerTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
|
||||
analysisRegistry.getTokenizerProvider(tokenizerTypeName);
|
||||
if (tokenizerFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
|
||||
}
|
||||
Settings settings = augmentSettings(tokenizer.definition);
|
||||
// Need to set anonymous "name" of tokenizer
|
||||
name = "_anonymous_tokenizer";
|
||||
tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
|
||||
}
|
||||
return new Tuple<>(name, tokenizerFactory);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get token filter factories for each configured token filter. Each configuration
|
||||
* element can be the name of an out-of-the-box token filter, or a custom definition.
|
||||
*/
|
||||
private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment,
|
||||
Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||
List<CharFilterFactory> charFilterFactoryList) throws IOException {
|
||||
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||
for (NameOrDefinition tokenFilter : tokenFilters) {
|
||||
TokenFilterFactory tokenFilterFactory;
|
||||
if (tokenFilter.name != null) {
|
||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
|
||||
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
|
||||
if (tokenFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
|
||||
}
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
|
||||
} else {
|
||||
String filterTypeName = tokenFilter.definition.get("type");
|
||||
if (filterTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
|
||||
analysisRegistry.getTokenFilterProvider(filterTypeName);
|
||||
if (tokenFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
|
||||
}
|
||||
Settings settings = augmentSettings(tokenFilter.definition);
|
||||
// Need to set anonymous "name" of token_filter
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment,
|
||||
"_anonymous_tokenfilter", settings);
|
||||
tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(),
|
||||
tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment);
|
||||
}
|
||||
if (tokenFilterFactory == null) {
|
||||
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
|
||||
}
|
||||
tokenFilterFactoryList.add(tokenFilterFactory);
|
||||
}
|
||||
return tokenFilterFactoryList;
|
||||
}
|
||||
|
||||
/**
|
||||
* The Elasticsearch analysis functionality is designed to work with indices. For
|
||||
* categorization we have to pretend we've got some index settings.
|
||||
*/
|
||||
private IndexSettings buildDummyIndexSettings(Settings settings) {
|
||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||
return new IndexSettings(metaData, Settings.EMPTY);
|
||||
}
|
||||
|
||||
/**
|
||||
* The behaviour of Elasticsearch analyzers can vary between versions.
|
||||
* For categorization we'll always use the latest version of the text analysis.
|
||||
* The other settings are just to stop classes that expect to be associated with
|
||||
* an index from complaining.
|
||||
*/
|
||||
private Settings augmentSettings(Settings settings) {
|
||||
return Settings.builder().put(settings)
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
CategorizationAnalyzerConfig that = (CategorizationAnalyzerConfig) o;
|
||||
return Objects.equals(analyzer, that.analyzer) &&
|
||||
Objects.equals(charFilters, that.charFilters) &&
|
||||
Objects.equals(tokenizer, that.tokenizer) &&
|
||||
Objects.equals(tokenFilters, that.tokenFilters);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(analyzer, charFilters, tokenizer, tokenFilters);
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private String analyzer;
|
||||
private List<NameOrDefinition> charFilters = new ArrayList<>();
|
||||
private NameOrDefinition tokenizer;
|
||||
private List<NameOrDefinition> tokenFilters = new ArrayList<>();
|
||||
|
||||
public Builder() {
|
||||
}
|
||||
|
||||
public Builder(CategorizationAnalyzerConfig categorizationAnalyzerConfig) {
|
||||
this.analyzer = categorizationAnalyzerConfig.analyzer;
|
||||
this.charFilters = new ArrayList<>(categorizationAnalyzerConfig.charFilters);
|
||||
this.tokenizer = categorizationAnalyzerConfig.tokenizer;
|
||||
this.tokenFilters = new ArrayList<>(categorizationAnalyzerConfig.tokenFilters);
|
||||
}
|
||||
|
||||
public Builder setAnalyzer(String analyzer) {
|
||||
this.analyzer = analyzer;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addCharFilter(String charFilter) {
|
||||
this.charFilters.add(new NameOrDefinition(charFilter));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addCharFilter(Map<String, Object> charFilter) {
|
||||
this.charFilters.add(new NameOrDefinition(CHAR_FILTERS, charFilter));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTokenizer(String tokenizer) {
|
||||
this.tokenizer = new NameOrDefinition(tokenizer);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTokenizer(Map<String, Object> tokenizer) {
|
||||
this.tokenizer = new NameOrDefinition(TOKENIZER, tokenizer);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addTokenFilter(String tokenFilter) {
|
||||
this.tokenFilters.add(new NameOrDefinition(tokenFilter));
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addTokenFilter(Map<String, Object> tokenFilter) {
|
||||
this.tokenFilters.add(new NameOrDefinition(TOKEN_FILTERS, tokenFilter));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a config validating only structure, not exact analyzer/tokenizer/filter names
|
||||
*/
|
||||
public CategorizationAnalyzerConfig build() {
|
||||
if (analyzer == null && tokenizer == null) {
|
||||
throw new IllegalArgumentException(CATEGORIZATION_ANALYZER + " that is not a global analyzer must specify a ["
|
||||
+ TOKENIZER + "] field");
|
||||
}
|
||||
if (analyzer != null && charFilters.isEmpty() == false) {
|
||||
throw new IllegalArgumentException(CATEGORIZATION_ANALYZER + " that is a global analyzer cannot also specify a ["
|
||||
+ CHAR_FILTERS + "] field");
|
||||
}
|
||||
if (analyzer != null && tokenizer != null) {
|
||||
throw new IllegalArgumentException(CATEGORIZATION_ANALYZER + " that is a global analyzer cannot also specify a ["
|
||||
+ TOKENIZER + "] field");
|
||||
}
|
||||
if (analyzer != null && tokenFilters.isEmpty() == false) {
|
||||
throw new IllegalArgumentException(CATEGORIZATION_ANALYZER + " that is a global analyzer cannot also specify a ["
|
||||
+ TOKEN_FILTERS + "] field");
|
||||
}
|
||||
return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify that the builder will build a valid config. This is not done as part of the basic build
|
||||
* because it verifies that the names of analyzers/tokenizers/filters referenced by the config are
|
||||
* known, and the validity of these names could change over time.
|
||||
*/
|
||||
public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
|
||||
Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment);
|
||||
if (tuple.v2()) {
|
||||
tuple.v1().close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -21,6 +21,8 @@ import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
|
|||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser.Token;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.xpack.ml.MlParserType;
|
||||
import org.elasticsearch.xpack.ml.job.messages.Messages;
|
||||
import org.elasticsearch.xpack.ml.job.persistence.AnomalyDetectorsIndexFields;
|
||||
|
@ -30,6 +32,7 @@ import org.elasticsearch.xpack.ml.utils.time.TimeUtils;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.EnumMap;
|
||||
|
@ -60,7 +63,7 @@ public class Job extends AbstractDiffable<Job> implements Writeable, ToXContentO
|
|||
public static final ParseField JOB_TYPE = new ParseField("job_type");
|
||||
public static final ParseField JOB_VERSION = new ParseField("job_version");
|
||||
public static final ParseField GROUPS = new ParseField("groups");
|
||||
public static final ParseField ANALYSIS_CONFIG = new ParseField("analysis_config");
|
||||
public static final ParseField ANALYSIS_CONFIG = AnalysisConfig.ANALYSIS_CONFIG;
|
||||
public static final ParseField ANALYSIS_LIMITS = new ParseField("analysis_limits");
|
||||
public static final ParseField CREATE_TIME = new ParseField("create_time");
|
||||
public static final ParseField CUSTOM_SETTINGS = new ParseField("custom_settings");
|
||||
|
@ -400,12 +403,12 @@ public class Job extends AbstractDiffable<Job> implements Writeable, ToXContentO
|
|||
}
|
||||
|
||||
/**
|
||||
* Get a list of all input data fields mentioned in the job configuration,
|
||||
* Get all input data fields mentioned in the job configuration,
|
||||
* namely analysis fields and the time field.
|
||||
*
|
||||
* @return the list of fields - never <code>null</code>
|
||||
* @return the collection of fields - never <code>null</code>
|
||||
*/
|
||||
public List<String> allFields() {
|
||||
public Collection<String> allInputFields() {
|
||||
Set<String> allFields = new TreeSet<>();
|
||||
|
||||
// analysis fields
|
||||
|
@ -424,7 +427,10 @@ public class Job extends AbstractDiffable<Job> implements Writeable, ToXContentO
|
|||
// remove empty strings
|
||||
allFields.remove("");
|
||||
|
||||
return new ArrayList<>(allFields);
|
||||
// the categorisation field isn't an input field
|
||||
allFields.remove(AnalysisConfig.ML_CATEGORY_FIELD);
|
||||
|
||||
return allFields;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1077,6 +1083,18 @@ public class Job extends AbstractDiffable<Job> implements Writeable, ToXContentO
|
|||
analysisLimits = new AnalysisLimits(modelMemoryLimit, categorizationExampleLimit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the char filter/tokenizer/token filter names used in the categorization analyzer config (if any).
|
||||
* The overall structure can be validated at parse time, but the exact names need to be checked separately,
|
||||
* as plugins that provide the functionality can be installed/uninstalled.
|
||||
*/
|
||||
public void validateCategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
|
||||
CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
|
||||
if (categorizationAnalyzerConfig != null) {
|
||||
new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig).verify(analysisRegistry, environment);
|
||||
}
|
||||
}
|
||||
|
||||
private void validateGroups() {
|
||||
for (String group : this.groups) {
|
||||
if (MlStrings.isValidId(group) == false) {
|
||||
|
|
|
@ -75,8 +75,13 @@ public final class Messages {
|
|||
"categorization_filters are not allowed to contain empty strings";
|
||||
public static final String JOB_CONFIG_CATEGORIZATION_FILTERS_CONTAINS_INVALID_REGEX =
|
||||
"categorization_filters contains invalid regular expression ''{0}''";
|
||||
public static final String JOB_CONFIG_CATEGORIZATION_FILTERS_INCOMPATIBLE_WITH_CATEGORIZATION_ANALYZER =
|
||||
"categorization_filters cannot be used with categorization_analyzer - " +
|
||||
"instead specify them as pattern_replace char_filters in the analyzer";
|
||||
public static final String JOB_CONFIG_CATEGORIZATION_FILTERS_REQUIRE_CATEGORIZATION_FIELD_NAME =
|
||||
"categorization_filters require setting categorization_field_name";
|
||||
public static final String JOB_CONFIG_CATEGORIZATION_ANALYZER_REQUIRES_CATEGORIZATION_FIELD_NAME =
|
||||
"categorization_analyzer requires setting categorization_field_name";
|
||||
public static final String JOB_CONFIG_CONDITION_INVALID_VALUE_NULL = "Invalid condition: the value field cannot be null";
|
||||
public static final String JOB_CONFIG_CONDITION_INVALID_VALUE_NUMBER =
|
||||
"Invalid condition value: cannot parse a double from string ''{0}''";
|
||||
|
|
|
@ -19,6 +19,12 @@ public interface RecordWriter {
|
|||
*/
|
||||
String CONTROL_FIELD_NAME = ".";
|
||||
|
||||
/**
|
||||
* Value must match api::CBaseTokenListDataTyper::PRETOKENISED_TOKEN_FIELD in the C++
|
||||
* code.
|
||||
*/
|
||||
String PRETOKENISED_TOKEN_FIELD = "...";
|
||||
|
||||
/**
|
||||
* Write each String in the record array
|
||||
*/
|
||||
|
@ -34,4 +40,4 @@ public interface RecordWriter {
|
|||
*/
|
||||
void flush() throws IOException;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,12 +38,15 @@ import org.elasticsearch.env.Environment;
|
|||
import org.elasticsearch.env.NodeEnvironment;
|
||||
import org.elasticsearch.http.HttpServerTransport;
|
||||
import org.elasticsearch.index.IndexModule;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.indices.breaker.CircuitBreakerService;
|
||||
import org.elasticsearch.ingest.Processor;
|
||||
import org.elasticsearch.license.LicenseService;
|
||||
import org.elasticsearch.license.Licensing;
|
||||
import org.elasticsearch.license.XPackLicenseState;
|
||||
import org.elasticsearch.plugins.ActionPlugin;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.plugins.ClusterPlugin;
|
||||
import org.elasticsearch.plugins.DiscoveryPlugin;
|
||||
import org.elasticsearch.plugins.IngestPlugin;
|
||||
|
@ -104,6 +107,7 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
|
@ -113,7 +117,7 @@ import java.util.stream.Collectors;
|
|||
import java.util.stream.Stream;
|
||||
|
||||
public class XPackPlugin extends Plugin implements ScriptPlugin, ActionPlugin, IngestPlugin, NetworkPlugin, ClusterPlugin,
|
||||
DiscoveryPlugin, MapperPlugin {
|
||||
DiscoveryPlugin, MapperPlugin, AnalysisPlugin {
|
||||
|
||||
// TODO: clean up this library to not ask for write access to all system properties!
|
||||
static {
|
||||
|
@ -427,6 +431,13 @@ public class XPackPlugin extends Plugin implements ScriptPlugin, ActionPlugin, I
|
|||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
|
||||
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
|
||||
tokenizers.putAll(machineLearning.getTokenizers());
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
public void onIndexModule(IndexModule module) {
|
||||
security.onIndexModule(module);
|
||||
watcher.onIndexModule(module);
|
||||
|
|
|
@ -36,10 +36,13 @@ import org.elasticsearch.common.xcontent.NamedXContentRegistry;
|
|||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.license.XPackLicenseState;
|
||||
import org.elasticsearch.monitor.os.OsProbe;
|
||||
import org.elasticsearch.monitor.os.OsStats;
|
||||
import org.elasticsearch.plugins.ActionPlugin;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.rest.RestController;
|
||||
import org.elasticsearch.rest.RestHandler;
|
||||
import org.elasticsearch.tasks.Task;
|
||||
|
@ -143,6 +146,8 @@ import org.elasticsearch.xpack.ml.datafeed.DatafeedManager;
|
|||
import org.elasticsearch.xpack.ml.datafeed.DatafeedState;
|
||||
import org.elasticsearch.xpack.ml.job.JobManager;
|
||||
import org.elasticsearch.xpack.ml.job.UpdateJobProcessNotifier;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.MlClassicTokenizer;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.MlClassicTokenizerFactory;
|
||||
import org.elasticsearch.xpack.ml.job.config.JobTaskStatus;
|
||||
import org.elasticsearch.xpack.ml.job.persistence.AnomalyDetectorsIndex;
|
||||
import org.elasticsearch.xpack.ml.job.persistence.ElasticsearchMappings;
|
||||
|
@ -225,13 +230,17 @@ import java.util.function.UnaryOperator;
|
|||
import static java.util.Collections.emptyList;
|
||||
import static org.elasticsearch.xpack.XpackField.MACHINE_LEARNING;
|
||||
|
||||
public class MachineLearning implements MachineLearningClientActionPlugin, ActionPlugin {
|
||||
public class MachineLearning implements MachineLearningClientActionPlugin, ActionPlugin, AnalysisPlugin {
|
||||
public static final String NAME = "ml";
|
||||
public static final String BASE_PATH = "/_xpack/ml/";
|
||||
public static final String DATAFEED_THREAD_POOL_NAME = NAME + "_datafeed";
|
||||
public static final String AUTODETECT_THREAD_POOL_NAME = NAME + "_autodetect";
|
||||
public static final String UTILITY_THREAD_POOL_NAME = NAME + "_utility";
|
||||
|
||||
// This is for performance testing. It's not exposed to the end user.
|
||||
// Recompile if you want to compare performance with C++ tokenization.
|
||||
public static final boolean CATEGORIZATION_TOKENIZATION_IN_JAVA = true;
|
||||
|
||||
public static final Setting<Boolean> AUTODETECT_PROCESS =
|
||||
Setting.boolSetting("xpack.ml.autodetect_process", true, Property.NodeScope);
|
||||
public static final Setting<Boolean> ML_ENABLED =
|
||||
|
@ -390,7 +399,7 @@ public class MachineLearning implements MachineLearningClientActionPlugin, Actio
|
|||
Auditor auditor = new Auditor(client, clusterService);
|
||||
JobProvider jobProvider = new JobProvider(client, settings);
|
||||
UpdateJobProcessNotifier notifier = new UpdateJobProcessNotifier(settings, client, clusterService, threadPool);
|
||||
JobManager jobManager = new JobManager(settings, jobProvider, clusterService, auditor, client, notifier);
|
||||
JobManager jobManager = new JobManager(env, settings, jobProvider, clusterService, auditor, client, notifier);
|
||||
|
||||
JobDataCountsPersister jobDataCountsPersister = new JobDataCountsPersister(settings, client);
|
||||
JobResultsPersister jobResultsPersister = new JobResultsPersister(settings, client);
|
||||
|
@ -420,7 +429,7 @@ public class MachineLearning implements MachineLearningClientActionPlugin, Actio
|
|||
}
|
||||
NormalizerFactory normalizerFactory = new NormalizerFactory(normalizerProcessFactory,
|
||||
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME));
|
||||
AutodetectProcessManager autodetectProcessManager = new AutodetectProcessManager(settings, client, threadPool,
|
||||
AutodetectProcessManager autodetectProcessManager = new AutodetectProcessManager(env, settings, client, threadPool,
|
||||
jobManager, jobProvider, jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
|
||||
normalizerFactory, xContentRegistry, auditor);
|
||||
this.autodetectProcessManager.set(autodetectProcessManager);
|
||||
|
@ -601,6 +610,11 @@ public class MachineLearning implements MachineLearningClientActionPlugin, Actio
|
|||
return Arrays.asList(autoDetect, renormalizer, datafeed);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
|
||||
return Collections.singletonMap(MlClassicTokenizer.NAME, MlClassicTokenizerFactory::new);
|
||||
}
|
||||
|
||||
public UnaryOperator<Map<String, IndexTemplateMetaData>> getIndexTemplateMetaDataUpgrader() {
|
||||
return templates -> {
|
||||
final TimeValue delayedNodeTimeOutSetting;
|
||||
|
|
|
@ -12,6 +12,8 @@ import org.elasticsearch.cluster.service.ClusterService;
|
|||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.threadpool.ThreadPool;
|
||||
import org.elasticsearch.transport.TransportService;
|
||||
import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcessManager;
|
||||
|
@ -23,14 +25,17 @@ import java.util.Optional;
|
|||
|
||||
public class TransportPostDataAction extends TransportJobTaskAction<PostDataAction.Request, PostDataAction.Response> {
|
||||
|
||||
private final AnalysisRegistry analysisRegistry;
|
||||
|
||||
@Inject
|
||||
public TransportPostDataAction(Settings settings, TransportService transportService, ThreadPool threadPool,
|
||||
ClusterService clusterService, ActionFilters actionFilters,
|
||||
IndexNameExpressionResolver indexNameExpressionResolver,
|
||||
AutodetectProcessManager processManager) {
|
||||
IndexNameExpressionResolver indexNameExpressionResolver, AutodetectProcessManager processManager,
|
||||
AnalysisRegistry analysisRegistry) {
|
||||
super(settings, PostDataAction.NAME, threadPool, clusterService, transportService, actionFilters, indexNameExpressionResolver,
|
||||
PostDataAction.Request::new, PostDataAction.Response::new, ThreadPool.Names.SAME, processManager);
|
||||
// ThreadPool.Names.SAME, because the operation is executed by the autodetect worker thread
|
||||
this.analysisRegistry = analysisRegistry;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -46,7 +51,8 @@ public class TransportPostDataAction extends TransportJobTaskAction<PostDataActi
|
|||
TimeRange timeRange = TimeRange.builder().startTime(request.getResetStart()).endTime(request.getResetEnd()).build();
|
||||
DataLoadParams params = new DataLoadParams(timeRange, Optional.ofNullable(request.getDataDescription()));
|
||||
try {
|
||||
processManager.processData(task, request.getContent().streamInput(), request.getXContentType(), params, (dataCounts, e) -> {
|
||||
processManager.processData(task, analysisRegistry, request.getContent().streamInput(), request.getXContentType(),
|
||||
params, (dataCounts, e) -> {
|
||||
if (dataCounts != null) {
|
||||
listener.onResponse(new PostDataAction.Response(dataCounts));
|
||||
} else {
|
||||
|
|
|
@@ -15,6 +15,8 @@ import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.license.LicenseUtils;
import org.elasticsearch.license.XPackLicenseState;
import org.elasticsearch.tasks.Task;

@@ -27,15 +29,18 @@ public class TransportPutJobAction extends TransportMasterNodeAction<PutJobActio

private final JobManager jobManager;
private final XPackLicenseState licenseState;
private final AnalysisRegistry analysisRegistry;

@Inject
public TransportPutJobAction(Settings settings, TransportService transportService, ClusterService clusterService,
ThreadPool threadPool, XPackLicenseState licenseState, ActionFilters actionFilters,
IndexNameExpressionResolver indexNameExpressionResolver, JobManager jobManager) {
IndexNameExpressionResolver indexNameExpressionResolver, JobManager jobManager,
AnalysisRegistry analysisRegistry) {
super(settings, PutJobAction.NAME, transportService, clusterService, threadPool, actionFilters,
indexNameExpressionResolver, PutJobAction.Request::new);
this.licenseState = licenseState;
this.jobManager = jobManager;
this.analysisRegistry = analysisRegistry;
}

@Override

@@ -51,7 +56,7 @@ public class TransportPutJobAction extends TransportMasterNodeAction<PutJobActio
@Override
protected void masterOperation(PutJobAction.Request request, ClusterState state,
ActionListener<PutJobAction.Response> listener) throws Exception {
jobManager.putJob(request, state, listener);
jobManager.putJob(request, analysisRegistry, state, listener);
}

@Override
@@ -9,7 +9,6 @@ import org.elasticsearch.action.fieldcaps.FieldCapabilities;
import org.elasticsearch.action.fieldcaps.FieldCapabilitiesResponse;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.xpack.ml.datafeed.DatafeedConfig;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.Job;
import org.elasticsearch.xpack.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.utils.MlStrings;

@@ -94,8 +93,7 @@ class ExtractedFields {
}
ExtractedField timeExtractedField = ExtractedField.newTimeField(timeField, scriptFields.contains(timeField) ?
ExtractedField.ExtractionMethod.SCRIPT_FIELD : ExtractedField.ExtractionMethod.DOC_VALUE);
List<String> remainingFields = job.allFields().stream().filter(
f -> !(f.equals(timeField) || f.equals(AnalysisConfig.ML_CATEGORY_FIELD))).collect(Collectors.toList());
List<String> remainingFields = job.allInputFields().stream().filter(f -> !f.equals(timeField)).collect(Collectors.toList());
List<ExtractedField> allExtractedFields = new ArrayList<>(remainingFields.size() + 1);
allExtractedFields.add(timeExtractedField);
remainingFields.stream().forEach(field -> allExtractedFields.add(extractionMethodDetector.detect(field)));
@@ -73,7 +73,8 @@ public class ScrollDataExtractorFactory implements DataExtractorFactory {
fieldCapabilitiesRequest.indices(datafeed.getIndices().toArray(new String[datafeed.getIndices().size()]));
// We need capabilities for all fields matching the requested fields' parents so that we can work around
// multi-fields that are not in source.
String[] requestFields = job.allFields().stream().map(f -> MlStrings.getParentField(f) + "*").toArray(size -> new String[size]);
String[] requestFields = job.allInputFields().stream().map(f -> MlStrings.getParentField(f) + "*")
.toArray(size -> new String[size]);
fieldCapabilitiesRequest.fields(requestFields);
MlClientHelper.<FieldCapabilitiesResponse>execute(datafeed, client, () -> {
client.execute(FieldCapabilitiesAction.INSTANCE, fieldCapabilitiesRequest, fieldCapabilitiesHandler);
@@ -24,6 +24,8 @@ import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.xpack.ml.MLMetadataField;
import org.elasticsearch.xpack.ml.MachineLearningClientActionPlugin;
import org.elasticsearch.xpack.ml.MlMetadata;

@@ -69,6 +71,7 @@ public class JobManager extends AbstractComponent {
private static final DeprecationLogger DEPRECATION_LOGGER =
new DeprecationLogger(Loggers.getLogger(JobManager.class));

private final Environment environment;
private final JobProvider jobProvider;
private final ClusterService clusterService;
private final Auditor auditor;

@@ -80,9 +83,10 @@ public class JobManager extends AbstractComponent {
/**
* Create a JobManager
*/
public JobManager(Settings settings, JobProvider jobProvider, ClusterService clusterService, Auditor auditor, Client client,
UpdateJobProcessNotifier updateJobProcessNotifier) {
public JobManager(Environment environment, Settings settings, JobProvider jobProvider, ClusterService clusterService, Auditor auditor,
Client client, UpdateJobProcessNotifier updateJobProcessNotifier) {
super(settings);
this.environment = environment;
this.jobProvider = Objects.requireNonNull(jobProvider);
this.clusterService = Objects.requireNonNull(clusterService);
this.auditor = Objects.requireNonNull(auditor);

@@ -157,17 +161,19 @@ public class JobManager extends AbstractComponent {
/**
* Stores a job in the cluster state
*/
public void putJob(PutJobAction.Request request, ClusterState state, ActionListener<PutJobAction.Response> actionListener) {
public void putJob(PutJobAction.Request request, AnalysisRegistry analysisRegistry, ClusterState state,
ActionListener<PutJobAction.Response> actionListener) throws IOException {
// In 6.1 we want to make the model memory size limit more prominent, and also reduce the default from
// 4GB to 1GB. However, changing the meaning of a null model memory limit for existing jobs would be a
// breaking change, so instead we add an explicit limit to newly created jobs that didn't have one when
// submitted
request.getJobBuilder().validateModelMemoryLimit(maxModelMemoryLimit);

request.getJobBuilder().validateCategorizationAnalyzer(analysisRegistry, environment);

Job job = request.getJobBuilder().build(new Date());
if (job.getDataDescription() != null && job.getDataDescription().getFormat() == DataDescription.DataFormat.DELIMITED) {
DEPRECATION_LOGGER.deprecated("Creating jobs with delimited data format is deprecated. Please use JSON instead.");
DEPRECATION_LOGGER.deprecated("Creating jobs with delimited data format is deprecated. Please use xcontent instead.");
}

MlMetadata currentMlMetadata = state.metaData().custom(MLMetadataField.TYPE);
@@ -0,0 +1,72 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


/**
* The categorization analyzer.
*
* Converts messages to lists of tokens that will be fed to the ML categorization algorithm.
*/
public class CategorizationAnalyzer implements Closeable {

private final Analyzer analyzer;
private final boolean closeAnalyzer;

public CategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment,
CategorizationAnalyzerConfig categorizationAnalyzerConfig) throws IOException {

Tuple<Analyzer, Boolean> tuple = categorizationAnalyzerConfig.toAnalyzer(analysisRegistry, environment);
analyzer = tuple.v1();
closeAnalyzer = tuple.v2();
}

/**
* Release resources held by the analyzer (unless it's global).
*/
@Override
public void close() {
if (closeAnalyzer) {
analyzer.close();
}
}

/**
* Given a field value, convert it to a list of tokens using the configured analyzer.
*/
public List<String> tokenizeField(String fieldName, String fieldValue) {
List<String> tokens = new ArrayList<>();
try (TokenStream stream = analyzer.tokenStream(fieldName, fieldValue)) {
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
String token = term.toString();
// Ignore empty tokens for categorization
if (token.isEmpty() == false) {
tokens.add(term.toString());
}
}
stream.end();
} catch (IOException e) {
throw new ElasticsearchException("Failed to analyze value [" + fieldValue + "] of field [" + fieldName + "]", e);
}
return tokens;
}
}
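For orientation, a sketch of how a caller could drive this new class. The analysisRegistry and environment variables are assumed to be the node's AnalysisRegistry and Environment, java.util.Collections is assumed to be imported, and the token list in the comment is illustrative rather than output captured from a real run.

CategorizationAnalyzerConfig defaultConfig =
        CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Collections.emptyList());
try (CategorizationAnalyzer categorizationAnalyzer =
        new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {
    // With the default (classic) tokenization this would be roughly [Node, node-1, stopped, unexpectedly]
    List<String> tokens = categorizationAnalyzer.tokenizeField("message", "Node node-1 stopped unexpectedly");
}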
@@ -0,0 +1,120 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;


/**
* Java port of the classic ML categorization tokenizer, as implemented in the ML C++ code.
*
* In common with the original ML C++ code, there are no configuration options.
*/
public class MlClassicTokenizer extends Tokenizer {

public static String NAME = "ml_classic";

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

private int nextOffset;
private int skippedPositions;

MlClassicTokenizer() {
}

/**
* Basically tokenise into [a-zA-Z0-9]+ strings, but also allowing underscores, dots and dashes in the middle.
* Then discard tokens that are hex numbers or begin with a digit.
*/
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
skippedPositions = 0;

int start = -1;
int length = 0;

boolean haveNonHex = false;
int curChar;
while ((curChar = input.read()) >= 0) {
++nextOffset;
if (Character.isLetterOrDigit(curChar) || (length > 0 && (curChar == '_' || curChar == '.' || curChar == '-'))) {
if (length == 0) {
// We're at the first character of a candidate token, so record the offset
start = nextOffset - 1;
}
termAtt.append((char) curChar);
++length;

// We don't return tokens that are hex numbers, and it's most efficient to keep a running note of this
haveNonHex = haveNonHex ||
// Count dots and dashes as numeric
(Character.digit(curChar, 16) == -1 && curChar != '.' && curChar != '-');
} else if (length > 0) {
// If we get here, we've found a separator character having built up a candidate token

if (haveNonHex && Character.isDigit(termAtt.charAt(0)) == false) {
// The candidate token is valid to return
break;
}

// The candidate token is not valid to return, i.e. it's hex or begins with a digit, so wipe it and carry on searching
++skippedPositions;
start = -1;
length = 0;
termAtt.setEmpty();
}
}

// We need to recheck whether we've got a valid token after the loop because
// the loop can also be exited on reaching the end of the stream
if (length == 0) {
return false;
}

if (haveNonHex == false || Character.isDigit(termAtt.charAt(0))) {
++skippedPositions;
return false;
}

// Strip dots, dashes and underscores at the end of the token
char toCheck;
while ((toCheck = termAtt.charAt(length - 1)) == '_' || toCheck == '.' || toCheck == '-') {
--length;
}

// Characters that may exist in the term attribute beyond its defined length are ignored
termAtt.setLength(length);
offsetAtt.setOffset(start, start + length);
posIncrAtt.setPositionIncrement(skippedPositions + 1);

return true;
}

@Override
public final void end() throws IOException {
super.end();
// Set final offset
int finalOffset = nextOffset + (int) input.skip(Integer.MAX_VALUE) - 1;
offsetAtt.setOffset(finalOffset, finalOffset);
// Adjust any skipped tokens
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}

@Override
public void reset() throws IOException {
super.reset();
nextOffset = 0;
skippedPositions = 0;
}
}
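A hypothetical same-package test sketch (the constructor is package-private) to illustrate the tokenization rules documented above. It assumes the usual Lucene and JUnit imports plus java.io.StringReader, and the expected token list follows from the rules rather than from a captured run.

public void testTokenizeSampleLine() throws IOException {
    try (Tokenizer tokenizer = new MlClassicTokenizer()) {
        tokenizer.setReader(new StringReader("Failed to send to host10.example.com at 0xdeadbeef, code 42"));
        tokenizer.reset();
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        List<String> tokens = new ArrayList<>();
        while (tokenizer.incrementToken()) {
            tokens.add(term.toString());
        }
        tokenizer.end();
        // "0xdeadbeef" and "42" are discarded because they begin with a digit; a pure hex run like
        // "deadbeef" would also be dropped, while dots survive in the middle of "host10.example.com".
        assertEquals(Arrays.asList("Failed", "to", "send", "to", "host10.example.com", "at", "code"), tokens);
    }
}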
@@ -0,0 +1,30 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;


/**
* Factory for the classic ML categorization tokenizer, as implemented in the ML C++ code.
*
* In common with the original ML C++ code, there are no configuration options.
*/
public class MlClassicTokenizerFactory extends AbstractTokenizerFactory {

public MlClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public Tokenizer create() {
return new MlClassicTokenizer();
}
}
@@ -14,7 +14,13 @@ import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.calendars.ScheduledEvent;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.ml.job.config.DataDescription;
import org.elasticsearch.xpack.ml.job.config.DetectionRule;
import org.elasticsearch.xpack.ml.job.config.Job;

@@ -60,6 +66,7 @@ public class AutodetectCommunicator implements Closeable {
private static final Duration FLUSH_PROCESS_CHECK_FREQUENCY = Duration.ofSeconds(1);

private final Job job;
private final Environment environment;
private final AutodetectProcess autodetectProcess;
private final StateStreamer stateStreamer;
private final DataCountsReporter dataCountsReporter;

@@ -67,13 +74,16 @@ public class AutodetectCommunicator implements Closeable {
private final Consumer<Exception> onFinishHandler;
private final ExecutorService autodetectWorkerExecutor;
private final NamedXContentRegistry xContentRegistry;
private final boolean includeTokensField;
private volatile CategorizationAnalyzer categorizationAnalyzer;
private volatile boolean processKilled;

AutodetectCommunicator(Job job, AutodetectProcess process, StateStreamer stateStreamer,
AutodetectCommunicator(Job job, Environment environment, AutodetectProcess process, StateStreamer stateStreamer,
DataCountsReporter dataCountsReporter, AutoDetectResultProcessor autoDetectResultProcessor,
Consumer<Exception> onFinishHandler, NamedXContentRegistry xContentRegistry,
ExecutorService autodetectWorkerExecutor) {
this.job = job;
this.environment = environment;
this.autodetectProcess = process;
this.stateStreamer = stateStreamer;
this.dataCountsReporter = dataCountsReporter;

@@ -81,6 +91,8 @@ public class AutodetectCommunicator implements Closeable {
this.onFinishHandler = onFinishHandler;
this.xContentRegistry = xContentRegistry;
this.autodetectWorkerExecutor = autodetectWorkerExecutor;
this.includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
&& job.getAnalysisConfig().getCategorizationFieldName() != null;
}

public void init(ModelSnapshot modelSnapshot) throws IOException {

@@ -89,12 +101,12 @@ public class AutodetectCommunicator implements Closeable {
}

private DataToProcessWriter createProcessWriter(Optional<DataDescription> dataDescription) {
return DataToProcessWriterFactory.create(true, autodetectProcess,
return DataToProcessWriterFactory.create(true, includeTokensField, autodetectProcess,
dataDescription.orElse(job.getDataDescription()), job.getAnalysisConfig(),
dataCountsReporter, xContentRegistry);
}

public void writeToJob(InputStream inputStream, XContentType xContentType,
public void writeToJob(InputStream inputStream, AnalysisRegistry analysisRegistry, XContentType xContentType,
DataLoadParams params, BiConsumer<DataCounts, Exception> handler) {
submitOperation(() -> {
if (params.isResettingBuckets()) {

@@ -104,10 +116,14 @@ public class AutodetectCommunicator implements Closeable {
CountingInputStream countingStream = new CountingInputStream(inputStream, dataCountsReporter);
DataToProcessWriter autoDetectWriter = createProcessWriter(params.getDataDescription());

if (includeTokensField && categorizationAnalyzer == null) {
createCategorizationAnalyzer(analysisRegistry);
}

CountDownLatch latch = new CountDownLatch(1);
AtomicReference<DataCounts> dataCountsAtomicReference = new AtomicReference<>();
AtomicReference<Exception> exceptionAtomicReference = new AtomicReference<>();
autoDetectWriter.write(countingStream, xContentType, (dataCounts, e) -> {
autoDetectWriter.write(countingStream, categorizationAnalyzer, xContentType, (dataCounts, e) -> {
dataCountsAtomicReference.set(dataCounts);
exceptionAtomicReference.set(e);
latch.countDown();

@@ -165,6 +181,8 @@ public class AutodetectCommunicator implements Closeable {
} else {
throw new ElasticsearchException(e);
}
} finally {
destroyCategorizationAnalyzer();
}
}

@@ -186,6 +204,7 @@ public class AutodetectCommunicator implements Closeable {
if (finish) {
onFinishHandler.accept(null);
}
destroyCategorizationAnalyzer();
}
}

@@ -316,6 +335,19 @@ public class AutodetectCommunicator implements Closeable {
return dataCountsReporter.runningTotalStats();
}

/**
* Care must be taken to ensure this method is not called while data is being posted.
* The methods in this class that call it wait for all processing to complete first.
* The expectation is that external calls are only made when cleaning up after a fatal
* error.
*/
void destroyCategorizationAnalyzer() {
if (categorizationAnalyzer != null) {
categorizationAnalyzer.close();
categorizationAnalyzer = null;
}
}

private <T> void submitOperation(CheckedSupplier<T, Exception> operation, BiConsumer<T, Exception> handler) {
autodetectWorkerExecutor.execute(new AbstractRunnable() {
@Override

@@ -339,4 +371,14 @@ public class AutodetectCommunicator implements Closeable {
}
});
}

private void createCategorizationAnalyzer(AnalysisRegistry analysisRegistry) throws IOException {
AnalysisConfig analysisConfig = job.getAnalysisConfig();
CategorizationAnalyzerConfig categorizationAnalyzerConfig = analysisConfig.getCategorizationAnalyzerConfig();
if (categorizationAnalyzerConfig == null) {
categorizationAnalyzerConfig =
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(analysisConfig.getCategorizationFilters());
}
categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, categorizationAnalyzerConfig);
}
}
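For orientation, a hypothetical call sequence showing the analyzer's lifecycle in this class; the variable names are assumed, and in practice these calls are made by AutodetectProcessManager rather than directly.

// The analyzer is built lazily on the first write that needs tokenization and reused afterwards.
communicator.writeToJob(firstStream, analysisRegistry, XContentType.JSON, params, handler);   // creates it
communicator.writeToJob(secondStream, analysisRegistry, XContentType.JSON, params, handler);  // reuses it
communicator.close();   // destroyCategorizationAnalyzer() runs once outstanding processing has finished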
@@ -20,6 +20,8 @@ import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xpack.ml.MachineLearning;

@@ -95,6 +97,7 @@ public class AutodetectProcessManager extends AbstractComponent {
Setting.intSetting("xpack.ml.max_open_jobs", MAX_RUNNING_JOBS_PER_NODE, 1, Property.NodeScope);

private final Client client;
private final Environment environment;
private final ThreadPool threadPool;
private final JobManager jobManager;
private final JobProvider jobProvider;

@@ -112,12 +115,13 @@ public class AutodetectProcessManager extends AbstractComponent {

private final Auditor auditor;

public AutodetectProcessManager(Settings settings, Client client, ThreadPool threadPool,
public AutodetectProcessManager(Environment environment, Settings settings, Client client, ThreadPool threadPool,
JobManager jobManager, JobProvider jobProvider, JobResultsPersister jobResultsPersister,
JobDataCountsPersister jobDataCountsPersister,
AutodetectProcessFactory autodetectProcessFactory, NormalizerFactory normalizerFactory,
NamedXContentRegistry xContentRegistry, Auditor auditor) {
super(settings);
this.environment = environment;
this.client = client;
this.threadPool = threadPool;
this.xContentRegistry = xContentRegistry;

@@ -179,19 +183,20 @@ public class AutodetectProcessManager extends AbstractComponent {
* <li>If a high proportion of the records chronologically out of order</li>
* </ol>
*
* @param jobTask The job task
* @param input Data input stream
* @param xContentType the {@link XContentType} of the input
* @param params Data processing parameters
* @param handler Delegate error or datacount results (Count of records, fields, bytes, etc written)
* @param jobTask The job task
* @param analysisRegistry Registry of analyzer components - this is used to build a categorization analyzer if necessary
* @param input Data input stream
* @param xContentType the {@link XContentType} of the input
* @param params Data processing parameters
* @param handler Delegate error or datacount results (Count of records, fields, bytes, etc written)
*/
public void processData(JobTask jobTask, InputStream input, XContentType xContentType,
DataLoadParams params, BiConsumer<DataCounts, Exception> handler) {
public void processData(JobTask jobTask, AnalysisRegistry analysisRegistry, InputStream input,
XContentType xContentType, DataLoadParams params, BiConsumer<DataCounts, Exception> handler) {
AutodetectCommunicator communicator = getOpenAutodetectCommunicator(jobTask);
if (communicator == null) {
throw ExceptionsHelper.conflictStatusException("Cannot process data because job [" + jobTask.getJobId() + "] is not open");
}
communicator.writeToJob(input, xContentType, params, handler);
communicator.writeToJob(input, analysisRegistry, xContentType, params, handler);
}

/**

@@ -411,7 +416,7 @@ public class AutodetectProcessManager extends AbstractComponent {
}
throw e;
}
return new AutodetectCommunicator(job, process, new StateStreamer(client), dataCountsReporter, processor, handler,
return new AutodetectCommunicator(job, environment, process, new StateStreamer(client), dataCountsReporter, processor, handler,
xContentRegistry, autodetectWorkerExecutor);

}

@@ -441,7 +446,13 @@ public class AutodetectProcessManager extends AbstractComponent {

private Runnable onProcessCrash(JobTask jobTask) {
return () -> {
processByAllocation.remove(jobTask.getAllocationId());
ProcessContext processContext = processByAllocation.remove(jobTask.getAllocationId());
if (processContext != null) {
AutodetectCommunicator communicator = processContext.getAutodetectCommunicator();
if (communicator != null) {
communicator.destroyCategorizationAnalyzer();
}
}
setJobState(jobTask, JobState.FAILED);
};
}
@@ -57,7 +57,7 @@ class NativeAutodetectProcess implements AutodetectProcess {
private final OutputStream processRestoreStream;
private final LengthEncodedWriter recordWriter;
private final ZonedDateTime startTime;
private final int numberOfAnalysisFields;
private final int numberOfFields;
private final List<Path> filesToDelete;
private final Runnable onProcessCrash;
private volatile Future<?> logTailFuture;

@@ -67,8 +67,8 @@ class NativeAutodetectProcess implements AutodetectProcess {
private volatile boolean isReady;
private final AutodetectResultsParser resultsParser;

NativeAutodetectProcess(String jobId, InputStream logStream, OutputStream processInStream, InputStream processOutStream,
OutputStream processRestoreStream, int numberOfAnalysisFields, List<Path> filesToDelete,
NativeAutodetectProcess(String jobId, InputStream logStream, OutputStream processInStream, InputStream processOutStream,
OutputStream processRestoreStream, int numberOfFields, List<Path> filesToDelete,
AutodetectResultsParser resultsParser, Runnable onProcessCrash) {
this.jobId = jobId;
cppLogHandler = new CppLogMessageHandler(jobId, logStream);

@@ -77,7 +77,7 @@ class NativeAutodetectProcess implements AutodetectProcess {
this.processRestoreStream = processRestoreStream;
this.recordWriter = new LengthEncodedWriter(this.processInStream);
startTime = ZonedDateTime.now();
this.numberOfAnalysisFields = numberOfAnalysisFields;
this.numberOfFields = numberOfFields;
this.filesToDelete = filesToDelete;
this.resultsParser = resultsParser;
this.onProcessCrash = Objects.requireNonNull(onProcessCrash);

@@ -143,32 +143,32 @@ class NativeAutodetectProcess implements AutodetectProcess {

@Override
public void writeResetBucketsControlMessage(DataLoadParams params) throws IOException {
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfAnalysisFields);
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfFields);
writer.writeResetBucketsMessage(params);
}

@Override
public void writeUpdateModelPlotMessage(ModelPlotConfig modelPlotConfig) throws IOException {
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfAnalysisFields);
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfFields);
writer.writeUpdateModelPlotMessage(modelPlotConfig);
}

@Override
public void writeUpdateDetectorRulesMessage(int detectorIndex, List<DetectionRule> rules) throws IOException {
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfAnalysisFields);
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfFields);
writer.writeUpdateDetectorRulesMessage(detectorIndex, rules);
}

@Override
public String flushJob(FlushJobParams params) throws IOException {
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfAnalysisFields);
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfFields);
writer.writeFlushControlMessage(params);
return writer.writeFlushMessage();
}

@Override
public void forecastJob(ForecastParams params) throws IOException {
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfAnalysisFields);
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(recordWriter, numberOfFields);
writer.writeForecastMessage(params);
}
@@ -12,6 +12,7 @@ import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.env.Environment;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.job.config.Job;
import org.elasticsearch.xpack.ml.job.process.NativeController;
import org.elasticsearch.xpack.ml.job.process.ProcessCtrl;

@@ -58,13 +59,16 @@ public class NativeAutodetectProcessFactory implements AutodetectProcessFactory
ProcessPipes processPipes = new ProcessPipes(env, NAMED_PIPE_HELPER, ProcessCtrl.AUTODETECT, job.getId(),
true, false, true, true, params.modelSnapshot() != null, !ProcessCtrl.DONT_PERSIST_MODEL_STATE_SETTING.get(settings));
createNativeProcess(job, params, processPipes, filesToDelete);
int numberOfAnalysisFields = job.getAnalysisConfig().analysisFields().size();
boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
&& job.getAnalysisConfig().getCategorizationFieldName() != null;
// The extra 1 is the control field
int numberOfFields = job.allInputFields().size() + (includeTokensField ? 1 : 0) + 1;

StateProcessor stateProcessor = new StateProcessor(settings, client);
AutodetectResultsParser resultsParser = new AutodetectResultsParser(settings);
NativeAutodetectProcess autodetect = new NativeAutodetectProcess(
job.getId(), processPipes.getLogStream().get(), processPipes.getProcessInStream().get(),
processPipes.getProcessOutStream().get(), processPipes.getRestoreStream().orElse(null), numberOfAnalysisFields,
processPipes.getProcessOutStream().get(), processPipes.getRestoreStream().orElse(null), numberOfFields,
filesToDelete, resultsParser, onProcessCrash);
try {
autodetect.start(executorService, stateProcessor, processPipes.getPersistStream().get());
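To make the new field count concrete, a worked example under the assumption that allInputFields() contains the time field plus every field actually read from the input, but not the derived mlcategory field:

// Illustrative only: a job with time field "time", categorization field "message" and a
// "count by mlcategory" detector reads two input fields, so with Java tokenization enabled
// the native process expects four fields per record.
int numberOfFields = 2 /* time, message */ + 1 /* pre-tokenised tokens */ + 1 /* control */; // == 4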
@@ -7,10 +7,15 @@ package org.elasticsearch.xpack.ml.job.process.autodetect.writer;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.DataDescription;
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;
import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcess;
import org.supercsv.encoder.CsvEncoder;
import org.supercsv.encoder.DefaultCsvEncoder;
import org.supercsv.prefs.CsvPreference;
import org.supercsv.util.CsvContext;

import java.io.IOException;
import java.io.InputStream;

@@ -32,6 +37,7 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
private static final long MS_IN_SECOND = 1000;

private final boolean includeControlField;
private final boolean includeTokensField;

protected final AutodetectProcess autodetectProcess;
protected final DataDescription dataDescription;

@@ -49,10 +55,11 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
private long latestEpochMs;
private long latestEpochMsThisUpload;

protected AbstractDataToProcessWriter(boolean includeControlField, AutodetectProcess autodetectProcess,
protected AbstractDataToProcessWriter(boolean includeControlField, boolean includeTokensField, AutodetectProcess autodetectProcess,
DataDescription dataDescription, AnalysisConfig analysisConfig,
DataCountsReporter dataCountsReporter, Logger logger) {
this.includeControlField = includeControlField;
this.includeTokensField = includeTokensField;
this.autodetectProcess = Objects.requireNonNull(autodetectProcess);
this.dataDescription = Objects.requireNonNull(dataDescription);
this.analysisConfig = Objects.requireNonNull(analysisConfig);

@@ -77,18 +84,19 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter

/**
* Set up the field index mappings. This must be called before
* {@linkplain DataToProcessWriter#write(InputStream, XContentType, BiConsumer)}
* {@linkplain DataToProcessWriter#write(InputStream, CategorizationAnalyzer, XContentType, BiConsumer)}
* <p>
* Finds the required input indexes in the <code>header</code> and sets the
* mappings to the corresponding output indexes.
*/
void buildFieldIndexMapping(String[] header) throws IOException {
void buildFieldIndexMapping(String[] header) {
Collection<String> inputFields = inputFields();
inFieldIndexes = inputFieldIndexes(header, inputFields);
checkForMissingFields(inputFields, inFieldIndexes, header);

inputOutputMap = createInputOutputMap(inFieldIndexes);
dataCountsReporter.setAnalysedFieldsPerRecord(analysisConfig.analysisFields().size());
// The time field doesn't count
dataCountsReporter.setAnalysedFieldsPerRecord(inputFields().size() - 1);
}

/**

@@ -112,13 +120,45 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
}

/**
* Transform the input data and write to length encoded writer.<br>
* Tokenize the field that has been configured for categorization, and store the resulting list of tokens in CSV
* format in the appropriate field of the record to be sent to the analytics.
* @param categorizationAnalyzer The analyzer to use to convert the categorization field to a list of tokens
* @param categorizationFieldValue The value of the categorization field to be tokenized
* @param record The record to be sent to the analytics
*/
protected void tokenizeForCategorization(CategorizationAnalyzer categorizationAnalyzer, String categorizationFieldValue,
String[] record) {
assert includeTokensField;
// -2 because last field is the control field, and last but one is the pre-tokenized tokens field
record[record.length - 2] = tokenizeForCategorization(categorizationAnalyzer, analysisConfig.getCategorizationFieldName(),
categorizationFieldValue);
}

/**
* Accessible for testing only.
*/
static String tokenizeForCategorization(CategorizationAnalyzer categorizationAnalyzer, String categorizationFieldName,
String categorizationFieldValue) {
StringBuilder builder = new StringBuilder();
CsvContext context = new CsvContext(0, 0, 0);
// Using the CsvEncoder directly is faster than using a CsvLineWriter with end-of-line set to the empty string
CsvEncoder encoder = new DefaultCsvEncoder();
boolean first = true;
for (String token : categorizationAnalyzer.tokenizeField(categorizationFieldName, categorizationFieldValue)) {
if (first) {
first = false;
} else {
builder.appendCodePoint(CsvPreference.STANDARD_PREFERENCE.getDelimiterChar());
}
builder.append(encoder.encode(token, context, CsvPreference.STANDARD_PREFERENCE));
}
return builder.toString();
}

/**
* Transform the date in the input data and write all fields to the length encoded writer.
* <p>
* Fields that aren't transformed i.e. those in inputOutputMap must be
* copied from input to output before this function is called.
* <p>
* First all the transforms whose outputs the Date transform relies
* on are executed then the date transform then the remaining transforms.
* Fields must be copied from input to output before this function is called.
*
* @param record The record that will be written to the length encoded writer after the time has been transformed.
* This should be the same size as the number of output (analysis fields) i.e.

@@ -171,7 +211,7 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
final Collection<String> inputFields() {
Set<String> requiredFields = analysisConfig.analysisFields();
requiredFields.add(dataDescription.getTimeField());

requiredFields.remove(AnalysisConfig.ML_CATEGORY_FIELD);
return requiredFields;
}

@@ -181,7 +221,7 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
protected final Map<String, Integer> inputFieldIndexes(String[] header, Collection<String> inputFields) {
List<String> headerList = Arrays.asList(header); // TODO header could be empty

Map<String, Integer> fieldIndexes = new HashMap<String, Integer>();
Map<String, Integer> fieldIndexes = new HashMap<>();

for (String field : inputFields) {
int index = headerList.indexOf(field);

@@ -211,12 +251,19 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter

int index = TIME_FIELD_OUT_INDEX + 1;
for (String field : analysisConfig.analysisFields()) {
fieldIndexes.put(field, index++);
if (AnalysisConfig.ML_CATEGORY_FIELD.equals(field) == false) {
fieldIndexes.put(field, index++);
}
}

// field for categorization tokens
if (includeTokensField) {
fieldIndexes.put(LengthEncodedWriter.PRETOKENISED_TOKEN_FIELD, index++);
}

// control field
if (includeControlField) {
fieldIndexes.put(LengthEncodedWriter.CONTROL_FIELD_NAME, index);
fieldIndexes.put(LengthEncodedWriter.CONTROL_FIELD_NAME, index++);
}

return fieldIndexes;

@@ -227,11 +274,7 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
* the time field and (sometimes) the control field
*/
protected int outputFieldCount() {
return analysisConfig.analysisFields().size() + (includeControlField ? 2 : 1);
}

protected Map<String, Integer> getOutputFieldIndexes() {
return outputFieldIndexes();
return inputFields().size() + (includeControlField ? 1 : 0) + (includeTokensField ? 1 : 0);
}

/**

@@ -251,10 +294,12 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
inputOutputMap.add(new InputOutputMap(inIndex, outIndex));

for (String field : analysisConfig.analysisFields()) {
++outIndex;
inIndex = inFieldIndexes.get(field);
if (inIndex != null) {
inputOutputMap.add(new InputOutputMap(inIndex, outIndex));
if (AnalysisConfig.ML_CATEGORY_FIELD.equals(field) == false) {
++outIndex;
inIndex = inFieldIndexes.get(field);
if (inIndex != null) {
inputOutputMap.add(new InputOutputMap(inIndex, outIndex));
}
}
}
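A hypothetical same-package snippet showing what the static helper above produces; the token list, and therefore the resulting string, is illustrative and assumes an analyzer that splits the message into the tokens shown:

String csvTokens = AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "message",
        "Node node-1 stopped unexpectedly");
// csvTokens would be "Node,node-1,stopped,unexpectedly"; a token containing a comma or a quote
// would be quoted by the CsvEncoder so the C++ side can split the field unambiguously.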
@@ -76,34 +76,28 @@ public class ControlMsgToProcessWriter {
private static AtomicLong ms_FlushNumber = new AtomicLong(1);

private final LengthEncodedWriter lengthEncodedWriter;
private final int numberOfAnalysisFields;
private final int numberOfFields;

/**
* Construct the control message writer with a LengthEncodedWriter
*
* @param lengthEncodedWriter
* the writer
* @param numberOfAnalysisFields
* The number of fields configured for analysis not including the
* time field
* @param lengthEncodedWriter The writer
* @param numberOfFields The number of fields the process expects in each record
*/
public ControlMsgToProcessWriter(LengthEncodedWriter lengthEncodedWriter, int numberOfAnalysisFields) {
public ControlMsgToProcessWriter(LengthEncodedWriter lengthEncodedWriter, int numberOfFields) {
this.lengthEncodedWriter = Objects.requireNonNull(lengthEncodedWriter);
this.numberOfAnalysisFields= numberOfAnalysisFields;
this.numberOfFields = numberOfFields;
}

/**
* Create the control message writer with a OutputStream. A
* LengthEncodedWriter is created on the OutputStream parameter
*
* @param os
* the output stream
* @param numberOfAnalysisFields
* The number of fields configured for analysis not including the
* time field
* @param os The output stream
* @param numberOfFields The number of fields the process expects in each record
*/
public static ControlMsgToProcessWriter create(OutputStream os, int numberOfAnalysisFields) {
return new ControlMsgToProcessWriter(new LengthEncodedWriter(os), numberOfAnalysisFields);
public static ControlMsgToProcessWriter create(OutputStream os, int numberOfFields) {
return new ControlMsgToProcessWriter(new LengthEncodedWriter(os), numberOfFields);
}

/**

@@ -227,12 +221,10 @@ public class ControlMsgToProcessWriter {
*/
private void writeMessage(String message) throws IOException {

// The fields consist of all the analysis fields plus the time and the
// control field, hence + 2
lengthEncodedWriter.writeNumFields(numberOfAnalysisFields + 2);
lengthEncodedWriter.writeNumFields(numberOfFields);

// Write blank values for all analysis fields and the time
for (int i = -1; i < numberOfAnalysisFields; ++i) {
// Write blank values for all fields other than the control field
for (int i = 1; i < numberOfFields; ++i) {
lengthEncodedWriter.writeField("");
}
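For orientation, the shape of a control record after this change, reusing the four-field example from the process factory above; the "f1" flush code shown in the comment is illustrative:

ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
String flushId = writer.writeFlushMessage();
// The process receives a four-field record in which only the last (control) field is populated,
// e.g. ["", "", "", "f1"] - three blanks followed by the flush control message.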
@@ -9,6 +9,7 @@ import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.DataDescription;
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;

@@ -55,10 +56,10 @@ class CsvDataToProcessWriter extends AbstractDataToProcessWriter {
*/
private static final int MAX_LINES_PER_RECORD = 10000;

CsvDataToProcessWriter(boolean includeControlField, AutodetectProcess autodetectProcess,
DataDescription dataDescription, AnalysisConfig analysisConfig,
DataCountsReporter dataCountsReporter) {
super(includeControlField, autodetectProcess, dataDescription, analysisConfig, dataCountsReporter, LOGGER);
CsvDataToProcessWriter(boolean includeControlField, boolean includeTokensField, AutodetectProcess autodetectProcess,
DataDescription dataDescription, AnalysisConfig analysisConfig,
DataCountsReporter dataCountsReporter) {
super(includeControlField, includeTokensField, autodetectProcess, dataDescription, analysisConfig, dataCountsReporter, LOGGER);
}

/**

@@ -68,7 +69,8 @@ class CsvDataToProcessWriter extends AbstractDataToProcessWriter {
* header an exception is thrown
*/
@Override
public void write(InputStream inputStream, XContentType xContentType, BiConsumer<DataCounts, Exception> handler) throws IOException {
public void write(InputStream inputStream, CategorizationAnalyzer categorizationAnalyzer, XContentType xContentType,
BiConsumer<DataCounts, Exception> handler) throws IOException {
CsvPreference csvPref = new CsvPreference.Builder(
dataDescription.getQuoteCharacter(),
dataDescription.getFieldDelimiter(),

@@ -88,13 +90,11 @@ class CsvDataToProcessWriter extends AbstractDataToProcessWriter {

buildFieldIndexMapping(header);

// backing array for the inputIndex
String[] inputRecord = new String[header.length];

int maxIndex = 0;
for (Integer index : inFieldIndexes.values()) {
maxIndex = Math.max(index, maxIndex);
}
Integer categorizationFieldIndex = inFieldIndexes.get(analysisConfig.getCategorizationFieldName());

int numFields = outputFieldCount();
String[] record = new String[numFields];

@@ -122,7 +122,9 @@ class CsvDataToProcessWriter extends AbstractDataToProcessWriter {
}
}

fillRecordFromLine(line, inputRecord);
if (categorizationAnalyzer != null && categorizationFieldIndex != null && categorizationFieldIndex < line.size()) {
tokenizeForCategorization(categorizationAnalyzer, line.get(categorizationFieldIndex), record);
}
transformTimeAndWrite(record, inputFieldCount);
}

@@ -134,16 +136,6 @@ class CsvDataToProcessWriter extends AbstractDataToProcessWriter {
}
}

private static void fillRecordFromLine(List<String> line, String[] record) {
Arrays.fill(record, "");
for (int i = 0; i < Math.min(line.size(), record.length); i++) {
String value = line.get(i);
if (value != null) {
record[i] = value;
}
}
}

@Override
protected boolean checkForMissingFields(Collection<String> inputFields, Map<String, Integer> inputFieldIndexes, String[] header) {
for (String field : inputFields) {
@@ -6,6 +6,7 @@
package org.elasticsearch.xpack.ml.job.process.autodetect.writer;

import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.process.autodetect.state.DataCounts;

import java.io.IOException;

@@ -31,7 +32,7 @@ public interface DataToProcessWriter {
* <code>DataDescription</code>s timeField is missing from the CSV header
* a <code>MissingFieldException</code> is thrown
*/
void write(InputStream inputStream, XContentType xContentType,
void write(InputStream inputStream, CategorizationAnalyzer categorizationAnalyzer, XContentType xContentType,
BiConsumer<DataCounts, Exception> handler) throws IOException;

/**
@@ -18,7 +18,6 @@ import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcess;
public final class DataToProcessWriterFactory {

private DataToProcessWriterFactory() {

}

/**

@@ -28,16 +27,16 @@ public final class DataToProcessWriterFactory {
* @return A {@link JsonDataToProcessWriter} if the data format is JSON or
* otherwise a {@link CsvDataToProcessWriter}
*/
public static DataToProcessWriter create(boolean includeControlField,
public static DataToProcessWriter create(boolean includeControlField, boolean includeTokensField,
AutodetectProcess autodetectProcess, DataDescription dataDescription,
AnalysisConfig analysisConfig, DataCountsReporter dataCountsReporter,
NamedXContentRegistry xContentRegistry) {
switch (dataDescription.getFormat()) {
case XCONTENT:
return new JsonDataToProcessWriter(includeControlField, autodetectProcess,
return new JsonDataToProcessWriter(includeControlField, includeTokensField, autodetectProcess,
dataDescription, analysisConfig, dataCountsReporter, xContentRegistry);
case DELIMITED:
return new CsvDataToProcessWriter(includeControlField, autodetectProcess,
return new CsvDataToProcessWriter(includeControlField, includeTokensField, autodetectProcess,
dataDescription, analysisConfig, dataCountsReporter);
default:
throw new IllegalArgumentException();
@@ -10,6 +10,7 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.calendars.ScheduledEvent;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.DefaultDetectorDescription;

@@ -70,8 +71,11 @@ public class FieldConfigWriter {
writeDetectors(contents);
writeFilters(contents);
writeScheduledEvents(contents);
writeAsEnumeratedSettings(CATEGORIZATION_FILTER_PREFIX, config.getCategorizationFilters(),
contents, true);

if (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA == false) {
writeAsEnumeratedSettings(CATEGORIZATION_FILTER_PREFIX, config.getCategorizationFilters(),
contents, true);
}

// As values are written as entire settings rather than part of a
// clause no quoting is needed
@ -12,6 +12,7 @@ import org.elasticsearch.common.xcontent.NamedXContentRegistry;
|
|||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;
|
||||
|
@ -38,10 +39,10 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
private static final Logger LOGGER = Loggers.getLogger(JsonDataToProcessWriter.class);
|
||||
private NamedXContentRegistry xContentRegistry;
|
||||
|
||||
JsonDataToProcessWriter(boolean includeControlField, AutodetectProcess autodetectProcess,
|
||||
DataDescription dataDescription, AnalysisConfig analysisConfig,
|
||||
DataCountsReporter dataCountsReporter, NamedXContentRegistry xContentRegistry) {
|
||||
super(includeControlField, autodetectProcess, dataDescription, analysisConfig,
|
||||
JsonDataToProcessWriter(boolean includeControlField, boolean includeTokensField, AutodetectProcess autodetectProcess,
|
||||
DataDescription dataDescription, AnalysisConfig analysisConfig,
|
||||
DataCountsReporter dataCountsReporter, NamedXContentRegistry xContentRegistry) {
|
||||
super(includeControlField, includeTokensField, autodetectProcess, dataDescription, analysisConfig,
|
||||
dataCountsReporter, LOGGER);
|
||||
this.xContentRegistry = xContentRegistry;
|
||||
}
|
||||
|
@ -54,14 +55,15 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
* timeField is missing from the JOSN inputIndex an exception is thrown
|
||||
*/
|
||||
@Override
|
||||
public void write(InputStream inputStream, XContentType xContentType, BiConsumer<DataCounts, Exception> handler)
|
||||
public void write(InputStream inputStream, CategorizationAnalyzer categorizationAnalyzer, XContentType xContentType,
|
||||
BiConsumer<DataCounts, Exception> handler)
|
||||
throws IOException {
|
||||
dataCountsReporter.startNewIncrementalCount();
|
||||
|
||||
if (xContentType.equals(XContentType.JSON)) {
|
||||
writeJsonXContent(inputStream);
|
||||
writeJsonXContent(categorizationAnalyzer, inputStream);
|
||||
} else if (xContentType.equals(XContentType.SMILE)) {
|
||||
writeSmileXContent(inputStream);
|
||||
writeSmileXContent(categorizationAnalyzer, inputStream);
|
||||
} else {
|
||||
throw new RuntimeException("XContentType [" + xContentType
|
||||
+ "] is not supported by JsonDataToProcessWriter");
|
||||
|
@ -75,14 +77,14 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
));
|
||||
}
|
||||
|
||||
private void writeJsonXContent(InputStream inputStream) throws IOException {
|
||||
private void writeJsonXContent(CategorizationAnalyzer categorizationAnalyzer, InputStream inputStream) throws IOException {
|
||||
try (XContentParser parser = XContentFactory.xContent(XContentType.JSON)
|
||||
.createParser(xContentRegistry, inputStream)) {
|
||||
writeJson(parser);
|
||||
writeJson(categorizationAnalyzer, parser);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeSmileXContent(InputStream inputStream) throws IOException {
|
||||
private void writeSmileXContent(CategorizationAnalyzer categorizationAnalyzer, InputStream inputStream) throws IOException {
|
||||
while (true) {
|
||||
byte[] nextObject = findNextObject(XContentType.SMILE.xContent().streamSeparator(), inputStream);
|
||||
if (nextObject.length == 0) {
|
||||
|
@ -90,7 +92,7 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
}
|
||||
try (XContentParser parser = XContentFactory.xContent(XContentType.SMILE)
|
||||
.createParser(xContentRegistry, nextObject)) {
|
||||
writeJson(parser);
|
||||
writeJson(categorizationAnalyzer, parser);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -121,20 +123,20 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
return new byte[0];
|
||||
}
|
||||
|
||||
private void writeJson(XContentParser parser) throws IOException {
|
||||
Collection<String> analysisFields = inputFields();
|
||||
private void writeJson(CategorizationAnalyzer categorizationAnalyzer, XContentParser parser) throws IOException {
|
||||
Collection<String> inputFields = inputFields();
|
||||
|
||||
buildFieldIndexMapping(analysisFields.toArray(new String[0]));
|
||||
buildFieldIndexMapping(inputFields.toArray(new String[0]));
|
||||
|
||||
int numFields = outputFieldCount();
|
||||
String[] input = new String[numFields];
|
||||
String[] record = new String[numFields];
|
||||
|
||||
// We never expect to get the control field
|
||||
boolean[] gotFields = new boolean[analysisFields.size()];
|
||||
// We never expect to get the control field or categorization tokens field
|
||||
boolean[] gotFields = new boolean[inputFields.size()];
|
||||
|
||||
XContentRecordReader recordReader = new XContentRecordReader(parser, inFieldIndexes,
|
||||
LOGGER);
|
||||
XContentRecordReader recordReader = new XContentRecordReader(parser, inFieldIndexes, LOGGER);
|
||||
Integer categorizationFieldIndex = inFieldIndexes.get(analysisConfig.getCategorizationFieldName());
|
||||
long inputFieldCount = recordReader.read(input, gotFields);
|
||||
while (inputFieldCount >= 0) {
|
||||
Arrays.fill(record, "");
|
||||
|
@ -151,6 +153,9 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
record[inOut.outputIndex] = (field == null) ? "" : field;
|
||||
}
|
||||
|
||||
if (categorizationAnalyzer != null && categorizationFieldIndex != null) {
|
||||
tokenizeForCategorization(categorizationAnalyzer, input[categorizationFieldIndex], record);
|
||||
}
|
||||
transformTimeAndWrite(record, inputFieldCount);
|
||||
|
||||
inputFieldCount = recordReader.read(input, gotFields);
|
||||
|
@ -174,8 +179,8 @@ class JsonDataToProcessWriter extends AbstractDataToProcessWriter {
|
|||
private static long missingFieldCount(boolean[] gotFieldFlags) {
|
||||
long count = 0;
|
||||
|
||||
for (int i = 0; i < gotFieldFlags.length; i++) {
|
||||
if (gotFieldFlags[i] == false) {
|
||||
for (boolean gotFieldFlag : gotFieldFlags) {
|
||||
if (gotFieldFlag == false) {
|
||||
++count;
|
||||
}
|
||||
}
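For orientation, a rough sketch of how a caller might drive the new write signature above - the analyzer is built from the job's categorization analyzer configuration and closed by the caller. The accessor name getCategorizationAnalyzerConfig() and the surrounding wiring are assumptions; the real call site (AutodetectCommunicator) is not part of these hunks.

    // Hypothetical caller sketch, not the actual wiring in this commit.
    void writeWithCategorization(JsonDataToProcessWriter writer, InputStream inputStream,
                                 AnalysisRegistry analysisRegistry, Environment environment,
                                 AnalysisConfig analysisConfig) throws IOException {
        // CategorizationAnalyzer is Closeable, so it is created per write and released afterwards.
        try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(
                analysisRegistry, environment, analysisConfig.getCategorizationAnalyzerConfig())) {
            writer.write(inputStream, categorizationAnalyzer, XContentType.JSON,
                    (dataCounts, e) -> { /* handle the per-write result or failure */ });
        }
    }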
@ -9,7 +9,9 @@ import org.elasticsearch.action.bulk.BulkRequestBuilder;
|
|||
import org.elasticsearch.action.bulk.BulkResponse;
|
||||
import org.elasticsearch.action.index.IndexRequest;
|
||||
import org.elasticsearch.action.support.WriteRequest;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.xpack.ml.MachineLearning;
|
||||
import org.elasticsearch.xpack.ml.datafeed.DatafeedConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
|
@ -19,10 +21,10 @@ import org.elasticsearch.xpack.ml.job.results.CategoryDefinition;
|
|||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.is;
|
||||
|
@ -38,7 +40,7 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
private long nowMillis;
|
||||
|
||||
@Before
|
||||
public void setUpData() throws IOException {
|
||||
public void setUpData() {
|
||||
client().admin().indices().prepareCreate(DATA_INDEX)
|
||||
.addMapping(DATA_TYPE, "time", "type=date,format=epoch_millis",
|
||||
"msg", "type=text")
|
||||
|
@ -75,7 +77,7 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
}
|
||||
|
||||
@After
|
||||
public void tearDownData() throws Exception {
|
||||
public void tearDownData() {
|
||||
cleanUp();
|
||||
client().admin().indices().prepareDelete(DATA_INDEX).get();
|
||||
client().admin().indices().prepareRefresh("*").get();
|
||||
|
@ -89,8 +91,8 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
|
||||
String datafeedId = job.getId() + "-feed";
|
||||
DatafeedConfig.Builder datafeedConfig = new DatafeedConfig.Builder(datafeedId, job.getId());
|
||||
datafeedConfig.setIndices(Arrays.asList(DATA_INDEX));
|
||||
datafeedConfig.setTypes(Arrays.asList(DATA_TYPE));
|
||||
datafeedConfig.setIndices(Collections.singletonList(DATA_INDEX));
|
||||
datafeedConfig.setTypes(Collections.singletonList(DATA_TYPE));
|
||||
DatafeedConfig datafeed = datafeedConfig.build();
|
||||
registerDatafeed(datafeed);
|
||||
putDatafeed(datafeed);
|
||||
|
@ -108,13 +110,13 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
CategoryDefinition category2 = categories.get(1);
|
||||
assertThat(category2.getRegex(), equalTo(".*?Failed.+?to.+?shutdown.+?error.+?" +
|
||||
"org.aaaa.bbbb.Cccc.+?line.+?caused.+?by.+?foo.+?exception.*"));
|
||||
assertThat(category2.getExamples(), equalTo(Arrays.asList(
|
||||
assertThat(category2.getExamples(), equalTo(Collections.singletonList(
|
||||
"Failed to shutdown [error org.aaaa.bbbb.Cccc line 54 caused by foo exception]")));
|
||||
|
||||
CategoryDefinition category3 = categories.get(2);
|
||||
assertThat(category3.getRegex(), equalTo(".*?Failed.+?to.+?shutdown.+?error.+?but.+?" +
|
||||
"this.+?time.+?completely.+?different.*"));
|
||||
assertThat(category3.getExamples(), equalTo(Arrays.asList(
|
||||
assertThat(category3.getExamples(), equalTo(Collections.singletonList(
|
||||
"Failed to shutdown [error but this time completely different]")));
|
||||
|
||||
openJob("categorization");
|
||||
|
@ -128,15 +130,15 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
}
|
||||
|
||||
public void testCategorizationWithFilters() throws Exception {
|
||||
Job.Builder job = newJobBuilder("categorization-with-filters", Arrays.asList("\\[.*\\]"));
|
||||
Job.Builder job = newJobBuilder("categorization-with-filters", Collections.singletonList("\\[.*\\]"));
|
||||
registerJob(job);
|
||||
putJob(job);
|
||||
openJob(job.getId());
|
||||
|
||||
String datafeedId = job.getId() + "-feed";
|
||||
DatafeedConfig.Builder datafeedConfig = new DatafeedConfig.Builder(datafeedId, job.getId());
|
||||
datafeedConfig.setIndices(Arrays.asList(DATA_INDEX));
|
||||
datafeedConfig.setTypes(Arrays.asList(DATA_TYPE));
|
||||
datafeedConfig.setIndices(Collections.singletonList(DATA_INDEX));
|
||||
datafeedConfig.setTypes(Collections.singletonList(DATA_TYPE));
|
||||
DatafeedConfig datafeed = datafeedConfig.build();
|
||||
registerDatafeed(datafeed);
|
||||
putDatafeed(datafeed);
|
||||
|
@ -158,12 +160,60 @@ public class CategorizationIT extends MlNativeAutodetectIntegTestCase {
|
|||
"Failed to shutdown [error org.aaaa.bbbb.Cccc line 54 caused by foo exception]")));
|
||||
}
|
||||
|
||||
public void testCategorizationPerformance() {
|
||||
// To compare Java/C++ tokenization performance:
|
||||
// 1. Change false to true in this assumption
|
||||
// 2. Run the test several times
|
||||
// 3. Change MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA to false
|
||||
// 4. Run the test several more times
|
||||
// 5. Check the timings that get logged
|
||||
// 6. Revert the changes to this assumption and MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
|
||||
assumeTrue("This is time consuming to run on every build - it should be run manually when comparing Java/C++ tokenization",
|
||||
false);
|
||||
|
||||
int testBatchSize = 1000;
|
||||
int testNumBatches = 1000;
|
||||
String[] possibleMessages = new String[] {
|
||||
"<sol13m-9402.1.p2ps: Info: Tue Apr 06 19:00:16 2010> Source LOTS on 33080:817 has shut down.<END>",
|
||||
"<lnl00m-8601.1.p2ps: Alert: Tue Apr 06 18:57:24 2010> P2PS failed to connect to the hrm server. "
|
||||
+ "Reason: Failed to connect to hrm server - No ACK from SIPC<END>",
|
||||
"<sol00m-8607.1.p2ps: Debug: Tue Apr 06 18:56:43 2010> Did not receive an image data for IDN_SELECTFEED:7630.T on 493. "
|
||||
+ "Recalling item. <END>",
|
||||
"<lnl13m-8602.1.p2ps.rrcpTransport.0.sinkSide.rrcp.transmissionBus: Warning: Tue Apr 06 18:36:32 2010> "
|
||||
+ "RRCP STATUS MSG: RRCP_REBOOT: node 33191 has rebooted<END>",
|
||||
"<sol00m-8608.1.p2ps: Info: Tue Apr 06 18:30:02 2010> Source PRISM_VOBr on 33069:757 has shut down.<END>",
|
||||
"<lnl06m-9402.1.p2ps: Info: Thu Mar 25 18:30:01 2010> Service PRISM_VOB has shut down.<END>"
|
||||
};
|
||||
|
||||
String jobId = "categorization-performance";
|
||||
Job.Builder job = newJobBuilder(jobId, Collections.emptyList());
|
||||
registerJob(job);
|
||||
putJob(job);
|
||||
openJob(job.getId());
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
for (int batchNum = 0; batchNum < testNumBatches; ++batchNum) {
|
||||
StringBuilder json = new StringBuilder(testBatchSize * 100);
|
||||
for (int docNum = 0; docNum < testBatchSize; ++docNum) {
|
||||
json.append(String.format(Locale.ROOT, "{\"time\":1000000,\"msg\":\"%s\"}\n",
|
||||
possibleMessages[docNum % possibleMessages.length]));
|
||||
}
|
||||
postData(jobId, json.toString());
|
||||
}
|
||||
flushJob(jobId, false);
|
||||
|
||||
long duration = System.currentTimeMillis() - startTime;
|
||||
Loggers.getLogger(CategorizationIT.class).info("Performance test with tokenization in " +
|
||||
(MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA ? "Java" : "C++") + " took " + duration + "ms");
|
||||
}
|
||||
|
||||
private static Job.Builder newJobBuilder(String id, List<String> categorizationFilters) {
|
||||
Detector.Builder detector = new Detector.Builder();
|
||||
detector.setFunction("count");
|
||||
detector.setByFieldName("mlcategory");
|
||||
AnalysisConfig.Builder analysisConfig = new AnalysisConfig.Builder(
|
||||
Arrays.asList(detector.build()));
|
||||
Collections.singletonList(detector.build()));
|
||||
analysisConfig.setBucketSpan(TimeValue.timeValueHours(1));
|
||||
analysisConfig.setCategorizationFieldName("msg");
|
||||
analysisConfig.setCategorizationFilters(categorizationFilters);
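The integration tests above still exercise the older categorization_filters route. A job that exercises the new Java-side analyzer instead would swap the filters for a CategorizationAnalyzerConfig - a sketch only, mirroring newJobBuilder above and using the setter this change adds to AnalysisConfig.Builder:

    Detector.Builder detector = new Detector.Builder();
    detector.setFunction("count");
    detector.setByFieldName("mlcategory");
    AnalysisConfig.Builder analysisConfig = new AnalysisConfig.Builder(
            Collections.singletonList(detector.build()));
    analysisConfig.setBucketSpan(TimeValue.timeValueHours(1));
    analysisConfig.setCategorizationFieldName("msg");
    // Roughly equivalent to categorization_filters ["\\[.*\\]"], but expressed through the
    // default categorization analyzer so the filtering happens during Java-side tokenization.
    analysisConfig.setCategorizationAnalyzerConfig(
            CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(
                    Collections.singletonList("\\[.*\\]")));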
@ -17,12 +17,15 @@ import org.elasticsearch.cluster.service.ClusterService;
|
|||
import org.elasticsearch.common.settings.ClusterSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.ml.MLMetadataField;
|
||||
import org.elasticsearch.xpack.ml.MachineLearningClientActionPlugin;
|
||||
import org.elasticsearch.xpack.ml.MlMetadata;
|
||||
import org.elasticsearch.xpack.ml.action.PutJobAction;
|
||||
import org.elasticsearch.xpack.ml.action.util.QueryPage;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
import org.elasticsearch.xpack.ml.job.config.Detector;
|
||||
|
@ -33,6 +36,7 @@ import org.junit.Before;
|
|||
import org.mockito.ArgumentCaptor;
|
||||
import org.mockito.Matchers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
|
||||
|
@ -47,13 +51,18 @@ import static org.mockito.Mockito.when;
|
|||
|
||||
public class JobManagerTests extends ESTestCase {
|
||||
|
||||
private Environment environment;
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private Client client;
|
||||
private ClusterService clusterService;
|
||||
private JobProvider jobProvider;
|
||||
private Auditor auditor;
|
||||
|
||||
@Before
|
||||
public void setupMocks() {
|
||||
public void setup() throws Exception {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
|
||||
client = mock(Client.class);
|
||||
clusterService = mock(ClusterService.class);
|
||||
jobProvider = mock(JobProvider.class);
|
||||
|
@ -92,7 +101,7 @@ public class JobManagerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public void testPutJob_AddsCreateTime() {
|
||||
public void testPutJob_AddsCreateTime() throws IOException {
|
||||
JobManager jobManager = createJobManager();
|
||||
PutJobAction.Request putJobRequest = new PutJobAction.Request(createJob());
|
||||
|
||||
|
@ -111,7 +120,7 @@ public class JobManagerTests extends ESTestCase {
|
|||
|
||||
ClusterState clusterState = createClusterState();
|
||||
|
||||
jobManager.putJob(putJobRequest, clusterState, new ActionListener<PutJobAction.Response>() {
|
||||
jobManager.putJob(putJobRequest, analysisRegistry, clusterState, new ActionListener<PutJobAction.Response>() {
|
||||
@Override
|
||||
public void onResponse(PutJobAction.Response response) {
|
||||
Job job = requestCaptor.getValue();
|
||||
|
@ -129,7 +138,7 @@ public class JobManagerTests extends ESTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
public void testPutJob_ThrowsIfJobExists() {
|
||||
public void testPutJob_ThrowsIfJobExists() throws IOException {
|
||||
JobManager jobManager = createJobManager();
|
||||
PutJobAction.Request putJobRequest = new PutJobAction.Request(createJob());
|
||||
|
||||
|
@ -138,7 +147,7 @@ public class JobManagerTests extends ESTestCase {
|
|||
ClusterState clusterState = ClusterState.builder(new ClusterName("name"))
|
||||
.metaData(MetaData.builder().putCustom(MLMetadataField.TYPE, mlMetadata.build())).build();
|
||||
|
||||
jobManager.putJob(putJobRequest, clusterState, new ActionListener<PutJobAction.Response>() {
|
||||
jobManager.putJob(putJobRequest, analysisRegistry, clusterState, new ActionListener<PutJobAction.Response>() {
|
||||
@Override
|
||||
public void onResponse(PutJobAction.Response response) {
|
||||
fail("should have got an error");
|
||||
|
@ -164,12 +173,11 @@ public class JobManagerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
private JobManager createJobManager() {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
ClusterSettings clusterSettings = new ClusterSettings(settings,
|
||||
ClusterSettings clusterSettings = new ClusterSettings(environment.settings(),
|
||||
Collections.singleton(MachineLearningClientActionPlugin.MAX_MODEL_MEMORY_LIMIT));
|
||||
when(clusterService.getClusterSettings()).thenReturn(clusterSettings);
|
||||
UpdateJobProcessNotifier notifier = mock(UpdateJobProcessNotifier.class);
|
||||
return new JobManager(settings, jobProvider, clusterService, auditor, client, notifier);
|
||||
return new JobManager(environment, environment.settings(), jobProvider, clusterService, auditor, client, notifier);
|
||||
}
|
||||
|
||||
private ClusterState createClusterState() {
@ -0,0 +1,190 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.job.categorization;
|
||||
|
||||
import org.elasticsearch.analysis.common.CommonAnalysisPlugin;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.XPackPlugin;
|
||||
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class CategorizationAnalyzerTests extends ESTestCase {
|
||||
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private Environment environment;
|
||||
|
||||
public static AnalysisRegistry buildTestAnalysisRegistry(Environment environment) throws Exception {
|
||||
CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();
|
||||
XPackPlugin xPackPlugin = new XPackPlugin(environment.settings(), environment.configFile());
|
||||
return new AnalysisModule(environment, Arrays.asList(commonAnalysisPlugin, xPackPlugin)).getAnalysisRegistry();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
analysisRegistry = buildTestAnalysisRegistry(environment);
|
||||
}
|
||||
|
||||
// The default categorization analyzer matches the tokenization performed by the ML C++ code
|
||||
public void testDefaultCategorizationAnalyzer() throws IOException {
|
||||
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
|
||||
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {
|
||||
|
||||
assertEquals(Arrays.asList("sol13m-8608.1.p2ps", "Info", "Source", "AES_SERVICE2", "on", "has", "shut", "down"),
|
||||
categorizationAnalyzer.tokenizeField("p2ps",
|
||||
"<sol13m-8608.1.p2ps: Info: > Source AES_SERVICE2 on 33122:967 has shut down."));
|
||||
|
||||
assertEquals(Arrays.asList("Vpxa", "verbose", "VpxaHalCnxHostagent", "opID", "WFU-ddeadb59", "WaitForUpdatesDone", "Received",
|
||||
"callback"),
|
||||
categorizationAnalyzer.tokenizeField("vmware",
|
||||
"Vpxa: [49EC0B90 verbose 'VpxaHalCnxHostagent' opID=WFU-ddeadb59] [WaitForUpdatesDone] Received callback"));
|
||||
|
||||
assertEquals(Arrays.asList("org.apache.coyote.http11.Http11BaseProtocol", "destroy"),
|
||||
categorizationAnalyzer.tokenizeField("apache",
|
||||
"org.apache.coyote.http11.Http11BaseProtocol destroy"));
|
||||
|
||||
assertEquals(Arrays.asList("INFO", "session", "PROXY", "Session", "DESTROYED"),
|
||||
categorizationAnalyzer.tokenizeField("proxy",
|
||||
" [1111529792] INFO session <45409105041220090733@62.218.251.123> - " +
|
||||
"----------------- PROXY Session DESTROYED --------------------"));
|
||||
|
||||
assertEquals(Arrays.asList("PSYoungGen", "total", "used"),
|
||||
categorizationAnalyzer.tokenizeField("java",
|
||||
"PSYoungGen total 2572800K, used 1759355K [0x0000000759500000, 0x0000000800000000, 0x0000000800000000)"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testDefaultCategorizationAnalyzerWithCategorizationFilter() throws IOException {
|
||||
// A categorization filter that removes stuff in square brackets
|
||||
CategorizationAnalyzerConfig defaultConfigWithCategorizationFilter =
|
||||
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Collections.singletonList("\\[[^\\]]*\\]"));
|
||||
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment,
|
||||
defaultConfigWithCategorizationFilter)) {
|
||||
|
||||
assertEquals(Arrays.asList("sol13m-8608.1.p2ps", "Info", "Source", "AES_SERVICE2", "on", "has", "shut", "down"),
|
||||
categorizationAnalyzer.tokenizeField("p2ps",
|
||||
"<sol13m-8608.1.p2ps: Info: > Source AES_SERVICE2 on 33122:967 has shut down."));
|
||||
|
||||
assertEquals(Arrays.asList("Vpxa", "Received", "callback"),
|
||||
categorizationAnalyzer.tokenizeField("vmware",
|
||||
"Vpxa: [49EC0B90 verbose 'VpxaHalCnxHostagent' opID=WFU-ddeadb59] [WaitForUpdatesDone] Received callback"));
|
||||
|
||||
assertEquals(Arrays.asList("org.apache.coyote.http11.Http11BaseProtocol", "destroy"),
|
||||
categorizationAnalyzer.tokenizeField("apache",
|
||||
"org.apache.coyote.http11.Http11BaseProtocol destroy"));
|
||||
|
||||
assertEquals(Arrays.asList("INFO", "session", "PROXY", "Session", "DESTROYED"),
|
||||
categorizationAnalyzer.tokenizeField("proxy",
|
||||
" [1111529792] INFO session <45409105041220090733@62.218.251.123> - " +
|
||||
"----------------- PROXY Session DESTROYED --------------------"));
|
||||
|
||||
assertEquals(Arrays.asList("PSYoungGen", "total", "used"),
|
||||
categorizationAnalyzer.tokenizeField("java",
|
||||
"PSYoungGen total 2572800K, used 1759355K [0x0000000759500000, 0x0000000800000000, 0x0000000800000000)"));
|
||||
}
|
||||
}
|
||||
|
||||
// The Elasticsearch standard analyzer - this is the default for indexing in Elasticsearch, but
// NOT for ML categorization: it lowercases everything, splits tokens such as hostnames and dotted
// class names on punctuation, and keeps purely numeric tokens that the default categorization
// analyzer discards (compare the expected results below with those in the tests above)
|
||||
public void testStandardAnalyzer() throws IOException {
|
||||
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard").build();
|
||||
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
|
||||
|
||||
assertEquals(Arrays.asList("sol13m", "8608.1", "p2ps", "info", "source", "aes_service2", "on", "33122", "967", "has", "shut",
|
||||
"down"),
|
||||
categorizationAnalyzer.tokenizeField("p2ps",
|
||||
"<sol13m-8608.1.p2ps: Info: > Source AES_SERVICE2 on 33122:967 has shut down."));
|
||||
|
||||
assertEquals(Arrays.asList("vpxa", "49ec0b90", "verbose", "vpxahalcnxhostagent", "opid", "wfu", "ddeadb59",
|
||||
"waitforupdatesdone", "received", "callback"),
|
||||
categorizationAnalyzer.tokenizeField("vmware",
|
||||
"Vpxa: [49EC0B90 verbose 'VpxaHalCnxHostagent' opID=WFU-ddeadb59] [WaitForUpdatesDone] Received callback"));
|
||||
|
||||
assertEquals(Arrays.asList("org.apache.coyote.http11", "http11baseprotocol", "destroy"),
|
||||
categorizationAnalyzer.tokenizeField("apache",
|
||||
"org.apache.coyote.http11.Http11BaseProtocol destroy"));
|
||||
|
||||
assertEquals(Arrays.asList("1111529792", "info", "session", "45409105041220090733", "62.218.251.123", "proxy", "session",
|
||||
"destroyed"),
|
||||
categorizationAnalyzer.tokenizeField("proxy",
|
||||
" [1111529792] INFO session <45409105041220090733@62.218.251.123> - " +
|
||||
"----------------- PROXY Session DESTROYED --------------------"));
|
||||
|
||||
assertEquals(Arrays.asList("psyounggen", "total", "2572800k", "used", "1759355k", "0x0000000759500000", "0x0000000800000000",
|
||||
"0x0000000800000000"),
|
||||
categorizationAnalyzer.tokenizeField("java",
|
||||
"PSYoungGen total 2572800K, used 1759355K [0x0000000759500000, 0x0000000800000000, 0x0000000800000000)"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testCustomAnalyzer() throws IOException {
|
||||
Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
|
||||
ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
|
||||
ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
|
||||
Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
|
||||
ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
|
||||
ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
|
||||
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(ignoreStuffInSqaureBrackets)
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
|
||||
.addTokenFilter("snowball")
|
||||
.build();
|
||||
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
|
||||
|
||||
assertEquals(Arrays.asList("sol13m-8608.1.p2p", "info", "sourc", "aes_service2", "on", "has", "shut", "down"),
|
||||
categorizationAnalyzer.tokenizeField("p2ps",
|
||||
"<sol13m-8608.1.p2ps: Info: > Source AES_SERVICE2 on 33122:967 has shut down."));
|
||||
|
||||
assertEquals(Arrays.asList("vpxa", "receiv", "callback"),
|
||||
categorizationAnalyzer.tokenizeField("vmware",
|
||||
"Vpxa: [49EC0B90 verbose 'VpxaHalCnxHostagent' opID=WFU-ddeadb59] [WaitForUpdatesDone] Received callback"));
|
||||
|
||||
assertEquals(Arrays.asList("org.apache.coyote.http11.http11baseprotocol", "destroy"),
|
||||
categorizationAnalyzer.tokenizeField("apache",
|
||||
"org.apache.coyote.http11.Http11BaseProtocol destroy"));
|
||||
|
||||
assertEquals(Arrays.asList("info", "session", "proxi", "session", "destroy"),
|
||||
categorizationAnalyzer.tokenizeField("proxy",
|
||||
" [1111529792] INFO session <45409105041220090733@62.218.251.123> - " +
|
||||
"----------------- PROXY Session DESTROYED --------------------"));
|
||||
|
||||
assertEquals(Arrays.asList("psyounggen", "total", "use"),
|
||||
categorizationAnalyzer.tokenizeField("java",
|
||||
"PSYoungGen total 2572800K, used 1759355K [0x0000000759500000, 0x0000000800000000, 0x0000000800000000)"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testThaiAnalyzer() throws IOException {
|
||||
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("thai").build();
|
||||
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
|
||||
|
||||
// An example from the ES docs - no idea what it means or whether it's remotely sensible from a categorization point-of-view
|
||||
assertEquals(Arrays.asList("แสดง", "งาน", "ดี"),
|
||||
categorizationAnalyzer.tokenizeField("thai",
|
||||
"การที่ได้ต้องแสดงว่างานดี"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testInvalidAnalyzer() {
|
||||
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist").build();
|
||||
expectThrows(IllegalArgumentException.class, () -> new CategorizationAnalyzer(analysisRegistry, environment, config));
|
||||
}
|
||||
}
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.job.categorization;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
|
||||
public class MlClassicTokenizerTests extends ESTestCase {
|
||||
|
||||
public void testTokenize() throws IOException {
|
||||
String testData = "one .-_two **stars**in**their**eyes** three.-_ sand.-_wich 4four five5 a1b2c3 42 www.elastic.co";
|
||||
try (Tokenizer tokenizer = new MlClassicTokenizer()) {
|
||||
tokenizer.setReader(new StringReader(testData));
|
||||
tokenizer.reset();
|
||||
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("one", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("two", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("stars", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("in", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("their", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("eyes", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("three", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("sand.-_wich", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("five5", term.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("www.elastic.co", term.toString());
|
||||
assertFalse(tokenizer.incrementToken());
|
||||
tokenizer.end();
|
||||
}
|
||||
}
|
||||
}
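The tokenizer exercised above is the one the default categorization analyzer is built around. A custom config that names it explicitly might look like the following sketch; the registration name "ml_classic" is an assumption, as the tokenizer registration itself is not shown in this diff:

    // Sketch: use the ML classic tokenizer directly in a custom categorization analyzer.
    // "ml_classic" is the assumed name under which MlClassicTokenizer is registered.
    CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder()
            .setTokenizer("ml_classic")
            .addTokenFilter("lowercase")   // optional extra filter, only to show chaining
            .build();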
@ -14,12 +14,13 @@ import org.elasticsearch.test.ESTestCase;
|
|||
import org.elasticsearch.xpack.ml.job.messages.Messages;
|
||||
import org.elasticsearch.xpack.ml.job.process.autodetect.writer.RecordWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
@ -50,7 +51,38 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
}
|
||||
if (isCategorization) {
|
||||
builder.setCategorizationFieldName(randomAlphaOfLength(10));
|
||||
builder.setCategorizationFilters(Arrays.asList(generateRandomStringArray(10, 10, false)));
|
||||
if (randomBoolean()) {
|
||||
builder.setCategorizationFilters(Arrays.asList(generateRandomStringArray(10, 10, false)));
|
||||
} else {
|
||||
CategorizationAnalyzerConfig.Builder analyzerBuilder = new CategorizationAnalyzerConfig.Builder();
|
||||
if (rarely()) {
|
||||
analyzerBuilder.setAnalyzer(randomAlphaOfLength(10));
|
||||
} else {
|
||||
if (randomBoolean()) {
|
||||
for (String pattern : generateRandomStringArray(3, 40, false)) {
|
||||
Map<String, Object> charFilter = new HashMap<>();
|
||||
charFilter.put("type", "pattern_replace");
|
||||
charFilter.put("pattern", pattern);
|
||||
analyzerBuilder.addCharFilter(charFilter);
|
||||
}
|
||||
}
|
||||
|
||||
Map<String, Object> tokenizer = new HashMap<>();
|
||||
tokenizer.put("type", "pattern");
|
||||
tokenizer.put("pattern", randomAlphaOfLength(10));
|
||||
analyzerBuilder.setTokenizer(tokenizer);
|
||||
|
||||
if (randomBoolean()) {
|
||||
for (String pattern : generateRandomStringArray(4, 40, false)) {
|
||||
Map<String, Object> tokenFilter = new HashMap<>();
|
||||
tokenFilter.put("type", "pattern_replace");
|
||||
tokenFilter.put("pattern", pattern);
|
||||
analyzerBuilder.addTokenFilter(tokenFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.setCategorizationAnalyzerConfig(analyzerBuilder.build());
|
||||
}
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
builder.setLatency(TimeValue.timeValueSeconds(randomIntBetween(1, 1_000_000)));
|
||||
|
@ -334,21 +366,21 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
}
|
||||
|
||||
public void testEquals_GivenSameReference() {
|
||||
AnalysisConfig config = createFullyPopulatedConfig();
|
||||
AnalysisConfig config = createFullyPopulatedNonRandomConfig();
|
||||
assertTrue(config.equals(config));
|
||||
}
|
||||
|
||||
public void testEquals_GivenDifferentClass() {
|
||||
assertFalse(createFullyPopulatedConfig().equals("a string"));
|
||||
assertFalse(createFullyPopulatedNonRandomConfig().equals("a string"));
|
||||
}
|
||||
|
||||
public void testEquals_GivenNull() {
|
||||
assertFalse(createFullyPopulatedConfig().equals(null));
|
||||
assertFalse(createFullyPopulatedNonRandomConfig().equals(null));
|
||||
}
|
||||
|
||||
public void testEquals_GivenEqualConfig() {
|
||||
AnalysisConfig config1 = createFullyPopulatedConfig();
|
||||
AnalysisConfig config2 = createFullyPopulatedConfig();
|
||||
AnalysisConfig config1 = createFullyPopulatedNonRandomConfig();
|
||||
AnalysisConfig config2 = createFullyPopulatedNonRandomConfig();
|
||||
|
||||
assertTrue(config1.equals(config2));
|
||||
assertTrue(config2.equals(config1));
|
||||
|
@ -471,14 +503,15 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
assertEquals(new HashSet<>(Arrays.asList("filter1", "filter2")), config.extractReferencedFilters());
|
||||
}
|
||||
|
||||
private static AnalysisConfig createFullyPopulatedConfig() {
|
||||
private static AnalysisConfig createFullyPopulatedNonRandomConfig() {
|
||||
Detector.Builder detector = new Detector.Builder("min", "count");
|
||||
detector.setOverFieldName("mlcategory");
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(
|
||||
Collections.singletonList(detector.build()));
|
||||
builder.setBucketSpan(TimeValue.timeValueHours(1));
|
||||
builder.setCategorizationFieldName("cat");
|
||||
builder.setCategorizationFilters(Collections.singletonList("foo"));
|
||||
builder.setCategorizationAnalyzerConfig(
|
||||
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Collections.singletonList("foo")));
|
||||
builder.setInfluencers(Collections.singletonList("myInfluencer"));
|
||||
builder.setLatency(TimeValue.timeValueSeconds(3600));
|
||||
builder.setSummaryCountFieldName("sumCount");
|
||||
|
@ -568,6 +601,26 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
analysisConfig.build();
|
||||
}
|
||||
|
||||
public void testVerify_GivenValidConfigWithCategorizationFieldNameAndCategorizationAnalyzerConfig() {
|
||||
AnalysisConfig.Builder analysisConfig = createValidCategorizationConfig();
|
||||
analysisConfig.setCategorizationAnalyzerConfig(
|
||||
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Arrays.asList("foo", "bar")));
|
||||
|
||||
analysisConfig.build();
|
||||
}
|
||||
|
||||
public void testVerify_GivenBothCategorizationFiltersAndCategorizationAnalyzerConfig() {
|
||||
AnalysisConfig.Builder analysisConfig = createValidCategorizationConfig();
|
||||
analysisConfig.setCategorizationFilters(Arrays.asList("foo", "bar"));
|
||||
analysisConfig.setCategorizationAnalyzerConfig(
|
||||
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Collections.singletonList("baz")));
|
||||
|
||||
ElasticsearchException e = ESTestCase.expectThrows(ElasticsearchException.class, analysisConfig::build);
|
||||
|
||||
assertEquals(Messages.getMessage(Messages.JOB_CONFIG_CATEGORIZATION_FILTERS_INCOMPATIBLE_WITH_CATEGORIZATION_ANALYZER),
|
||||
e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenFieldIsControlField() {
|
||||
AnalysisConfig.Builder analysisConfig = createValidConfig();
|
||||
if (randomBoolean()) {
|
||||
|
@ -821,9 +874,9 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
}
|
||||
|
||||
@Override
|
||||
protected AnalysisConfig mutateInstance(AnalysisConfig instance) throws IOException {
|
||||
protected AnalysisConfig mutateInstance(AnalysisConfig instance) {
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(instance);
|
||||
switch (between(0, 11)) {
|
||||
switch (between(0, 12)) {
|
||||
case 0:
|
||||
List<Detector> detectors = new ArrayList<>(instance.getDetectors());
|
||||
Detector.Builder detector = new Detector.Builder();
|
||||
|
@ -867,6 +920,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
}
|
||||
filters.add(randomAlphaOfLengthBetween(1, 20));
|
||||
builder.setCategorizationFilters(filters);
|
||||
builder.setCategorizationAnalyzerConfig(null);
|
||||
if (instance.getCategorizationFieldName() == null) {
|
||||
builder.setCategorizationFieldName(randomAlphaOfLengthBetween(1, 10));
|
||||
List<Detector> newDetectors = new ArrayList<>(instance.getDetectors());
|
||||
|
@ -879,36 +933,50 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
}
|
||||
break;
|
||||
case 5:
|
||||
builder.setSummaryCountFieldName(instance.getSummaryCountFieldName() + randomAlphaOfLengthBetween(1, 5));
|
||||
builder.setCategorizationFilters(null);
|
||||
builder.setCategorizationAnalyzerConfig(CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(
|
||||
Collections.singletonList(randomAlphaOfLengthBetween(1, 20))));
|
||||
if (instance.getCategorizationFieldName() == null) {
|
||||
builder.setCategorizationFieldName(randomAlphaOfLengthBetween(1, 10));
|
||||
List<Detector> newDetectors = new ArrayList<>(instance.getDetectors());
|
||||
Detector.Builder catDetector = new Detector.Builder();
|
||||
catDetector.setFunction("count");
|
||||
catDetector.setByFieldName("mlcategory");
|
||||
newDetectors.add(catDetector.build());
|
||||
builder.setDetectors(newDetectors);
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
builder.setSummaryCountFieldName(instance.getSummaryCountFieldName() + randomAlphaOfLengthBetween(1, 5));
|
||||
break;
|
||||
case 7:
|
||||
List<String> influencers = new ArrayList<>(instance.getInfluencers());
|
||||
influencers.add(randomAlphaOfLengthBetween(5, 10));
|
||||
builder.setInfluencers(influencers);
|
||||
builder.setUsePerPartitionNormalization(false);
|
||||
break;
|
||||
case 7:
|
||||
case 8:
|
||||
if (instance.getOverlappingBuckets() == null) {
|
||||
builder.setOverlappingBuckets(randomBoolean());
|
||||
} else {
|
||||
builder.setOverlappingBuckets(instance.getOverlappingBuckets() == false);
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
case 9:
|
||||
if (instance.getResultFinalizationWindow() == null) {
|
||||
builder.setResultFinalizationWindow(between(1, 100) * 1000L);
|
||||
} else {
|
||||
builder.setResultFinalizationWindow(instance.getResultFinalizationWindow() + (between(1, 100) * 1000));
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 10:
|
||||
if (instance.getMultivariateByFields() == null) {
|
||||
builder.setMultivariateByFields(randomBoolean());
|
||||
} else {
|
||||
builder.setMultivariateByFields(instance.getMultivariateByFields() == false);
|
||||
}
|
||||
break;
|
||||
case 10:
|
||||
case 11:
|
||||
List<TimeValue> multipleBucketSpans;
|
||||
if (instance.getMultipleBucketSpans() == null) {
|
||||
multipleBucketSpans = new ArrayList<>();
|
||||
|
@ -918,7 +986,7 @@ public class AnalysisConfigTests extends AbstractSerializingTestCase<AnalysisCon
|
|||
multipleBucketSpans.add(new TimeValue(between(2, 10) * instance.getBucketSpan().millis()));
|
||||
builder.setMultipleBucketSpans(multipleBucketSpans);
|
||||
break;
|
||||
case 11:
|
||||
case 12:
|
||||
boolean usePerPartitionNormalization = instance.getUsePerPartitionNormalization() == false;
|
||||
builder.setUsePerPartitionNormalization(usePerPartitionNormalization);
|
||||
if (usePerPartitionNormalization) {
|
||||
@ -0,0 +1,250 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.ml.job.config;
|
||||
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.test.AbstractSerializingTestCase;
|
||||
import org.elasticsearch.xpack.ml.MlParserType;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class CategorizationAnalyzerConfigTests extends AbstractSerializingTestCase<CategorizationAnalyzerConfig> {
|
||||
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private Environment environment;
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
|
||||
}
|
||||
|
||||
public void testVerify_GivenNoConfig() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenDefault() throws IOException {
|
||||
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
|
||||
builder.verify(analysisRegistry, environment);
|
||||
}
|
||||
|
||||
public void testVerify_GivenValidAnalyzer() throws IOException {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
|
||||
builder.verify(analysisRegistry, environment);
|
||||
}
|
||||
|
||||
public void testVerify_GivenInvalidAnalyzer() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenValidCustomConfig() throws IOException {
|
||||
Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
|
||||
ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
|
||||
ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
|
||||
Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
|
||||
ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
|
||||
ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(ignoreStuffInSqaureBrackets)
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
|
||||
.addTokenFilter("snowball");
|
||||
builder.verify(analysisRegistry, environment);
|
||||
}
|
||||
|
||||
public void testVerify_GivenCustomConfigWithInvalidCharFilter() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter("wrong!")
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter("snowball");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenCustomConfigWithMisconfiguredCharFilter() {
|
||||
Map<String, Object> noPattern = new HashMap<>();
|
||||
noPattern.put("type", "pattern_replace");
|
||||
noPattern.put("attern", "should have been pattern");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(noPattern)
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter("snowball");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenCustomConfigWithInvalidTokenizer() {
|
||||
Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
|
||||
ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
|
||||
ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(ignoreStuffInSqaureBrackets)
|
||||
.setTokenizer("oops!")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter("snowball");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenNoTokenizer() {
|
||||
Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
|
||||
ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
|
||||
ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
|
||||
Map<String, Object> ignoreStuffThatBeginsWithADigit = new HashMap<>();
|
||||
ignoreStuffThatBeginsWithADigit.put("type", "pattern_replace");
|
||||
ignoreStuffThatBeginsWithADigit.put("pattern", "^[0-9].*");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(ignoreStuffInSqaureBrackets)
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
|
||||
.addTokenFilter("snowball");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenCustomConfigWithInvalidTokenFilter() {
|
||||
Map<String, Object> ignoreStuffInSqaureBrackets = new HashMap<>();
|
||||
ignoreStuffInSqaureBrackets.put("type", "pattern_replace");
|
||||
ignoreStuffInSqaureBrackets.put("pattern", "\\[[^\\]]*\\]");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter(ignoreStuffInSqaureBrackets)
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter("oh dear!");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenCustomConfigWithMisconfiguredTokenFilter() {
|
||||
Map<String, Object> noPattern = new HashMap<>();
|
||||
noPattern.put("type", "pattern_replace");
|
||||
noPattern.put("attern", "should have been pattern");
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.addCharFilter("html_strip")
|
||||
.setTokenizer("classic")
|
||||
.addTokenFilter("lowercase")
|
||||
.addTokenFilter(noPattern);
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenAnalyzerAndCharFilter() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.setAnalyzer("standard")
|
||||
.addCharFilter("html_strip");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenAnalyzerAndTokenizer() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.setAnalyzer("standard")
|
||||
.setTokenizer("classic");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
|
||||
}
|
||||
|
||||
public void testVerify_GivenAnalyzerAndTokenFilter() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder()
|
||||
.setAnalyzer("standard")
|
||||
.addTokenFilter("lowercase");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> builder.verify(analysisRegistry, environment));
|
||||
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CategorizationAnalyzerConfig createTestInstance() {
|
||||
return createRandomized().build();
|
||||
}
|
||||
|
||||
public static CategorizationAnalyzerConfig.Builder createRandomized() {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
|
||||
if (rarely()) {
|
||||
builder.setAnalyzer(randomAlphaOfLength(10));
|
||||
} else {
|
||||
if (randomBoolean()) {
|
||||
for (String pattern : generateRandomStringArray(3, 40, false)) {
|
||||
if (rarely()) {
|
||||
builder.addCharFilter(randomAlphaOfLength(10));
|
||||
} else {
|
||||
Map<String, Object> charFilter = new HashMap<>();
|
||||
charFilter.put("type", "pattern_replace");
|
||||
charFilter.put("pattern", pattern);
|
||||
builder.addCharFilter(charFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rarely()) {
|
||||
builder.setTokenizer(randomAlphaOfLength(10));
|
||||
} else {
|
||||
Map<String, Object> tokenizer = new HashMap<>();
|
||||
tokenizer.put("type", "pattern");
|
||||
tokenizer.put("pattern", randomAlphaOfLength(10));
|
||||
builder.setTokenizer(tokenizer);
|
||||
}
|
||||
|
||||
if (randomBoolean()) {
|
||||
for (String pattern : generateRandomStringArray(4, 40, false)) {
|
||||
if (rarely()) {
|
||||
builder.addTokenFilter(randomAlphaOfLength(10));
|
||||
} else {
|
||||
Map<String, Object> tokenFilter = new HashMap<>();
|
||||
tokenFilter.put("type", "pattern_replace");
|
||||
tokenFilter.put("pattern", pattern);
|
||||
builder.addTokenFilter(tokenFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Writeable.Reader<CategorizationAnalyzerConfig> instanceReader() {
|
||||
return CategorizationAnalyzerConfig::new;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CategorizationAnalyzerConfig doParseInstance(XContentParser parser) throws IOException {
|
||||
return CategorizationAnalyzerConfig.buildFromXContentObject(parser, MlParserType.CONFIG);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CategorizationAnalyzerConfig mutateInstance(CategorizationAnalyzerConfig instance) {
|
||||
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(instance);
|
||||
|
||||
if (instance.getAnalyzer() != null) {
|
||||
builder.setAnalyzer(instance.getAnalyzer() + "mutated");
|
||||
} else {
|
||||
if (randomBoolean()) {
|
||||
builder.addCharFilter(randomAlphaOfLengthBetween(1, 20));
|
||||
} else {
|
||||
builder.addTokenFilter(randomAlphaOfLengthBetween(1, 20));
|
||||
}
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
}
@ -102,8 +102,8 @@ public class JobTests extends AbstractSerializingTestCase<Job> {
|
|||
assertNull(job.getBackgroundPersistInterval());
|
||||
assertThat(job.getModelSnapshotRetentionDays(), equalTo(1L));
|
||||
assertNull(job.getResultsRetentionDays());
|
||||
assertNotNull(job.allFields());
|
||||
assertFalse(job.allFields().isEmpty());
|
||||
assertNotNull(job.allInputFields());
|
||||
assertFalse(job.allInputFields().isEmpty());
|
||||
}
|
||||
|
||||
public void testNoId() {
@ -7,11 +7,16 @@ package org.elasticsearch.xpack.ml.job.process.autodetect;
|
|||
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.ml.calendars.ScheduledEvent;
|
||||
import org.elasticsearch.xpack.ml.calendars.ScheduledEventTests;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
import org.elasticsearch.xpack.ml.job.config.DetectionRule;
|
||||
|
@ -61,10 +66,15 @@ import static org.mockito.Mockito.when;
|
|||
|
||||
public class AutodetectCommunicatorTests extends ESTestCase {
|
||||
|
||||
private Environment environment;
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private StateStreamer stateStreamer;
|
||||
|
||||
@Before
|
||||
public void initMocks() {
|
||||
public void setup() throws Exception {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
|
||||
stateStreamer = mock(StateStreamer.class);
|
||||
}
|
||||
|
||||
|
@ -72,7 +82,7 @@ public class AutodetectCommunicatorTests extends ESTestCase {
|
|||
DataLoadParams params = new DataLoadParams(TimeRange.builder().startTime("1").endTime("2").build(), Optional.empty());
|
||||
AutodetectProcess process = mockAutodetectProcessWithOutputStream();
|
||||
try (AutodetectCommunicator communicator = createAutodetectCommunicator(process, mock(AutoDetectResultProcessor.class))) {
|
||||
communicator.writeToJob(new ByteArrayInputStream(new byte[0]),
|
||||
communicator.writeToJob(new ByteArrayInputStream(new byte[0]), analysisRegistry,
|
||||
randomFrom(XContentType.values()), params, (dataCounts, e) -> {});
|
||||
verify(process).writeResetBucketsControlMessage(params);
|
||||
}
|
||||
|
@ -250,8 +260,8 @@ public class AutodetectCommunicatorTests extends ESTestCase {
|
|||
((ActionListener<Boolean>) invocation.getArguments()[0]).onResponse(true);
|
||||
return null;
|
||||
}).when(dataCountsReporter).finishReporting(any());
|
||||
return new AutodetectCommunicator(createJobDetails(), autodetectProcess, stateStreamer,
|
||||
dataCountsReporter, autoDetectResultProcessor, finishHandler,
|
||||
return new AutodetectCommunicator(createJobDetails(), environment, autodetectProcess,
|
||||
stateStreamer, dataCountsReporter, autoDetectResultProcessor, finishHandler,
|
||||
new NamedXContentRegistry(Collections.emptyList()), executorService);
|
||||
}
|
|
|
@ -15,11 +15,15 @@ import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
|
|||
import org.elasticsearch.common.util.concurrent.ThreadContext;
|
||||
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.junit.annotations.TestLogging;
|
||||
import org.elasticsearch.threadpool.ThreadPool;
|
||||
import org.elasticsearch.xpack.ml.action.TransportOpenJobAction.JobTask;
|
||||
import org.elasticsearch.xpack.ml.job.JobManager;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
import org.elasticsearch.xpack.ml.job.config.DetectionRule;
|
||||
|
@ -86,12 +90,15 @@ import static org.mockito.Mockito.mock;
|
|||
import static org.mockito.Mockito.spy;
|
||||
|
||||
/**
|
||||
* Calling the * {@link AutodetectProcessManager#processData(JobTask, InputStream, XContentType, DataLoadParams, BiConsumer)}
|
||||
* Calling the
|
||||
* {@link AutodetectProcessManager#processData(JobTask, AnalysisRegistry, InputStream, XContentType, DataLoadParams, BiConsumer)}
|
||||
* method causes an AutodetectCommunicator to be created on demand. Most of
|
||||
* these tests have to do that before they can assert other things
|
||||
*/
|
||||
public class AutodetectProcessManagerTests extends ESTestCase {
|
||||
|
||||
private Environment environment;
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private JobManager jobManager;
|
||||
private JobProvider jobProvider;
|
||||
private JobResultsPersister jobResultsPersister;
|
||||
|
@@ -106,7 +113,10 @@ public class AutodetectProcessManagerTests extends ESTestCase {
private Set<MlFilter> filters = new HashSet<>();

@Before
public void initMocks() {
public void setup() throws Exception {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
environment = TestEnvironment.newEnvironment(settings);
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
jobManager = mock(JobManager.class);
jobProvider = mock(JobProvider.class);
jobResultsPersister = mock(JobResultsPersister.class);

@@ -214,8 +224,8 @@ public class AutodetectProcessManagerTests extends ESTestCase {
(j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
Settings.Builder settings = Settings.builder();
settings.put(AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE.getKey(), 3);
AutodetectProcessManager manager = spy(new AutodetectProcessManager(settings.build(), client, threadPool, jobManager, jobProvider,
jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
AutodetectProcessManager manager = spy(new AutodetectProcessManager(environment, settings.build(), client, threadPool,
jobManager, jobProvider, jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
normalizerFactory, new NamedXContentRegistry(Collections.emptyList()), auditor));
doReturn(executorService).when(manager).createAutodetectExecutorService(any());
@@ -265,12 +275,12 @@ public class AutodetectProcessManagerTests extends ESTestCase {
when(jobTask.getJobId()).thenReturn("foo");
DataLoadParams params = new DataLoadParams(TimeRange.builder().build(), Optional.empty());
manager.openJob(jobTask, e -> {});
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
params, (dataCounts1, e) -> {});
assertEquals(1, manager.numberOfOpenJobs());
}

public void testProcessDataThrowsElasticsearchStatusException_onIoException() throws Exception {
public void testProcessDataThrowsElasticsearchStatusException_onIoException() {
AutodetectCommunicator communicator = Mockito.mock(AutodetectCommunicator.class);
AutodetectProcessManager manager = createManager(communicator);

@@ -279,17 +289,17 @@ public class AutodetectProcessManagerTests extends ESTestCase {
XContentType xContentType = randomFrom(XContentType.values());
doAnswer(invocationOnMock -> {
@SuppressWarnings("unchecked")
BiConsumer<DataCounts, Exception> handler = (BiConsumer<DataCounts, Exception>) invocationOnMock.getArguments()[3];
BiConsumer<DataCounts, Exception> handler = (BiConsumer<DataCounts, Exception>) invocationOnMock.getArguments()[4];
handler.accept(null, new IOException("blah"));
return null;
}).when(communicator).writeToJob(eq(inputStream), same(xContentType), eq(params), any());
}).when(communicator).writeToJob(eq(inputStream), same(analysisRegistry), same(xContentType), eq(params), any());

JobTask jobTask = mock(JobTask.class);
when(jobTask.getJobId()).thenReturn("foo");
manager.openJob(jobTask, e -> {});
Exception[] holder = new Exception[1];
manager.processData(jobTask, inputStream, xContentType, params, (dataCounts1, e) -> holder[0] = e);
manager.processData(jobTask, analysisRegistry, inputStream, xContentType, params, (dataCounts1, e) -> holder[0] = e);
assertNotNull(holder[0]);
}
@ -301,7 +311,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
// job is created
|
||||
|
@ -329,7 +339,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
assertEquals(1, manager.numberOfOpenJobs());
|
||||
|
@ -372,7 +382,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
// Close the job in a separate thread so that it can simulate taking a long time to close
|
||||
|
@ -391,7 +401,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
assertFalse(closeThread.isAlive());
|
||||
}
|
||||
|
||||
public void testBucketResetMessageIsSent() throws IOException {
|
||||
public void testBucketResetMessageIsSent() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManager(communicator);
|
||||
XContentType xContentType = randomFrom(XContentType.values());
|
||||
|
@ -401,11 +411,11 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, inputStream, xContentType, params, (dataCounts1, e) -> {});
|
||||
verify(communicator).writeToJob(same(inputStream), same(xContentType), same(params), any());
|
||||
manager.processData(jobTask, analysisRegistry, inputStream, xContentType, params, (dataCounts1, e) -> {});
|
||||
verify(communicator).writeToJob(same(inputStream), same(analysisRegistry), same(xContentType), same(params), any());
|
||||
}
|
||||
|
||||
public void testFlush() throws IOException {
|
||||
public void testFlush() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManager(communicator);
|
||||
|
||||
|
@ -413,7 +423,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
InputStream inputStream = createInputStream("");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, inputStream, randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, inputStream, randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
FlushJobParams params = FlushJobParams.builder().build();
|
||||
|
@ -422,7 +432,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verify(communicator).flushJob(same(params), any());
|
||||
}
|
||||
|
||||
public void testFlushThrows() throws IOException {
|
||||
public void testFlushThrows() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");
|
||||
|
||||
|
@ -441,7 +451,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
assertEquals("[foo] exception while flushing job", holder[0].getMessage());
|
||||
}
|
||||
|
||||
public void testCloseThrows() throws IOException {
|
||||
public void testCloseThrows() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManager(communicator);
|
||||
|
||||
|
@ -453,7 +463,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn("foo");
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()), mock(DataLoadParams.class),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()), mock(DataLoadParams.class),
|
||||
(dataCounts1, e) -> {
|
||||
});
|
||||
verify(manager).setJobState(any(), eq(JobState.OPENED));
|
||||
|
@ -465,7 +475,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verify(manager).setJobState(any(), eq(JobState.FAILED));
|
||||
}
|
||||
|
||||
public void testwriteUpdateProcessMessage() throws IOException {
|
||||
public void testwriteUpdateProcessMessage() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");
|
||||
ModelPlotConfig modelConfig = mock(ModelPlotConfig.class);
|
||||
|
@ -478,7 +488,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verify(communicator).writeUpdateProcessMessage(same(updateParams), eq(Collections.emptyList()), any());
|
||||
}
|
||||
|
||||
public void testJobHasActiveAutodetectProcess() throws IOException {
|
||||
public void testJobHasActiveAutodetectProcess() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
AutodetectProcessManager manager = createManager(communicator);
|
||||
JobTask jobTask = mock(JobTask.class);
|
||||
|
@ -486,7 +496,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
assertFalse(manager.jobHasActiveAutodetectProcess(jobTask));
|
||||
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
assertTrue(manager.jobHasActiveAutodetectProcess(jobTask));
|
||||
|
@ -504,7 +514,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
assertFalse(manager.jobHasActiveAutodetectProcess(jobTask));
|
||||
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts1, e) -> {});
|
||||
|
||||
assertTrue(manager.jobHasActiveAutodetectProcess(jobTask));
|
||||
|
@ -514,14 +524,14 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verify(communicator).killProcess(false, false);
|
||||
}
|
||||
|
||||
public void testProcessData_GivenStateNotOpened() throws IOException {
|
||||
public void testProcessData_GivenStateNotOpened() {
|
||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||
doAnswer(invocationOnMock -> {
|
||||
@SuppressWarnings("unchecked")
|
||||
BiConsumer<DataCounts, Exception> handler = (BiConsumer<DataCounts, Exception>) invocationOnMock.getArguments()[3];
|
||||
BiConsumer<DataCounts, Exception> handler = (BiConsumer<DataCounts, Exception>) invocationOnMock.getArguments()[4];
|
||||
handler.accept(new DataCounts("foo"), null);
|
||||
return null;
|
||||
}).when(communicator).writeToJob(any(), any(), any(), any());
|
||||
}).when(communicator).writeToJob(any(), any(), any(), any(), any());
|
||||
AutodetectProcessManager manager = createManager(communicator);
|
||||
|
||||
JobTask jobTask = mock(JobTask.class);
|
||||
|
@ -529,7 +539,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
manager.openJob(jobTask, e -> {});
|
||||
InputStream inputStream = createInputStream("");
|
||||
DataCounts[] dataCounts = new DataCounts[1];
|
||||
manager.processData(jobTask, inputStream,
|
||||
manager.processData(jobTask, analysisRegistry, inputStream,
|
||||
randomFrom(XContentType.values()), mock(DataLoadParams.class), (dataCounts1, e) -> dataCounts[0] = dataCounts1);
|
||||
|
||||
assertThat(dataCounts[0], equalTo(new DataCounts("foo")));
|
||||
|
@ -547,8 +557,8 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
AutodetectProcess autodetectProcess = mock(AutodetectProcess.class);
|
||||
AutodetectProcessFactory autodetectProcessFactory =
|
||||
(j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
|
||||
AutodetectProcessManager manager = new AutodetectProcessManager(Settings.EMPTY, client, threadPool, jobManager, jobProvider,
|
||||
jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
|
||||
AutodetectProcessManager manager = new AutodetectProcessManager(environment, Settings.EMPTY,
|
||||
client, threadPool, jobManager, jobProvider, jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
|
||||
normalizerFactory, new NamedXContentRegistry(Collections.emptyList()), auditor);
|
||||
|
||||
JobTask jobTask = mock(JobTask.class);
|
||||
|
@ -558,7 +568,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verify(autodetectProcess, times(1)).close();
|
||||
}
|
||||
|
||||
public void testCreate_givenFirstTime() throws IOException {
|
||||
public void testCreate_givenFirstTime() {
|
||||
modelSnapshot = null;
|
||||
AutodetectProcessManager manager = createNonSpyManager("foo");
|
||||
|
||||
|
@ -571,7 +581,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verifyNoMoreInteractions(auditor);
|
||||
}
|
||||
|
||||
public void testCreate_givenExistingModelSnapshot() throws IOException {
|
||||
public void testCreate_givenExistingModelSnapshot() {
|
||||
modelSnapshot = new ModelSnapshot.Builder("foo").setSnapshotId("snapshot-1")
|
||||
.setLatestRecordTimeStamp(new Date(0L)).build();
|
||||
dataCounts = new DataCounts("foo");
|
||||
|
@ -589,7 +599,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
verifyNoMoreInteractions(auditor);
|
||||
}
|
||||
|
||||
public void testCreate_givenNonZeroCountsAndNoModelSnapshotNorQuantiles() throws IOException {
|
||||
public void testCreate_givenNonZeroCountsAndNoModelSnapshotNorQuantiles() {
|
||||
modelSnapshot = null;
|
||||
quantiles = null;
|
||||
dataCounts = new DataCounts("foo");
|
||||
|
@ -620,8 +630,8 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
AutodetectProcess autodetectProcess = mock(AutodetectProcess.class);
|
||||
AutodetectProcessFactory autodetectProcessFactory =
|
||||
(j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
|
||||
return new AutodetectProcessManager(Settings.EMPTY, client, threadPool, jobManager, jobProvider,
|
||||
jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
|
||||
return new AutodetectProcessManager(environment, Settings.EMPTY, client, threadPool, jobManager,
|
||||
jobProvider, jobResultsPersister, jobDataCountsPersister, autodetectProcessFactory,
|
||||
normalizerFactory, new NamedXContentRegistry(Collections.emptyList()), auditor);
|
||||
}
|
||||
|
||||
|
@ -645,8 +655,8 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
when(threadPool.getThreadContext()).thenReturn(new ThreadContext(Settings.EMPTY));
|
||||
when(threadPool.executor(anyString())).thenReturn(EsExecutors.newDirectExecutorService());
|
||||
AutodetectProcessFactory autodetectProcessFactory = mock(AutodetectProcessFactory.class);
|
||||
AutodetectProcessManager manager = new AutodetectProcessManager(Settings.EMPTY, client,
|
||||
threadPool, jobManager, jobProvider, jobResultsPersister, jobDataCountsPersister,
|
||||
AutodetectProcessManager manager = new AutodetectProcessManager(environment, Settings.EMPTY,
|
||||
client, threadPool, jobManager, jobProvider, jobResultsPersister, jobDataCountsPersister,
|
||||
autodetectProcessFactory, normalizerFactory,
|
||||
new NamedXContentRegistry(Collections.emptyList()), auditor);
|
||||
manager = spy(manager);
|
||||
|
@ -659,7 +669,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
|||
JobTask jobTask = mock(JobTask.class);
|
||||
when(jobTask.getJobId()).thenReturn(jobId);
|
||||
manager.openJob(jobTask, e -> {});
|
||||
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()),
|
||||
manager.processData(jobTask, analysisRegistry, createInputStream(""), randomFrom(XContentType.values()),
|
||||
mock(DataLoadParams.class), (dataCounts, e) -> {});
|
||||
return manager;
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ import static org.mockito.Mockito.when;
|
|||
|
||||
public class NativeAutodetectProcessTests extends ESTestCase {
|
||||
|
||||
private static final int NUMBER_ANALYSIS_FIELDS = 3;
|
||||
private static final int NUMBER_FIELDS = 5;
|
||||
|
||||
private ExecutorService executorService;
|
||||
|
||||
|
@ -54,7 +54,7 @@ public class NativeAutodetectProcessTests extends ESTestCase {
|
|||
when(logStream.read(new byte[1024])).thenReturn(-1);
|
||||
try (NativeAutodetectProcess process = new NativeAutodetectProcess("foo", logStream,
|
||||
mock(OutputStream.class), mock(InputStream.class), mock(OutputStream.class),
|
||||
NUMBER_ANALYSIS_FIELDS, null,
|
||||
NUMBER_FIELDS, null,
|
||||
new AutodetectResultsParser(Settings.EMPTY), mock(Runnable.class))) {
|
||||
process.start(executorService, mock(StateProcessor.class), mock(InputStream.class));
|
||||
|
||||
|
@ -74,7 +74,7 @@ public class NativeAutodetectProcessTests extends ESTestCase {
|
|||
String[] record = {"r1", "r2", "r3", "r4", "r5"};
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
try (NativeAutodetectProcess process = new NativeAutodetectProcess("foo", logStream,
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_ANALYSIS_FIELDS, Collections.emptyList(),
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_FIELDS, Collections.emptyList(),
|
||||
new AutodetectResultsParser(Settings.EMPTY), mock(Runnable.class))) {
|
||||
process.start(executorService, mock(StateProcessor.class), mock(InputStream.class));
|
||||
|
||||
|
@ -106,7 +106,7 @@ public class NativeAutodetectProcessTests extends ESTestCase {
|
|||
when(logStream.read(new byte[1024])).thenReturn(-1);
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(ControlMsgToProcessWriter.FLUSH_SPACES_LENGTH + 1024);
|
||||
try (NativeAutodetectProcess process = new NativeAutodetectProcess("foo", logStream,
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_ANALYSIS_FIELDS, Collections.emptyList(),
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_FIELDS, Collections.emptyList(),
|
||||
new AutodetectResultsParser(Settings.EMPTY), mock(Runnable.class))) {
|
||||
process.start(executorService, mock(StateProcessor.class), mock(InputStream.class));
|
||||
|
||||
|
@ -123,7 +123,7 @@ public class NativeAutodetectProcessTests extends ESTestCase {
|
|||
when(logStream.read(new byte[1024])).thenReturn(-1);
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
try (NativeAutodetectProcess process = new NativeAutodetectProcess("foo", logStream,
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_ANALYSIS_FIELDS, Collections.emptyList(),
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_FIELDS, Collections.emptyList(),
|
||||
new AutodetectResultsParser(Settings.EMPTY), mock(Runnable.class))) {
|
||||
process.start(executorService, mock(StateProcessor.class), mock(InputStream.class));
|
||||
|
||||
|
@ -141,7 +141,7 @@ public class NativeAutodetectProcessTests extends ESTestCase {
|
|||
when(logStream.read(new byte[1024])).thenReturn(-1);
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
try (NativeAutodetectProcess process = new NativeAutodetectProcess("foo", logStream,
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_ANALYSIS_FIELDS, Collections.emptyList(),
|
||||
bos, mock(InputStream.class), mock(OutputStream.class), NUMBER_FIELDS, Collections.emptyList(),
|
||||
new AutodetectResultsParser(Settings.EMPTY), mock(Runnable.class))) {
|
||||
process.start(executorService, mock(StateProcessor.class), mock(InputStream.class));
|
||||
|
||||
|
|
|
@@ -5,8 +5,15 @@
*/
package org.elasticsearch.xpack.ml.job.process.autodetect.writer;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.ml.job.config.DataDescription;
import org.elasticsearch.xpack.ml.job.config.Detector;
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;

@@ -16,25 +23,31 @@ import org.junit.Before;
import org.mockito.Mockito;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Testing methods of AbstractDataToProcessWriter but uses the concrete instances.
 */
public class AbstractDataToProcessWriterTests extends ESTestCase {

private AnalysisRegistry analysisRegistry;
private Environment environment;
private AutodetectProcess autodetectProcess;
private DataCountsReporter dataCountsReporter;

@Before
public void setUpMocks() {
public void setup() throws Exception {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
environment = TestEnvironment.newEnvironment(settings);
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
autodetectProcess = Mockito.mock(AutodetectProcess.class);
dataCountsReporter = Mockito.mock(DataCountsReporter.class);
}

/**
 * Testing methods of AbstractDataToProcessWriter but uses the concrete instances.
 */
public void testInputFields() throws IOException {
DataDescription.Builder dd = new DataDescription.Builder();
dd.setTimeField("time_field");
@@ -43,10 +56,11 @@ public class AbstractDataToProcessWriterTests extends ESTestCase {
detector.setByFieldName("metric");
detector.setPartitionFieldName("host");
detector.setDetectorDescription("metric(value) by metric partitionfield=host");
AnalysisConfig ac = new AnalysisConfig.Builder(Arrays.asList(detector.build())).build();
AnalysisConfig ac = new AnalysisConfig.Builder(Collections.singletonList(detector.build())).build();

boolean includeTokensFields = randomBoolean();
AbstractDataToProcessWriter writer =
new CsvDataToProcessWriter(true, autodetectProcess, dd.build(), ac, dataCountsReporter);
new CsvDataToProcessWriter(true, includeTokensFields, autodetectProcess, dd.build(), ac, dataCountsReporter);

writer.writeHeader();
@@ -62,28 +76,60 @@ public class AbstractDataToProcessWriterTests extends ESTestCase {

Map<String, Integer> inputIndexes = writer.getInputFieldIndexes();
assertEquals(4, inputIndexes.size());
assertEquals(new Integer(0), inputIndexes.get("time_field"));
assertEquals(new Integer(1), inputIndexes.get("metric"));
assertEquals(new Integer(2), inputIndexes.get("host"));
assertEquals(new Integer(3), inputIndexes.get("value"));
assertEquals(Integer.valueOf(0), inputIndexes.get("time_field"));
assertEquals(Integer.valueOf(1), inputIndexes.get("metric"));
assertEquals(Integer.valueOf(2), inputIndexes.get("host"));
assertEquals(Integer.valueOf(3), inputIndexes.get("value"));

Map<String, Integer> outputIndexes = writer.getOutputFieldIndexes();
assertEquals(5, outputIndexes.size());
assertEquals(new Integer(0), outputIndexes.get("time_field"));
assertEquals(new Integer(1), outputIndexes.get("host"));
assertEquals(new Integer(2), outputIndexes.get("metric"));
assertEquals(new Integer(3), outputIndexes.get("value"));
assertEquals(new Integer(4), outputIndexes.get(LengthEncodedWriter.CONTROL_FIELD_NAME));
Map<String, Integer> outputIndexes = writer.outputFieldIndexes();
assertEquals(includeTokensFields ? 6 : 5, outputIndexes.size());
assertEquals(Integer.valueOf(0), outputIndexes.get("time_field"));
assertEquals(Integer.valueOf(1), outputIndexes.get("host"));
assertEquals(Integer.valueOf(2), outputIndexes.get("metric"));
assertEquals(Integer.valueOf(3), outputIndexes.get("value"));
if (includeTokensFields) {
assertEquals(Integer.valueOf(4), outputIndexes.get(LengthEncodedWriter.PRETOKENISED_TOKEN_FIELD));
assertEquals(Integer.valueOf(5), outputIndexes.get(LengthEncodedWriter.CONTROL_FIELD_NAME));
} else {
assertEquals(Integer.valueOf(4), outputIndexes.get(LengthEncodedWriter.CONTROL_FIELD_NAME));
}

List<InputOutputMap> inOutMaps = writer.getInputOutputMap();
assertEquals(4, inOutMaps.size());
assertEquals(inOutMaps.get(0).inputIndex, 0);
assertEquals(inOutMaps.get(0).outputIndex, 0);
assertEquals(inOutMaps.get(1).inputIndex, 2);
assertEquals(inOutMaps.get(1).outputIndex, 1);
assertEquals(inOutMaps.get(2).inputIndex, 1);
assertEquals(inOutMaps.get(2).outputIndex, 2);
assertEquals(inOutMaps.get(3).inputIndex, 3);
assertEquals(inOutMaps.get(3).outputIndex, 3);
assertEquals(0, inOutMaps.get(0).inputIndex);
assertEquals(0, inOutMaps.get(0).outputIndex);
assertEquals(2, inOutMaps.get(1).inputIndex);
assertEquals(1, inOutMaps.get(1).outputIndex);
assertEquals(1, inOutMaps.get(2).inputIndex);
assertEquals(2, inOutMaps.get(2).outputIndex);
assertEquals(3, inOutMaps.get(3).inputIndex);
assertEquals(3, inOutMaps.get(3).outputIndex);
}

public void testTokenizeForCategorization() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {

assertEquals("sol13m-8608.1.p2ps,Info,Source,AES_SERVICE2,on,has,shut,down",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "p2ps",
"<sol13m-8608.1.p2ps: Info: > Source AES_SERVICE2 on 33122:967 has shut down."));

assertEquals("Vpxa,verbose,VpxaHalCnxHostagent,opID,WFU-ddeadb59,WaitForUpdatesDone,Received,callback",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "vmware",
"Vpxa: [49EC0B90 verbose 'VpxaHalCnxHostagent' opID=WFU-ddeadb59] [WaitForUpdatesDone] Received callback"));

assertEquals("org.apache.coyote.http11.Http11BaseProtocol,destroy",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "apache",
"org.apache.coyote.http11.Http11BaseProtocol destroy"));

assertEquals("INFO,session,PROXY,Session,DESTROYED",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "proxy",
" [1111529792] INFO session <45409105041220090733@62.218.251.123> - " +
"----------------- PROXY Session DESTROYED --------------------"));

assertEquals("PSYoungGen,total,used",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "java",
"PSYoungGen total 2572800K, used 1759355K [0x0000000759500000, 0x0000000800000000, 0x0000000800000000)"));
}
}
}
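The assertions above pin down what the default categorization tokenization keeps: word-like tokens, including dotted and hyphenated identifiers, while fragments that do not start with a letter (bare numbers, hex dumps, punctuation runs) are dropped. A rough, self-contained approximation of that behaviour in plain Java, offered only as an illustration and not as the real Elasticsearch analyzer chain:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Simplified approximation: keep tokens that start with a letter and may continue with
// letters, digits, '_', '.' or '-'; everything else (pure numbers, punctuation) is dropped.
// The production tokenization is an Elasticsearch analyzer; this is only a sketch.
public final class CategorizationTokenSketch {

    private static final Pattern TOKEN = Pattern.compile("[A-Za-z][A-Za-z0-9_.-]*[A-Za-z0-9]|[A-Za-z]");

    public static String tokenize(String message) {
        List<String> tokens = new ArrayList<>();
        Matcher matcher = TOKEN.matcher(message);
        while (matcher.find()) {
            tokens.add(matcher.group());
        }
        return String.join(",", tokens);
    }

    public static void main(String[] args) {
        // Expected (matching the assertion above): org.apache.coyote.http11.Http11BaseProtocol,destroy
        System.out.println(tokenize("org.apache.coyote.http11.Http11BaseProtocol destroy"));
    }
}
```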
@ -40,7 +40,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenAdvanceTime() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder().advanceTime("1234567890").build();
|
||||
|
||||
writer.writeFlushControlMessage(flushJobParams);
|
||||
|
@ -53,7 +53,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenSkipTime() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder().skipTime("1234567890").build();
|
||||
|
||||
writer.writeFlushControlMessage(flushJobParams);
|
||||
|
@ -66,7 +66,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenSkipAndAdvanceTime() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder().skipTime("1000").advanceTime("2000").build();
|
||||
|
||||
writer.writeFlushControlMessage(flushJobParams);
|
||||
|
@ -77,7 +77,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenCalcInterimResultsWithNoTimeParams() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder()
|
||||
.calcInterim(true).build();
|
||||
|
||||
|
@ -91,7 +91,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenPlainFlush() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder().build();
|
||||
|
||||
writer.writeFlushControlMessage(flushJobParams);
|
||||
|
@ -100,7 +100,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenCalcInterimResultsWithTimeParams() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder()
|
||||
.calcInterim(true)
|
||||
.forTimeRange(TimeRange.builder().startTime("120").endTime("180").build())
|
||||
|
@ -116,7 +116,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushControlMessage_GivenCalcInterimAndAdvanceTime() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
FlushJobParams flushJobParams = FlushJobParams.builder()
|
||||
.calcInterim(true)
|
||||
.forTimeRange(TimeRange.builder().startTime("50").endTime("100").build())
|
||||
|
@ -136,7 +136,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteFlushMessage() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
long firstId = Long.parseLong(writer.writeFlushMessage());
|
||||
Mockito.reset(lengthEncodedWriter);
|
||||
|
||||
|
@ -159,7 +159,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteResetBucketsMessage() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
|
||||
writer.writeResetBucketsMessage(
|
||||
new DataLoadParams(TimeRange.builder().startTime("0").endTime("600").build(), Optional.empty()));
|
||||
|
@ -172,7 +172,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteUpdateModelPlotMessage() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
|
||||
writer.writeUpdateModelPlotMessage(new ModelPlotConfig(true, "foo,bar"));
|
||||
|
||||
|
@ -184,7 +184,7 @@ public class ControlMsgToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWriteUpdateDetectorRulesMessage() throws IOException {
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 2);
|
||||
ControlMsgToProcessWriter writer = new ControlMsgToProcessWriter(lengthEncodedWriter, 4);
|
||||
|
||||
DetectionRule rule1 = new DetectionRule.Builder(createRule("5")).setTargetFieldName("targetField1")
|
||||
.setTargetFieldValue("targetValue").setConditionsConnective(Connective.AND).build();
|
||||
|
|
|
@@ -6,9 +6,17 @@
package org.elasticsearch.xpack.ml.job.process.autodetect.writer;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.TestEnvironment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;
import org.elasticsearch.xpack.ml.job.config.DataDescription;
import org.elasticsearch.xpack.ml.job.config.DataDescription.DataFormat;
import org.elasticsearch.xpack.ml.job.config.Detector;

@@ -27,6 +35,7 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.List;

@@ -40,6 +49,9 @@ import static org.mockito.Mockito.when;

public class CsvDataToProcessWriterTests extends ESTestCase {

private AnalysisRegistry analysisRegistry;
private Environment environment;

private AutodetectProcess autodetectProcess;
private DataDescription.Builder dataDescription;
private AnalysisConfig analysisConfig;

@@ -48,7 +60,11 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
private List<String[]> writtenRecords;

@Before
public void setUpMocks() throws IOException {
public void setup() throws Exception {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
environment = TestEnvironment.newEnvironment(settings);
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);

autodetectProcess = Mockito.mock(AutodetectProcess.class);
dataCountsReporter = Mockito.mock(DataCountsReporter.class);

@@ -69,7 +85,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
dataDescription.setTimeFormat(DataDescription.EPOCH);

Detector detector = new Detector.Builder("metric", "value").build();
analysisConfig = new AnalysisConfig.Builder(Arrays.asList(detector)).build();
analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)).build();
}

public void testWrite_GivenTimeFormatIsEpochAndDataIsValid() throws IOException {

@@ -80,7 +96,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
InputStream inputStream = createInputStream(input.toString());
CsvDataToProcessWriter writer = createWriter();
writer.writeHeader();
writer.write(inputStream, null, (r, e) -> {});
writer.write(inputStream, null, null, (r, e) -> {});
verify(dataCountsReporter, times(1)).startNewIncrementalCount();

List<String[]> expectedRecords = new ArrayList<>();
@@ -93,6 +109,43 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenTimeFormatIsEpochAndCategorization() throws IOException {
Detector.Builder detector = new Detector.Builder("count", null);
detector.setByFieldName("mlcategory");
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
builder.setCategorizationFieldName("message");
builder.setCategorizationAnalyzerConfig(CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null));
analysisConfig = builder.build();

StringBuilder input = new StringBuilder();
input.append("time,message\n");
input.append("1,Node 1 started\n");
input.append("2,Node 2 started\n");
InputStream inputStream = createInputStream(input.toString());
CsvDataToProcessWriter writer = createWriter();
writer.writeHeader();
try (CategorizationAnalyzer categorizationAnalyzer =
new CategorizationAnalyzer(analysisRegistry, environment, analysisConfig.getCategorizationAnalyzerConfig())) {
writer.write(inputStream, categorizationAnalyzer, null, (r, e) -> {});
}
verify(dataCountsReporter, times(1)).startNewIncrementalCount();

List<String[]> expectedRecords = new ArrayList<>();
// The "." field is the control field; "..." is the pre-tokenized tokens field
if (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA) {
expectedRecords.add(new String[]{"time", "message", "...", "."});
expectedRecords.add(new String[]{"1", "Node 1 started", "Node,started", ""});
expectedRecords.add(new String[]{"2", "Node 2 started", "Node,started", ""});
} else {
expectedRecords.add(new String[]{"time", "message", "."});
expectedRecords.add(new String[]{"1", "Node 1 started", ""});
expectedRecords.add(new String[]{"2", "Node 2 started", ""});
}
assertWrittenRecordsEqualTo(expectedRecords);

verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenTimeFormatIsEpochAndTimestampsAreOutOfOrder() throws IOException {
StringBuilder input = new StringBuilder();
input.append("time,metric,value\n");
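The expected records in the categorization test above encode the writer's column layout when tokenization happens in Java: the original input columns, then the "..." pre-tokenized tokens column, then the empty "." control column. A minimal, self-contained sketch of assembling one such row, where tokenizeForCategorization below is only a whitespace-splitting stand-in for the analyzer-backed production method:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public final class PretokenizedRowSketch {

    // Stand-in for the analyzer-backed tokenization (the real code delegates to a
    // CategorizationAnalyzer); keep whitespace-delimited tokens that start with a letter.
    static String tokenizeForCategorization(String message) {
        List<String> tokens = new ArrayList<>();
        for (String candidate : message.split("\\s+")) {
            if (candidate.matches("[A-Za-z].*")) {
                tokens.add(candidate);
            }
        }
        return String.join(",", tokens);
    }

    public static void main(String[] args) {
        String time = "1";
        String message = "Node 1 started";
        // input columns, then the "..." tokens column, then the empty "." control column
        String[] row = {time, message, tokenizeForCategorization(message), ""};
        System.out.println(Arrays.toString(row));   // [1, Node 1 started, Node,started, ]
    }
}
```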
@ -102,7 +155,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
InputStream inputStream = createInputStream(input.toString());
|
||||
CsvDataToProcessWriter writer = createWriter();
|
||||
writer.writeHeader();
|
||||
writer.write(inputStream, null, (r, e) -> {});
|
||||
writer.write(inputStream, null, null, (r, e) -> {});
|
||||
verify(dataCountsReporter, times(1)).startNewIncrementalCount();
|
||||
|
||||
List<String[]> expectedRecords = new ArrayList<>();
|
||||
|
@ -126,7 +179,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
when(dataCountsReporter.getLatestRecordTime()).thenReturn(new Date(5000L));
|
||||
CsvDataToProcessWriter writer = createWriter();
|
||||
writer.writeHeader();
|
||||
writer.write(inputStream, null, (r, e) -> {});
|
||||
writer.write(inputStream, null, null, (r, e) -> {});
|
||||
verify(dataCountsReporter, times(1)).startNewIncrementalCount();
|
||||
|
||||
List<String[]> expectedRecords = new ArrayList<>();
|
||||
|
@ -141,7 +194,8 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOutOfOrder() throws IOException {
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
|
||||
AnalysisConfig.Builder builder =
|
||||
new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
|
||||
builder.setLatency(TimeValue.timeValueSeconds(2));
|
||||
analysisConfig = builder.build();
|
||||
|
||||
|
@ -156,7 +210,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
InputStream inputStream = createInputStream(input.toString());
|
||||
CsvDataToProcessWriter writer = createWriter();
|
||||
writer.writeHeader();
|
||||
writer.write(inputStream, null, (r, e) -> {});
|
||||
writer.write(inputStream, null, null, (r, e) -> {});
|
||||
verify(dataCountsReporter, times(1)).startNewIncrementalCount();
|
||||
|
||||
List<String[]> expectedRecords = new ArrayList<>();
|
||||
|
@ -174,7 +228,8 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWrite_NullByte() throws IOException {
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
|
||||
AnalysisConfig.Builder builder =
|
||||
new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
|
||||
builder.setLatency(TimeValue.ZERO);
|
||||
analysisConfig = builder.build();
|
||||
|
||||
|
@ -189,7 +244,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
InputStream inputStream = createInputStream(input.toString());
|
||||
CsvDataToProcessWriter writer = createWriter();
|
||||
writer.writeHeader();
|
||||
writer.write(inputStream, null, (r, e) -> {});
|
||||
writer.write(inputStream, null, null, (r, e) -> {});
|
||||
verify(dataCountsReporter, times(1)).startNewIncrementalCount();
|
||||
|
||||
List<String[]> expectedRecords = new ArrayList<>();
|
||||
|
@ -211,7 +266,8 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testWrite_EmptyInput() throws IOException {
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
|
||||
AnalysisConfig.Builder builder =
|
||||
new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
|
||||
builder.setLatency(TimeValue.ZERO);
|
||||
analysisConfig = builder.build();
|
||||
|
||||
|
@ -227,7 +283,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
CsvDataToProcessWriter writer = createWriter();
|
||||
writer.writeHeader();
|
||||
|
||||
writer.write(inputStream, null, (counts, e) -> {
|
||||
writer.write(inputStream, null, null, (counts, e) -> {
|
||||
if (e != null) {
|
||||
fail(e.getMessage());
|
||||
} else {
|
||||
|
@ -251,7 +307,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
|
|||
writer.writeHeader();
|
||||
|
||||
SuperCsvException e = ESTestCase.expectThrows(SuperCsvException.class,
|
||||
() -> writer.write(inputStream, null, (response, error) -> {}));
|
||||
() -> writer.write(inputStream, null, null, (response, error) -> {}));
|
||||
// Expected line numbers are 2 and 10001, but SuperCSV may print the
|
||||
// numbers using a different locale's digit characters
|
||||
assertTrue(e.getMessage(), e.getMessage().matches(
|
||||
|
@@ -263,7 +319,9 @@
}

private CsvDataToProcessWriter createWriter() {
return new CsvDataToProcessWriter(true, autodetectProcess, dataDescription.build(), analysisConfig,
boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA &&
analysisConfig.getCategorizationFieldName() != null;
return new CsvDataToProcessWriter(true, includeTokensField, autodetectProcess, dataDescription.build(), analysisConfig,
dataCountsReporter);
}

@@ -33,7 +33,7 @@ public class DataToProcessWriterFactoryTests extends ESTestCase {
}

private static DataToProcessWriter createWriter(DataDescription dataDescription) {
return DataToProcessWriterFactory.create(true, mock(AutodetectProcess.class),
return DataToProcessWriterFactory.create(true, false, mock(AutodetectProcess.class),
dataDescription, AnalysisConfigTests.createRandomized().build(),
mock(DataCountsReporter.class), new NamedXContentRegistry(Collections.emptyList()));
}
@ -9,6 +9,7 @@ import org.apache.logging.log4j.Logger;
|
|||
import org.elasticsearch.common.xcontent.ToXContent;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.ml.MachineLearning;
|
||||
import org.elasticsearch.xpack.ml.calendars.ScheduledEvent;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.Condition;
|
||||
|
@ -133,7 +134,7 @@ public class FieldConfigWriterTests extends ESTestCase {
|
|||
Detector.Builder d = new Detector.Builder("metric", "Integer_Value");
|
||||
d.setByFieldName("mlcategory");
|
||||
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d.build()));
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(d.build()));
|
||||
builder.setCategorizationFieldName("foo");
|
||||
analysisConfig = builder.build();
|
||||
writer = mock(OutputStreamWriter.class);
|
||||
|
@ -148,7 +149,7 @@ public class FieldConfigWriterTests extends ESTestCase {
|
|||
Detector.Builder d = new Detector.Builder("metric", "Integer_Value");
|
||||
d.setByFieldName("ts_hash");
|
||||
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d.build()));
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(d.build()));
|
||||
builder.setInfluencers(Arrays.asList("sun", "moon", "earth"));
|
||||
analysisConfig = builder.build();
|
||||
|
||||
|
@@ -167,8 +168,8 @@
Detector.Builder d = new Detector.Builder("metric", "Integer_Value");
d.setByFieldName("mlcategory");

AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d.build()));
builder.setInfluencers(Arrays.asList("sun"));
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(d.build()));
builder.setInfluencers(Collections.singletonList("sun"));
builder.setCategorizationFieldName("myCategory");
builder.setCategorizationFilters(Arrays.asList("foo", " ", "abc,def"));
analysisConfig = builder.build();

@@ -179,9 +180,10 @@

verify(writer).write(
"detector.0.clause = metric(Integer_Value) by mlcategory categorizationfield=myCategory\n" +
(MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA ? "" :
"categorizationfilter.0 = foo\n" +
"categorizationfilter.1 = \" \"\n" +
"categorizationfilter.2 = \"abc,def\"\n" +
"categorizationfilter.2 = \"abc,def\"\n") +
"influencer.0 = sun\n");
verifyNoMoreInteractions(writer);
}
@ -192,10 +194,10 @@ public class FieldConfigWriterTests extends ESTestCase {
|
|||
detector.setPartitionFieldName("instance");
|
||||
RuleCondition ruleCondition = RuleCondition.createNumerical
|
||||
(RuleConditionType.NUMERICAL_ACTUAL, "metricName", "metricValue", new Condition(Operator.LT, "5"));
|
||||
DetectionRule rule = new DetectionRule.Builder(Arrays.asList(ruleCondition)).setTargetFieldName("instance").build();
|
||||
detector.setRules(Arrays.asList(rule));
|
||||
DetectionRule rule = new DetectionRule.Builder(Collections.singletonList(ruleCondition)).setTargetFieldName("instance").build();
|
||||
detector.setRules(Collections.singletonList(rule));
|
||||
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(detector.build()));
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
|
||||
analysisConfig = builder.build();
|
||||
|
||||
writer = mock(OutputStreamWriter.class);
|
||||
|
@ -217,7 +219,7 @@ public class FieldConfigWriterTests extends ESTestCase {
|
|||
public void testWrite_GivenFilters() throws IOException {
|
||||
Detector d = new Detector.Builder("count", null).build();
|
||||
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d));
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(d));
|
||||
analysisConfig = builder.build();
|
||||
|
||||
filters.add(new MlFilter("filter_1", Arrays.asList("a", "b")));
|
||||
|
@ -235,7 +237,7 @@ public class FieldConfigWriterTests extends ESTestCase {
|
|||
public void testWrite_GivenScheduledEvents() throws IOException {
|
||||
Detector d = new Detector.Builder("count", null).build();
|
||||
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(d));
|
||||
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(d));
|
||||
analysisConfig = builder.build();
|
||||
|
||||
scheduledEvents.add(new ScheduledEvent.Builder().description("The Ashes")
|
||||
|
|
|
@ -8,13 +8,21 @@ package org.elasticsearch.xpack.ml.job.process.autodetect.writer;
|
|||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamOutput;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentGenerator;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.env.TestEnvironment;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.ml.MachineLearning;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
|
||||
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
|
||||
import org.elasticsearch.xpack.ml.job.config.AnalysisConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.CategorizationAnalyzerConfig;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription;
|
||||
import org.elasticsearch.xpack.ml.job.config.DataDescription.DataFormat;
|
||||
import org.elasticsearch.xpack.ml.job.config.Detector;
|
||||
|
@ -26,7 +34,6 @@ import org.mockito.invocation.InvocationOnMock;
|
|||
import org.mockito.stubbing.Answer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
|
@ -43,6 +50,9 @@ import static org.mockito.Mockito.verify;
|
|||
|
||||
public class JsonDataToProcessWriterTests extends ESTestCase {
|
||||
|
||||
private AnalysisRegistry analysisRegistry;
|
||||
private Environment environment;
|
||||
|
||||
private AutodetectProcess autodetectProcess;
|
||||
private DataCountsReporter dataCountsReporter;
|
||||
|
||||
|
@ -52,7 +62,11 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
|
|||
private List<String[]> writtenRecords;
|
||||
|
||||
@Before
|
||||
public void setUpMocks() throws IOException {
|
||||
public void setup() throws Exception {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
analysisRegistry = CategorizationAnalyzerTests.buildTestAnalysisRegistry(environment);
|
||||
|
||||
autodetectProcess = Mockito.mock(AutodetectProcess.class);
|
||||
dataCountsReporter = Mockito.mock(DataCountsReporter.class);
|
||||
|
||||
|
@ -67,13 +81,12 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
|
|||
}
|
||||
}).when(autodetectProcess).writeRecord(any(String[].class));
|
||||
|
||||
|
||||
dataDescription = new DataDescription.Builder();
|
||||
dataDescription.setFormat(DataFormat.XCONTENT);
|
||||
dataDescription.setTimeFormat(DataDescription.EPOCH);
|
||||
|
||||
Detector detector = new Detector.Builder("metric", "value").build();
|
||||
analysisConfig = new AnalysisConfig.Builder(Arrays.asList(detector)).build();
|
||||
analysisConfig = new AnalysisConfig.Builder(Collections.singletonList(detector)).build();
|
||||
}

public void testWrite_GivenTimeFormatIsEpochAndDataIsValid() throws Exception {

@ -83,7 +96,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -96,8 +109,43 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenTimeFormatIsEpochAndTimestampsAreOutOfOrder()
        throws Exception {
public void testWrite_GivenTimeFormatIsEpochAndCategorization() throws Exception {
    Detector.Builder detector = new Detector.Builder("count", null);
    detector.setByFieldName("mlcategory");
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
    builder.setCategorizationFieldName("message");
    builder.setCategorizationAnalyzerConfig(CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null));
    analysisConfig = builder.build();

    StringBuilder input = new StringBuilder();
    input.append("{\"time\":\"1\", \"message\":\"Node 1 started\"}");
    input.append("{\"time\":\"2\", \"message\":\"Node 2 started\"}");
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    try (CategorizationAnalyzer categorizationAnalyzer =
                 new CategorizationAnalyzer(analysisRegistry, environment, analysisConfig.getCategorizationAnalyzerConfig())) {
        writer.write(inputStream, categorizationAnalyzer, XContentType.JSON, (r, e) -> {});
    }
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();
    // The "." field is the control field; "..." is the pre-tokenized tokens field
    if (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA) {
        expectedRecords.add(new String[]{"time", "message", "...", "."});
        expectedRecords.add(new String[]{"1", "Node 1 started", "Node,started", ""});
        expectedRecords.add(new String[]{"2", "Node 2 started", "Node,started", ""});
    } else {
        expectedRecords.add(new String[]{"time", "message", "."});
        expectedRecords.add(new String[]{"1", "Node 1 started", ""});
        expectedRecords.add(new String[]{"2", "Node 2 started", ""});
    }
    assertWrittenRecordsEqualTo(expectedRecords);

    verify(dataCountsReporter).finishReporting(any());
}
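Note: the "Node,started" values expected in the "..." column come from tokenizing the message field and joining the tokens with commas. The sketch below is a rough stand-in for that step, assuming a plain split plus a letters-only filter rather than the real default categorization analyzer, so it only approximates the output above.

import java.util.Arrays;
import java.util.stream.Collectors;

public class TokenJoinSketch {
    // Tokenize the message, drop tokens without letters (so "1" and "2" disappear), join with commas
    static String joinTokens(String message) {
        return Arrays.stream(message.split("\\W+"))
                .filter(token -> token.chars().anyMatch(Character::isLetter))
                .collect(Collectors.joining(","));
    }

    public static void main(String[] args) {
        System.out.println(joinTokens("Node 1 started"));   // Node,started
        System.out.println(joinTokens("Node 2 started"));   // Node,started
    }
}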

public void testWrite_GivenTimeFormatIsEpochAndTimestampsAreOutOfOrder() throws Exception {
    StringBuilder input = new StringBuilder();
    input.append("{\"time\":\"3\", \"metric\":\"foo\", \"value\":\"3.0\"}");
    input.append("{\"time\":\"1\", \"metric\":\"bar\", \"value\":\"1.0\"}");

@ -105,7 +153,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -119,9 +167,9 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOutOfOrder()
        throws Exception {
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
public void testWrite_GivenTimeFormatIsEpochAndSomeTimestampsWithinLatencySomeOutOfOrder() throws Exception {
    AnalysisConfig.Builder builder =
            new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -134,7 +182,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});

    List<String[]> expectedRecords = new ArrayList<>();
    // The final field is the control field

@ -150,9 +198,9 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenMalformedJsonWithoutNestedLevels()
        throws Exception {
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
public void testWrite_GivenMalformedJsonWithoutNestedLevels() throws Exception {
    AnalysisConfig.Builder builder =
            new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -163,7 +211,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -181,7 +229,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
public void testWrite_GivenMalformedJsonWithNestedLevels()
        throws Exception {
    Detector detector = new Detector.Builder("metric", "nested.value").build();
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(detector));
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -192,7 +240,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -208,7 +256,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {

public void testWrite_GivenMalformedJsonThatNeverRecovers()
        throws Exception {
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("count", null).build()));
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("count", null).build()));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -220,12 +268,12 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    writer.writeHeader();

    ESTestCase.expectThrows(ElasticsearchParseException.class,
            () -> writer.write(inputStream, XContentType.JSON, (r, e) -> {}));
            () -> writer.write(inputStream, null, XContentType.JSON, (r, e) -> {}));
}

public void testWrite_GivenJsonWithArrayField()
        throws Exception {
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
public void testWrite_GivenJsonWithArrayField() throws Exception {
    AnalysisConfig.Builder builder =
            new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -235,7 +283,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -248,9 +296,9 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    verify(dataCountsReporter).finishReporting(any());
}

public void testWrite_GivenJsonWithMissingFields()
        throws Exception {
    AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Arrays.asList(new Detector.Builder("metric", "value").build()));
public void testWrite_GivenJsonWithMissingFields() throws Exception {
    AnalysisConfig.Builder builder =
            new AnalysisConfig.Builder(Collections.singletonList(new Detector.Builder("metric", "value").build()));
    builder.setLatency(TimeValue.timeValueSeconds(2));
    analysisConfig = builder.build();

@ -264,7 +312,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = createInputStream(input.toString());
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.JSON, (r, e) -> {});
    writer.write(inputStream, null, XContentType.JSON, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -309,7 +357,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
    InputStream inputStream = new ByteArrayInputStream(BytesReference.toBytes(xsonOs.bytes()));
    JsonDataToProcessWriter writer = createWriter();
    writer.writeHeader();
    writer.write(inputStream, XContentType.SMILE, (r, e) -> {});
    writer.write(inputStream, null, XContentType.SMILE, (r, e) -> {});
    verify(dataCountsReporter, times(1)).startNewIncrementalCount();

    List<String[]> expectedRecords = new ArrayList<>();

@ -327,7 +375,9 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
}

private JsonDataToProcessWriter createWriter() {
    return new JsonDataToProcessWriter(true, autodetectProcess, dataDescription.build(), analysisConfig,
    boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA &&
            analysisConfig.getCategorizationFieldName() != null;
    return new JsonDataToProcessWriter(true, includeTokensField, autodetectProcess, dataDescription.build(), analysisConfig,
            dataCountsReporter, new NamedXContentRegistry(Collections.emptyList()));
}
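Note: includeTokensField decides whether the extra "..." column is written. Below is a hypothetical sketch of assembling a header row, with the field order and the "..." and "." names taken from the expected records in the tests above; it is an illustration, not the writer's actual implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class HeaderSketch {
    static String[] buildHeader(String timeField, String categorizationField, boolean includeTokensField) {
        List<String> fields = new ArrayList<>(Arrays.asList(timeField, categorizationField));
        if (includeTokensField) {
            fields.add("...");   // pre-tokenized tokens column
        }
        fields.add(".");         // control field always comes last
        return fields.toArray(new String[0]);
    }

    public static void main(String[] args) {
        System.out.println(Arrays.toString(buildHeader("time", "message", true)));   // [time, message, ..., .]
        System.out.println(Arrays.toString(buildHeader("time", "message", false)));  // [time, message, .]
    }
}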
|
||||
|
||||
|
|
|
@ -1050,7 +1050,7 @@
|
|||
|
||||
- do:
|
||||
warnings:
|
||||
- Creating jobs with delimited data format is deprecated. Please use JSON instead.
|
||||
- Creating jobs with delimited data format is deprecated. Please use xcontent instead.
|
||||
xpack.ml.put_job:
|
||||
job_id: delimited-format-job
|
||||
body: >
|
||||
|
@ -1067,3 +1067,71 @@
|
|||
}
|
||||
}
|
||||
- match: { job_id: "delimited-format-job" }
|
||||
|
||||
---
|
||||
"Test job with named categorization_analyzer":
|
||||
- do:
|
||||
xpack.ml.put_job:
|
||||
job_id: jobs-crud-categorization-analyzer-job
|
||||
body: >
|
||||
{
|
||||
"analysis_config" : {
|
||||
"detectors" :[{"function":"mean","field_name":"responsetime","by_field_name":"airline"},
|
||||
{"function":"count","by_field_name":"mlcategory"}],
|
||||
"categorization_field_name": "some_category",
|
||||
"categorization_analyzer" : "standard"
|
||||
},
|
||||
"data_description" : {
|
||||
}
|
||||
}
|
||||
- match: { job_id: "jobs-crud-categorization-analyzer-job" }
|
||||
- match: { analysis_config.categorization_analyzer: "standard" }
|
||||
|
||||
---
|
||||
"Test job with custom categorization_analyzer":
|
||||
- do:
|
||||
xpack.ml.put_job:
|
||||
job_id: jobs-crud-categorization-analyzer-job
|
||||
body: >
|
||||
{
|
||||
"analysis_config" : {
|
||||
"detectors" :[{"function":"mean","field_name":"responsetime","by_field_name":"airline"},
|
||||
{"function":"count","by_field_name":"mlcategory"}],
|
||||
"categorization_field_name": "some_category",
|
||||
"categorization_analyzer" : {
|
||||
"char_filter" : ["html_strip"],
|
||||
"tokenizer" : "classic",
|
||||
"filter" : ["stop"]
|
||||
}
|
||||
},
|
||||
"data_description" : {
|
||||
}
|
||||
}
|
||||
- match: { job_id: "jobs-crud-categorization-analyzer-job" }
|
||||
- match: { analysis_config.categorization_analyzer.char_filter.0: "html_strip" }
|
||||
- match: { analysis_config.categorization_analyzer.tokenizer: "classic" }
|
||||
- match: { analysis_config.categorization_analyzer.filter.0: "stop" }
|
||||
|
||||
---
|
||||
"Test job with categorization_analyzer and categorization_filters":
|
||||
- do:
|
||||
catch: /categorization_filters cannot be used with categorization_analyzer - instead specify them as pattern_replace char_filters in the analyzer/
|
||||
xpack.ml.put_job:
|
||||
job_id: jobs-crud-categorization-analyzer-job
|
||||
body: >
|
||||
{
|
||||
"analysis_config" : {
|
||||
"detectors" :[{"function":"mean","field_name":"responsetime","by_field_name":"airline"},
|
||||
{"function":"count","by_field_name":"mlcategory"}],
|
||||
"categorization_field_name": "some_category",
|
||||
"categorization_analyzer" : {
|
||||
"char_filter" : ["html_strip"],
|
||||
"tokenizer" : "classic",
|
||||
"filter" : ["stop"]
|
||||
},
|
||||
"categorization_filters" : ["cat1.*", "cat2.*"]
|
||||
},
|
||||
"data_description" : {
|
||||
}
|
||||
}
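Note: the last test above expects job creation to fail when categorization_filters and categorization_analyzer are both set. The following is a hedged sketch of such a check; the class, method, and parameter names are assumptions, only the error text is taken from the catch expression above.

import java.util.Arrays;
import java.util.List;

public class CategorizationConfigCheck {
    // Reject configurations that set both fields, mirroring the expected error message
    static void validate(List<String> categorizationFilters, Object categorizationAnalyzer) {
        if (categorizationFilters != null && !categorizationFilters.isEmpty() && categorizationAnalyzer != null) {
            throw new IllegalArgumentException(
                    "categorization_filters cannot be used with categorization_analyzer - "
                            + "instead specify them as pattern_replace char_filters in the analyzer");
        }
    }

    public static void main(String[] args) {
        validate(Arrays.asList("cat1.*", "cat2.*"), null);          // allowed: filters without an analyzer
        validate(Arrays.asList("cat1.*", "cat2.*"), new Object());  // throws, like the catch above expects
    }
}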

@ -0,0 +1,83 @@
---
"Test analyze API with an analyzer that does what we used to do in native code":
  - do:
      indices.analyze:
        body: >
          {
            "tokenizer" : "ml_classic",
            "filter" : [
              { "type" : "stop", "stopwords": [
                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
                "GMT", "UTC"
              ] }
            ],
            "text" : "[elasticsearch] [2017-12-13T10:46:30,816][INFO ][o.e.c.m.MetaDataCreateIndexService] [node-0] [.watcher-history-7-2017.12.13] creating index, cause [auto(bulk api)], templates [.watch-history-7], shards [1]/[1], mappings [doc]"
          }
  - match: { tokens.0.token: "elasticsearch" }
  - match: { tokens.0.start_offset: 1 }
  - match: { tokens.0.end_offset: 14 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "INFO" }
  - match: { tokens.1.start_offset: 42 }
  - match: { tokens.1.end_offset: 46 }
  - match: { tokens.1.position: 5 }
  - match: { tokens.2.token: "o.e.c.m.MetaDataCreateIndexService" }
  - match: { tokens.2.start_offset: 49 }
  - match: { tokens.2.end_offset: 83 }
  - match: { tokens.2.position: 6 }
  - match: { tokens.3.token: "node-0" }
  - match: { tokens.3.start_offset: 86 }
  - match: { tokens.3.end_offset: 92 }
  - match: { tokens.3.position: 7 }
  - match: { tokens.4.token: "watcher-history-7-2017.12.13" }
  - match: { tokens.4.start_offset: 96 }
  - match: { tokens.4.end_offset: 124 }
  - match: { tokens.4.position: 8 }
  - match: { tokens.5.token: "creating" }
  - match: { tokens.5.start_offset: 126 }
  - match: { tokens.5.end_offset: 134 }
  - match: { tokens.5.position: 9 }
  - match: { tokens.6.token: "index" }
  - match: { tokens.6.start_offset: 135 }
  - match: { tokens.6.end_offset: 140 }
  - match: { tokens.6.position: 10 }
  - match: { tokens.7.token: "cause" }
  - match: { tokens.7.start_offset: 142 }
  - match: { tokens.7.end_offset: 147 }
  - match: { tokens.7.position: 11 }
  - match: { tokens.8.token: "auto" }
  - match: { tokens.8.start_offset: 149 }
  - match: { tokens.8.end_offset: 153 }
  - match: { tokens.8.position: 12 }
  - match: { tokens.9.token: "bulk" }
  - match: { tokens.9.start_offset: 154 }
  - match: { tokens.9.end_offset: 158 }
  - match: { tokens.9.position: 13 }
  - match: { tokens.10.token: "api" }
  - match: { tokens.10.start_offset: 159 }
  - match: { tokens.10.end_offset: 162 }
  - match: { tokens.10.position: 14 }
  - match: { tokens.11.token: "templates" }
  - match: { tokens.11.start_offset: 166 }
  - match: { tokens.11.end_offset: 175 }
  - match: { tokens.11.position: 15 }
  - match: { tokens.12.token: "watch-history-7" }
  - match: { tokens.12.start_offset: 178 }
  - match: { tokens.12.end_offset: 193 }
  - match: { tokens.12.position: 16 }
  - match: { tokens.13.token: "shards" }
  - match: { tokens.13.start_offset: 196 }
  - match: { tokens.13.end_offset: 202 }
  - match: { tokens.13.position: 17 }
  - match: { tokens.14.token: "mappings" }
  - match: { tokens.14.start_offset: 212 }
  - match: { tokens.14.end_offset: 220 }
  - match: { tokens.14.position: 20 }
  - match: { tokens.15.token: "doc" }
  - match: { tokens.15.start_offset: 222 }
  - match: { tokens.15.end_offset: 225 }
  - match: { tokens.15.position: 21 }
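Note: the expected positions above are not contiguous (0 then 5, 17 then 20). The sketch below shows one plausible explanation, assuming that tokens dropped by the tokenizer or stop filter still advance the position counter; it is an illustration of that assumption, not the ml_classic implementation.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class PositionGapSketch {
    static List<String> keptWithPositions(List<String> allTokens, Set<String> dropped) {
        List<String> kept = new ArrayList<>();
        int position = -1;
        for (String token : allTokens) {
            position++;                       // every token, kept or dropped, takes a position
            if (!dropped.contains(token)) {
                kept.add(token + "@" + position);
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        List<String> tokens = Arrays.asList("shards", "1", "1", "mappings", "doc");
        Set<String> dropped = new HashSet<>(Arrays.asList("1"));
        System.out.println(keptWithPositions(tokens, dropped));  // [shards@0, mappings@3, doc@4]
    }
}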

@ -15,6 +15,9 @@ task copyMlRestTests(type: Copy) {

integTestRunner {
  systemProperty 'tests.rest.blacklist', [
    // Remove this test because it doesn't call an ML endpoint and we don't want
    // to grant extra permissions to the users used in this test suite
    'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
    // Remove tests that are expected to throw an exception, because we cannot then
    // know whether to expect an authorization exception or a validation exception
    'ml/calendar_crud/Test cannot create calendar with name _all',

@ -43,6 +46,7 @@ integTestRunner {
    'ml/jobs_crud/Test put job after closing state index',
    'ml/jobs_crud/Test put job with inconsistent body/param ids',
    'ml/jobs_crud/Test put job with time field in analysis_config',
    'ml/jobs_crud/Test job with categorization_analyzer and categorization_filters',
    'ml/jobs_get/Test get job given missing job_id',
    'ml/jobs_get_result_buckets/Test mutually-exclusive params',
    'ml/jobs_get_result_buckets/Test mutually-exclusive params via body',