[ML] Reverse engineer Grok patterns from categorization results (#30125)

This change adds a grok_pattern field to the GET categories API
output in ML. It's calculated using the regex and examples in the
categorization result, and applying a list of candidate Grok
patterns to the bits in between the tokens that are considered to
define the category.

This can currently be considered a prototype, as the Grok patterns
it produces are not optimal. However, enough people have said it
would be useful for it to be worthwhile exposing it as experimental
functionality for interested parties to try out.
This commit is contained in:
David Roberts 2018-05-15 09:02:38 +01:00 committed by GitHub
parent 7dd816e77c
commit 50c34b2a9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 561 additions and 43 deletions

View File

@ -62,11 +62,11 @@ roles provide these privileges. For more information, see
==== Examples
The following example gets information about one category for the
`it_ops_new_logs` job:
`esxi_log` job:
[source,js]
--------------------------------------------------
GET _xpack/ml/anomaly_detectors/it_ops_new_logs/results/categories
GET _xpack/ml/anomaly_detectors/esxi_log/results/categories
{
"page":{
"size": 1
@ -83,14 +83,18 @@ In this example, the API returns the following information:
"count": 11,
"categories": [
{
"job_id": "it_ops_new_logs",
"category_id": 1,
"terms": "Actual Transaction Already Voided Reversed hostname dbserver.acme.com physicalhost esxserver1.acme.com vmhost app1.acme.com",
"regex": ".*?Actual.+?Transaction.+?Already.+?Voided.+?Reversed.+?hostname.+?dbserver.acme.com.+?physicalhost.+?esxserver1.acme.com.+?vmhost.+?app1.acme.com.*",
"max_matching_length": 137,
"examples": [
"Actual Transaction Already Voided / Reversed;hostname=dbserver.acme.com;physicalhost=esxserver1.acme.com;vmhost=app1.acme.com"
]
"job_id" : "esxi_log",
"category_id" : 1,
"terms" : "Vpxa verbose vpxavpxaInvtVm opID VpxaInvtVmChangeListener Guest DiskInfo Changed",
"regex" : ".*?Vpxa.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*",
"max_matching_length": 154,
"examples" : [
"Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:45 esxi2.acme.com Vpxa: [3CA66B90 verbose 'vpxavpxaInvtVm' opID=WFU-33927856] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:51 esxi1.acme.com Vpxa: [FFDBAB90 verbose 'vpxavpxaInvtVm' opID=WFU-25e0d447] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:58 esxi2.acme.com Vpxa: [FFDDBB90 verbose 'vpxavpxaInvtVm' opID=WFU-bbff0134] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"
],
"grok_pattern" : ".*?%{SYSLOGTIMESTAMP:timestamp}.+?Vpxa.+?%{BASE16NUM:field}.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*"
}
]
}

View File

@ -405,6 +405,13 @@ A category resource has the following properties:
`examples`::
(array) A list of examples of actual values that matched the category.
`grok_pattern`::
experimental[] (string) A Grok pattern that could be used in Logstash or an
Ingest Pipeline to extract fields from messages that match the category. This
field is experimental and may be changed or removed in a future release. The
Grok patterns that are found are not optimal, but are often a good starting
point for manual tweaking.
`job_id`::
(string) The unique identifier for the job that these results belong to.

View File

@ -5,6 +5,7 @@
*/
package org.elasticsearch.xpack.core.ml.job.results;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
public static final ParseField REGEX = new ParseField("regex");
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
public static final ParseField EXAMPLES = new ParseField("examples");
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
// Used for QueryPage
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@ -51,6 +53,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
parser.declareString(CategoryDefinition::setRegex, REGEX);
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
return parser;
}
@ -61,6 +64,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
private String regex = "";
private long maxMatchingLength = 0L;
private final Set<String> examples;
private String grokPattern;
public CategoryDefinition(String jobId) {
this.jobId = jobId;
@ -74,6 +78,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
regex = in.readString();
maxMatchingLength = in.readLong();
examples = new TreeSet<>(in.readList(StreamInput::readString));
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
grokPattern = in.readOptionalString();
}
}
@Override
@ -84,6 +91,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
out.writeString(regex);
out.writeLong(maxMatchingLength);
out.writeStringList(new ArrayList<>(examples));
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeOptionalString(grokPattern);
}
}
public String getJobId() {
@ -139,6 +149,14 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
examples.add(example);
}
/**
 * @return the reverse engineered Grok pattern for this category, or {@code null}
 *         if none has been calculated (e.g. when read from an older stream version
 *         or when augmentation was not requested)
 */
public String getGrokPattern() {
return grokPattern;
}
/**
 * @param grokPattern the Grok pattern to associate with this category; may be
 *        {@code null}, in which case it is omitted from the X-Content output
 */
public void setGrokPattern(String grokPattern) {
this.grokPattern = grokPattern;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -148,6 +166,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
builder.field(REGEX.getPreferredName(), regex);
builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
builder.field(EXAMPLES.getPreferredName(), examples);
if (grokPattern != null) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
builder.endObject();
return builder;
}
@ -166,11 +187,12 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
&& Objects.equals(this.terms, that.terms)
&& Objects.equals(this.regex, that.regex)
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
&& Objects.equals(this.examples, that.examples);
&& Objects.equals(this.examples, that.examples)
&& Objects.equals(this.grokPattern, that.grokPattern);
}
@Override
public int hashCode() {
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples);
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
}
}

View File

@ -46,6 +46,7 @@ dependencies {
testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
// ml deps
compile project(':libs:grok')
compile 'net.sf.supercsv:super-csv:2.4.0'
nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip"
testCompile 'org.ini4j:ini4j:0.5.2'

View File

@ -41,7 +41,7 @@ public class TransportGetCategoriesAction extends HandledTransportAction<GetCate
Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;
Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,
r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);
}
}

View File

@ -0,0 +1,243 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.grok.Grok;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Creates Grok patterns that will match all the examples in a given category_definition.
*
* The choice of field names is quite primitive. The intention is that a human will edit these.
*/
/**
 * Creates Grok patterns that will match all the examples in a given category_definition.
 *
 * The choice of field names is quite primitive. The intention is that a human will edit these.
 */
public final class GrokPatternCreator {

    // Names of the capture groups used to pick apart the text on either side of a
    // candidate match.  Declared final: these are constants and must never be reassigned.
    private static final String PREFACE = "preface";
    private static final String EPILOGUE = "epilogue";

    /**
     * The first match in this list will be chosen, so it needs to be ordered
     * such that more generic patterns come after more specific patterns.
     */
    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
        new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_OTHER", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_EVENTLOG", "timestamp"),
        new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
        new GrokPatternCandidate("HTTPDATE", "timestamp"),
        new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
        new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
        new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
        new GrokPatternCandidate("DATE", "date"),
        new GrokPatternCandidate("TIME", "time"),
        new GrokPatternCandidate("LOGLEVEL", "loglevel"),
        new GrokPatternCandidate("URI", "uri"),
        new GrokPatternCandidate("UUID", "uuid"),
        new GrokPatternCandidate("MAC", "macaddress"),
        // Can't use \b as the breaks, because slashes are not "word" characters
        new GrokPatternCandidate("PATH", "path", "(?<!\\w)", "(?!\\w)"),
        new GrokPatternCandidate("EMAILADDRESS", "email"),
        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
        new GrokPatternCandidate("IP", "ipaddress"),
        // This already includes pre/post break conditions
        new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
        // Can't use \b as the break before, because it doesn't work for negative numbers (the
        // minus sign is not a "word" character)
        new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
        // Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
        // numbers that NUMBER rejected due to preceding characters
        new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
        // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
        // Fixing these problems with overly broad matches would require some extra intelligence
        // to be added to remove inappropriate matches. One idea would be to use a dictionary,
        // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
        // word (plus there's the international headache with relying on dictionaries). Similarly,
        // hostnames could also be dictionary words - I've worked on machines called "hippo" and
        // "scarf" in the past. Another idea would be to look at the adjacent characters and
        // apply some heuristic based on those.
    );

    private GrokPatternCreator() {
        // Utility class - not intended to be instantiated
    }

    /**
     * Given a category definition regex and a collection of examples from the category, return
     * a grok pattern that will match the category and pull out any likely fields. The extracted
     * fields are given pretty generic names, but unique within the grok pattern provided. The
     * expectation is that a user will adjust the extracted field names based on their domain
     * knowledge.
     *
     * @param jobId    Job ID, used only for log messages.
     * @param regex    The category definition regex, e.g. {@code ".*?cat.+?sat.+?mat.*"}.
     * @param examples Example messages that all match {@code regex}.
     * @return A Grok pattern that matches every supplied example.
     */
    public static String findBestGrokMatchFromExamples(String jobId, String regex, Collection<String> examples) {

        // The first string in this array will end up being the empty string, and it doesn't correspond
        // to an "in between" bit. Although it could be removed for "neatness", it actually makes the
        // loops below slightly neater if it's left in.
        //
        // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
        String[] fixedRegexBits = regex.split("\\.[*+]\\??");

        // Create a pattern that will capture the bits in between the fixed parts of the regex
        //
        // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
        // DOTALL is needed because category examples may contain embedded newlines.
        Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL);

        List<Collection<String>> groupsMatchesFromExamples = new ArrayList<>(fixedRegexBits.length);
        for (int i = 0; i < fixedRegexBits.length; ++i) {
            groupsMatchesFromExamples.add(new ArrayList<>(examples.size()));
        }
        for (String example : examples) {
            Matcher matcher = exampleProcessor.matcher(example);
            if (matcher.matches()) {
                assert matcher.groupCount() == fixedRegexBits.length;
                // E.g., if the input regex was ".*?cat.+?sat.+?mat.*" then the example
                // "the cat sat on the mat" will result in "the ", " ", " on the ", and ""
                // being added to the 4 "in between" collections in that order
                for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) {
                    groupsMatchesFromExamples.get(groupNum - 1).add(matcher.group(groupNum));
                }
            } else {
                // We should never get here. If we do it implies a bug in the original categorization,
                // as it's produced a regex that doesn't match the examples.
                assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
                Loggers.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId,
                    exampleProcessor.pattern(), example);
            }
        }

        Map<String, Integer> fieldNameCountStore = new HashMap<>();
        StringBuilder overallGrokPatternBuilder = new StringBuilder();
        // Finally, for each collection of "in between" bits we look for the best Grok pattern and incorporate
        // it into the overall Grok pattern that will match the each example in its entirety
        for (int inBetweenBitNum = 0; inBetweenBitNum < groupsMatchesFromExamples.size(); ++inBetweenBitNum) {
            // Remember (from the first comment in this method) that the first element in this array is
            // always the empty string
            overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
                inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum));
        }
        return overallGrokPatternBuilder.toString();
    }

    /**
     * Given a collection of strings, work out which (if any) of the grok patterns we're allowed
     * to use matches it best. Then append the appropriate grok language to represent that finding
     * onto the supplied string builder.
     *
     * Recurses on the prefaces and epilogues of a successful match, so a single "in between"
     * section can yield multiple extracted fields.
     */
    static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
                                              boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {

        // First candidate that matches ALL the strings wins - hence the careful ordering of
        // ORDERED_CANDIDATE_GROK_PATTERNS from specific to generic.
        GrokPatternCandidate bestCandidate = null;
        if (mustMatchStrings.isEmpty() == false) {
            for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) {
                if (mustMatchStrings.stream().allMatch(candidate.grok::match)) {
                    bestCandidate = candidate;
                    break;
                }
            }
        }

        if (bestCandidate == null) {
            // No candidate matched - fall back to the appropriate non-capturing wildcard.
            // ".*?" (rather than ".+?") is needed when any string is empty, otherwise the
            // overall pattern would fail to match that example.
            if (isLast) {
                overallGrokPatternBuilder.append(".*");
            } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
                overallGrokPatternBuilder.append(".*?");
            } else {
                overallGrokPatternBuilder.append(".+?");
            }
        } else {
            // Recurse on the text before and after the match, sandwiching the Grok capture
            // expression (e.g. %{TIMESTAMP_ISO8601:timestamp}) between the two.
            Collection<String> prefaces = new ArrayList<>();
            Collection<String> epilogues = new ArrayList<>();
            populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
            overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
                .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
        }
    }

    /**
     * Given a collection of strings, and a grok pattern that matches some part of them all,
     * return collections of the bits that come before (prefaces) and after (epilogues) the
     * bit that matches.
     */
    static void populatePrefacesAndEpilogues(Collection<String> matchingStrings, Grok grok, Collection<String> prefaces,
                                             Collection<String> epilogues) {
        for (String s : matchingStrings) {
            Map<String, Object> captures = grok.captures(s);
            // If the pattern doesn't match then captures will be null. But we expect this
            // method to only be called after validating that the pattern does match.
            assert captures != null;
            prefaces.add(captures.getOrDefault(PREFACE, "").toString());
            epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
        }
    }

    /**
     * The first time a particular field name is passed, simply return it.
     * The second time return it with "2" appended.
     * The third time return it with "3" appended.
     * Etc.
     */
    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
        Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
        if (numberSeen > 1) {
            return fieldName + numberSeen;
        } else {
            return fieldName;
        }
    }

    /**
     * A named Grok pattern plus the field name to use for values it extracts, and the
     * break conditions required on either side of a match for it to be accepted.
     */
    static class GrokPatternCandidate {

        final String grokPatternName;
        final String fieldName;
        final Grok grok;

        /**
         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
         * end with a non "word" character (i.e. letter, number or underscore). For such patterns use one
         * of the other constructors.
         *
         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
         * come before and after the match, use one of the other constructors and specify an empty string
         * for the pre and/or post breaks.
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param fieldName       Name of the field to extract from the match.
         */
        GrokPatternCandidate(String grokPatternName, String fieldName) {
            this(grokPatternName, fieldName, "\\b", "\\b");
        }

        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak) {
            this(grokPatternName, fieldName, preBreak, "\\b");
        }

        /**
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param fieldName       Name of the field to extract from the match.
         * @param preBreak        Only consider the match if it's broken from the previous text by this.
         * @param postBreak       Only consider the match if it's broken from the following text by this.
         */
        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak, String postBreak) {
            this.grokPatternName = grokPatternName;
            this.fieldName = fieldName;
            // The DATA/GREEDYDATA captures pick up the preface and epilogue around the match
            this.grok = new Grok(Grok.getBuiltinPatterns(), "%{DATA:" + PREFACE + "}" + preBreak + "%{" + grokPatternName + ":this}" +
                postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
        }
    }
}

View File

@ -98,6 +98,7 @@ import org.elasticsearch.xpack.core.ml.job.results.Result;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.ml.utils.MlIndicesUtils;
import org.elasticsearch.xpack.core.security.support.Exceptions;
import org.elasticsearch.xpack.ml.job.categorization.GrokPatternCreator;
import org.elasticsearch.xpack.ml.job.persistence.InfluencersQueryBuilder.InfluencersQuery;
import org.elasticsearch.xpack.ml.job.process.autodetect.params.AutodetectParams;
@ -626,10 +627,11 @@ public class JobProvider {
* Get a page of {@linkplain CategoryDefinition}s for the given <code>jobId</code>.
* Uses a supplied client, so may run as the currently authenticated user
* @param jobId the job id
* @param augment Should the category definition be augmented with a Grok pattern?
* @param from Skip the first N categories. This parameter is for paging
* @param size Take only this number of categories
*/
public void categoryDefinitions(String jobId, Long categoryId, Integer from, Integer size,
public void categoryDefinitions(String jobId, Long categoryId, boolean augment, Integer from, Integer size,
Consumer<QueryPage<CategoryDefinition>> handler,
Consumer<Exception> errorHandler, Client client) {
if (categoryId != null && (from != null || size != null)) {
@ -663,6 +665,9 @@ public class JobProvider {
XContentParser parser = XContentFactory.xContent(XContentHelper.xContentType(source))
.createParser(NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, stream)) {
CategoryDefinition categoryDefinition = CategoryDefinition.LENIENT_PARSER.apply(parser, null);
if (augment) {
augmentWithGrokPattern(categoryDefinition);
}
results.add(categoryDefinition);
} catch (IOException e) {
throw new ElasticsearchParseException("failed to parse category definition", e);
@ -674,6 +679,17 @@ public class JobProvider {
}, e -> errorHandler.accept(mapAuthFailure(e, jobId, GetCategoriesAction.NAME))), client::search);
}
/**
 * Reverse engineer a Grok pattern for the given category definition and store it on
 * the definition itself. If there is not enough information to work with (no regex
 * or no examples) an empty Grok pattern is set instead.
 */
void augmentWithGrokPattern(CategoryDefinition categoryDefinition) {
    String regex = categoryDefinition.getRegex();
    List<String> examples = categoryDefinition.getExamples();
    if (regex.isEmpty() || examples.isEmpty()) {
        // Nothing to reverse engineer from - record an empty pattern rather than null
        categoryDefinition.setGrokPattern("");
        return;
    }
    categoryDefinition.setGrokPattern(
        GrokPatternCreator.findBestGrokMatchFromExamples(categoryDefinition.getJobId(), regex, examples));
}
/**
* Search for anomaly records with the parameters in the
* {@link RecordsQueryBuilder}

View File

@ -461,7 +461,7 @@ public class AutodetectResultProcessorIT extends MlSingleNodeTestCase {
AtomicReference<Exception> errorHolder = new AtomicReference<>();
AtomicReference<QueryPage<CategoryDefinition>> resultHolder = new AtomicReference<>();
CountDownLatch latch = new CountDownLatch(1);
jobProvider.categoryDefinitions(JOB_ID, categoryId, null, null, r -> {
jobProvider.categoryDefinitions(JOB_ID, categoryId, false, null, null, r -> {
resultHolder.set(r);
latch.countDown();
}, e -> {

View File

@ -0,0 +1,232 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.test.ESTestCase;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.Matchers.containsInAnyOrder;
/**
 * Unit tests for {@link GrokPatternCreator}. Exercises the individual helper methods as
 * well as the end-to-end {@code findBestGrokMatchFromExamples} entry point using realistic
 * log message examples.
 */
public class GrokPatternCreatorTests extends ESTestCase {
// The count store is stateful across calls, so the order of these assertions matters:
// repeated names get an increasing numeric suffix starting from "2".
public void testBuildFieldName() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp"));
assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
assertEquals("timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp"));
assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
}
// The preface/epilogue split should capture everything before/after the timestamp match.
public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {
Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
Grok grok = new GrokPatternCreator.GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp").grok;
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues);
assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
}
// A match at the very start/end of a string should yield an empty preface/epilogue.
public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {
Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
Grok grok = new GrokPatternCreator.GrokPatternCandidate("EMAILADDRESS", "email").grok;
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues);
assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
}
// Recursion on prefaces/epilogues should extract both the timestamp and the log level.
public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
Collection<String> mustMatchStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
// Negative numbers after a bracket (a non-word character) should still be detected as NUMBER.
public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
Collection<String> mustMatchStrings = Arrays.asList("(-2)",
" (-3)",
" (4)",
" (-5) ");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {
Collection<String> mustMatchStrings = Arrays.asList("before-2 ",
"prior to-3",
"-4");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
// It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
assertEquals(".+?", overallGrokPatternBuilder.toString());
}
// Note ".*?" rather than ".+?" either side, because one example has the match at a string boundary.
public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
Collection<String> mustMatchStrings = Arrays.asList(" abc",
" 123",
" -123",
"1f is hex");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
Collection<String> mustMatchStrings = Arrays.asList("<host1.1.p2ps:",
"<host2.1.p2ps:");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
// We don't want the .1. in the middle to get detected as a hex number
assertEquals(".+?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
Collection<String> mustMatchStrings = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenUris() {
Collection<String> mustMatchStrings = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
"https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
"download today from https://www.elastic.co/downloads");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString());
}
// PATH uses custom (?<!\w)/(?!\w) breaks so both Unix and Windows paths should match.
public void testAppendBestGrokMatchForStringsGivenPaths() {
Collection<String> mustMatchStrings = Arrays.asList("on Mac /Users/dave",
"on Windows C:\\Users\\dave",
"on Linux /home/dave");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString());
}
// End-to-end: typical syslog/named messages should yield timestamp, numbers, quoted string and IP.
public void testFindBestGrokMatchFromExamplesGivenNamedLogs() {
String regex = ".*?linux.+?named.+?error.+?unexpected.+?RCODE.+?REFUSED.+?resolving.*";
Collection<String> examples = Arrays.asList(
"Sep  8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
"Sep  8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
"Sep  8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
"Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
assertEquals(".*?%{SYSLOGTIMESTAMP:timestamp}.+?linux.+?named.+?%{NUMBER:field}.+?error.+?" +
"unexpected.+?RCODE.+?REFUSED.+?resolving.+?%{QUOTEDSTRING:field2}.+?%{IP:ipaddress}.+?%{NUMBER:field3}.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
public void testFindBestGrokMatchFromExamplesGivenCatalinaLogs() {
String regex = ".*?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?WARNING.+?Parameters.+?" +
"Invalid.+?chunk.+?ignored.*";
// The embedded newline ensures the regular expressions we're using are compiled with Pattern.DOTALL
Collection<String> examples = Arrays.asList(
"Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.");
assertEquals(".*?%{CATALINA_DATESTAMP:timestamp}.+?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?" +
"WARNING.+?Parameters.+?Invalid.+?chunk.+?ignored.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
// Repeated field names must be made unique (timestamp/timestamp2, field/field2, ...).
public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() {
String regex = ".*?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp");
assertEquals(".*?%{NUMBER:field}.+?%{TIMESTAMP_ISO8601:timestamp}.+?%{TIMESTAMP_ISO8601:timestamp2}.+?%{NUMBER:field2}.+?" +
"%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
}

View File

@ -61,7 +61,6 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
@ -235,8 +234,7 @@ public class JobProviderTests extends ESTestCase {
});
}
public void testBuckets_OneBucketNoInterim()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_OneBucketNoInterim() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -268,8 +266,7 @@ public class JobProviderTests extends ESTestCase {
".*"));
}
public void testBuckets_OneBucketInterim()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_OneBucketInterim() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -302,8 +299,7 @@ public class JobProviderTests extends ESTestCase {
assertFalse(queryString.matches("(?s).*is_interim.*"));
}
public void testBuckets_UsingBuilder()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_UsingBuilder() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -339,8 +335,7 @@ public class JobProviderTests extends ESTestCase {
assertFalse(queryString.matches("(?s).*is_interim.*"));
}
public void testBucket_NoBucketNoExpand()
throws InterruptedException, ExecutionException, IOException {
public void testBucket_NoBucketNoExpand() throws IOException {
String jobId = "TestJobIdentification";
Long timestamp = 98765432123456789L;
List<Map<String, Object>> source = new ArrayList<>();
@ -357,8 +352,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(ResourceNotFoundException.class, holder[0].getClass());
}
public void testBucket_OneBucketNoExpand()
throws InterruptedException, ExecutionException, IOException {
public void testBucket_OneBucketNoExpand() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -384,7 +378,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(now, b.getTimestamp());
}
public void testRecords() throws InterruptedException, ExecutionException, IOException {
public void testRecords() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -431,8 +425,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testRecords_UsingBuilder()
throws InterruptedException, ExecutionException, IOException {
public void testRecords_UsingBuilder() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -485,7 +478,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testBucketRecords() throws InterruptedException, ExecutionException, IOException {
public void testBucketRecords() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
Bucket bucket = mock(Bucket.class);
@ -532,7 +525,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testexpandBucket() throws InterruptedException, ExecutionException, IOException {
public void testexpandBucket() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
Bucket bucket = new Bucket("foo", now, 22);
@ -559,8 +552,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(400L, records);
}
public void testCategoryDefinitions()
throws InterruptedException, ExecutionException, IOException {
public void testCategoryDefinitions() throws IOException {
String jobId = "TestJobIdentification";
String terms = "the terms and conditions are not valid here";
List<Map<String, Object>> source = new ArrayList<>();
@ -580,15 +572,14 @@ public class JobProviderTests extends ESTestCase {
JobProvider provider = createProvider(client);
@SuppressWarnings({"unchecked", "rawtypes"})
QueryPage<CategoryDefinition>[] holder = new QueryPage[1];
provider.categoryDefinitions(jobId, null, from, size, r -> holder[0] = r,
provider.categoryDefinitions(jobId, null, false, from, size, r -> holder[0] = r,
e -> {throw new RuntimeException(e);}, client);
QueryPage<CategoryDefinition> categoryDefinitions = holder[0];
assertEquals(1L, categoryDefinitions.count());
assertEquals(terms, categoryDefinitions.results().get(0).getTerms());
}
public void testCategoryDefinition()
throws InterruptedException, ExecutionException, IOException {
public void testCategoryDefinition() throws IOException {
String jobId = "TestJobIdentification";
String terms = "the terms and conditions are not valid here";
@ -603,14 +594,14 @@ public class JobProviderTests extends ESTestCase {
JobProvider provider = createProvider(client);
@SuppressWarnings({"unchecked", "rawtypes"})
QueryPage<CategoryDefinition>[] holder = new QueryPage[1];
provider.categoryDefinitions(jobId, categoryId, null, null,
provider.categoryDefinitions(jobId, categoryId, false, null, null,
r -> holder[0] = r, e -> {throw new RuntimeException(e);}, client);
QueryPage<CategoryDefinition> categoryDefinitions = holder[0];
assertEquals(1L, categoryDefinitions.count());
assertEquals(terms, categoryDefinitions.results().get(0).getTerms());
}
public void testInfluencers_NoInterim() throws InterruptedException, ExecutionException, IOException {
public void testInfluencers_NoInterim() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -670,7 +661,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001);
}
public void testInfluencers_WithInterim() throws InterruptedException, ExecutionException, IOException {
public void testInfluencers_WithInterim() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -730,7 +721,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001);
}
public void testModelSnapshots() throws InterruptedException, ExecutionException, IOException {
public void testModelSnapshots() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -851,8 +842,7 @@ public class JobProviderTests extends ESTestCase {
return getResponse;
}
private static SearchResponse createSearchResponse(List<Map<String, Object>> source)
throws IOException {
private static SearchResponse createSearchResponse(List<Map<String, Object>> source) throws IOException {
SearchResponse response = mock(SearchResponse.class);
List<SearchHit> list = new ArrayList<>();

View File

@ -25,6 +25,9 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
categoryDefinition.setRegex(randomAlphaOfLength(10));
categoryDefinition.setMaxMatchingLength(randomLong());
categoryDefinition.setExamples(Arrays.asList(generateRandomStringArray(10, 10, false)));
if (randomBoolean()) {
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
}
return categoryDefinition;
}