[ML] Reverse engineer Grok patterns from categorization results (#30125)

This change adds a grok_pattern field to the GET categories API
output in ML. It's calculated using the regex and examples in the
categorization result, and applying a list of candidate Grok
patterns to the bits in between the tokens that are considered to
define the category.

This can currently be considered a prototype, as the Grok patterns
it produces are not optimal. However, enough people have said it
would be useful for it to be worthwhile exposing it as experimental
functionality for interested parties to try out.
This commit is contained in:
David Roberts 2018-05-15 09:02:38 +01:00 committed by GitHub
parent 7dd816e77c
commit 50c34b2a9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 561 additions and 43 deletions

View File

@ -62,11 +62,11 @@ roles provide these privileges. For more information, see
==== Examples
The following example gets information about one category for the
`it_ops_new_logs` job:
`esxi_log` job:
[source,js]
--------------------------------------------------
GET _xpack/ml/anomaly_detectors/it_ops_new_logs/results/categories
GET _xpack/ml/anomaly_detectors/esxi_log/results/categories
{
"page":{
"size": 1
@ -83,14 +83,18 @@ In this example, the API returns the following information:
"count": 11,
"categories": [
{
"job_id": "it_ops_new_logs",
"category_id": 1,
"terms": "Actual Transaction Already Voided Reversed hostname dbserver.acme.com physicalhost esxserver1.acme.com vmhost app1.acme.com",
"regex": ".*?Actual.+?Transaction.+?Already.+?Voided.+?Reversed.+?hostname.+?dbserver.acme.com.+?physicalhost.+?esxserver1.acme.com.+?vmhost.+?app1.acme.com.*",
"max_matching_length": 137,
"examples": [
"Actual Transaction Already Voided / Reversed;hostname=dbserver.acme.com;physicalhost=esxserver1.acme.com;vmhost=app1.acme.com"
]
"job_id" : "esxi_log",
"category_id" : 1,
"terms" : "Vpxa verbose vpxavpxaInvtVm opID VpxaInvtVmChangeListener Guest DiskInfo Changed",
"regex" : ".*?Vpxa.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*",
"max_matching_length": 154,
"examples" : [
"Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:45 esxi2.acme.com Vpxa: [3CA66B90 verbose 'vpxavpxaInvtVm' opID=WFU-33927856] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:51 esxi1.acme.com Vpxa: [FFDBAB90 verbose 'vpxavpxaInvtVm' opID=WFU-25e0d447] [VpxaInvtVmChangeListener] Guest DiskInfo Changed",
"Oct 19 17:04:58 esxi2.acme.com Vpxa: [FFDDBB90 verbose 'vpxavpxaInvtVm' opID=WFU-bbff0134] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"
],
"grok_pattern" : ".*?%{SYSLOGTIMESTAMP:timestamp}.+?Vpxa.+?%{BASE16NUM:field}.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*"
}
]
}

View File

@ -405,6 +405,13 @@ A category resource has the following properties:
`examples`::
(array) A list of examples of actual values that matched the category.
`grok_pattern`::
experimental[] (string) A Grok pattern that could be used in Logstash or an
Ingest Pipeline to extract fields from messages that match the category. This
field is experimental and may be changed or removed in a future release. The
Grok patterns that are found are not optimal, but are often a good starting
point for manual tweaking.
`job_id`::
(string) The unique identifier for the job that these results belong to.

View File

@ -5,6 +5,7 @@
*/
package org.elasticsearch.xpack.core.ml.job.results;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
public static final ParseField REGEX = new ParseField("regex");
public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length");
public static final ParseField EXAMPLES = new ParseField("examples");
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
// Used for QueryPage
public static final ParseField RESULTS_FIELD = new ParseField("categories");
@ -51,6 +53,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
parser.declareString(CategoryDefinition::setRegex, REGEX);
parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH);
parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES);
parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN);
return parser;
}
@ -61,6 +64,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
private String regex = "";
private long maxMatchingLength = 0L;
private final Set<String> examples;
private String grokPattern;
public CategoryDefinition(String jobId) {
this.jobId = jobId;
@ -74,6 +78,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
regex = in.readString();
maxMatchingLength = in.readLong();
examples = new TreeSet<>(in.readList(StreamInput::readString));
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
grokPattern = in.readOptionalString();
}
}
@Override
@ -84,6 +91,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
out.writeString(regex);
out.writeLong(maxMatchingLength);
out.writeStringList(new ArrayList<>(examples));
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeOptionalString(grokPattern);
}
}
public String getJobId() {
@ -139,6 +149,14 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
examples.add(example);
}
/**
 * @return the reverse engineered Grok pattern for this category, or {@code null}
 *         if none has been calculated (e.g. when read from an older stream version
 *         or when augmentation was not requested)
 */
public String getGrokPattern() {
return grokPattern;
}
/**
 * @param grokPattern the Grok pattern to associate with this category; may be
 *        {@code null}, in which case it is omitted from the X-Content output
 */
public void setGrokPattern(String grokPattern) {
this.grokPattern = grokPattern;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -148,6 +166,9 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
builder.field(REGEX.getPreferredName(), regex);
builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength);
builder.field(EXAMPLES.getPreferredName(), examples);
if (grokPattern != null) {
builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
}
builder.endObject();
return builder;
}
@ -166,11 +187,12 @@ public class CategoryDefinition implements ToXContentObject, Writeable {
&& Objects.equals(this.terms, that.terms)
&& Objects.equals(this.regex, that.regex)
&& Objects.equals(this.maxMatchingLength, that.maxMatchingLength)
&& Objects.equals(this.examples, that.examples);
&& Objects.equals(this.examples, that.examples)
&& Objects.equals(this.grokPattern, that.grokPattern);
}
@Override
public int hashCode() {
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples);
return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern);
}
}

View File

@ -46,6 +46,7 @@ dependencies {
testCompile project(path: xpackModule('security'), configuration: 'testArtifacts')
// ml deps
compile project(':libs:grok')
compile 'net.sf.supercsv:super-csv:2.4.0'
nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip"
testCompile 'org.ini4j:ini4j:0.5.2'

View File

@ -41,7 +41,7 @@ public class TransportGetCategoriesAction extends HandledTransportAction<GetCate
Integer from = request.getPageParams() != null ? request.getPageParams().getFrom() : null;
Integer size = request.getPageParams() != null ? request.getPageParams().getSize() : null;
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), from, size,
jobProvider.categoryDefinitions(request.getJobId(), request.getCategoryId(), true, from, size,
r -> listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client);
}
}

View File

@ -0,0 +1,243 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.grok.Grok;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Creates Grok patterns that will match all the examples in a given category_definition.
*
* The choice of field names is quite primitive. The intention is that a human will edit these.
*/
/**
 * Creates Grok patterns that will match all the examples in a given category_definition.
 *
 * The choice of field names is quite primitive. The intention is that a human will edit these.
 */
public final class GrokPatternCreator {

    // Names of the capture groups used to pick apart the text on either side of a
    // candidate match.  Declared final: these are constants and must never be reassigned.
    private static final String PREFACE = "preface";
    private static final String EPILOGUE = "epilogue";

    /**
     * The first match in this list will be chosen, so it needs to be ordered
     * such that more generic patterns come after more specific patterns.
     */
    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
        new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_OTHER", "timestamp"),
        new GrokPatternCandidate("DATESTAMP_EVENTLOG", "timestamp"),
        new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
        new GrokPatternCandidate("HTTPDATE", "timestamp"),
        new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
        new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
        new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
        new GrokPatternCandidate("DATE", "date"),
        new GrokPatternCandidate("TIME", "time"),
        new GrokPatternCandidate("LOGLEVEL", "loglevel"),
        new GrokPatternCandidate("URI", "uri"),
        new GrokPatternCandidate("UUID", "uuid"),
        new GrokPatternCandidate("MAC", "macaddress"),
        // Can't use \b as the breaks, because slashes are not "word" characters
        new GrokPatternCandidate("PATH", "path", "(?<!\\w)", "(?!\\w)"),
        new GrokPatternCandidate("EMAILADDRESS", "email"),
        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
        new GrokPatternCandidate("IP", "ipaddress"),
        // This already includes pre/post break conditions
        new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
        // Can't use \b as the break before, because it doesn't work for negative numbers (the
        // minus sign is not a "word" character)
        new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
        // Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
        // numbers that NUMBER rejected due to preceding characters
        new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
        // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
        // Fixing these problems with overly broad matches would require some extra intelligence
        // to be added to remove inappropriate matches. One idea would be to use a dictionary,
        // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
        // word (plus there's the international headache with relying on dictionaries). Similarly,
        // hostnames could also be dictionary words - I've worked on machines called "hippo" and
        // "scarf" in the past. Another idea would be to look at the adjacent characters and
        // apply some heuristic based on those.
    );

    private GrokPatternCreator() {
        // Utility class - not intended to be instantiated
    }

    /**
     * Given a category definition regex and a collection of examples from the category, return
     * a grok pattern that will match the category and pull out any likely fields. The extracted
     * fields are given pretty generic names, but unique within the grok pattern provided. The
     * expectation is that a user will adjust the extracted field names based on their domain
     * knowledge.
     *
     * @param jobId    Job ID, used only for log messages.
     * @param regex    The category definition regex, e.g. {@code ".*?cat.+?sat.+?mat.*"}.
     * @param examples Example messages that all match {@code regex}.
     * @return A Grok pattern that matches every supplied example.
     */
    public static String findBestGrokMatchFromExamples(String jobId, String regex, Collection<String> examples) {

        // The first string in this array will end up being the empty string, and it doesn't correspond
        // to an "in between" bit. Although it could be removed for "neatness", it actually makes the
        // loops below slightly neater if it's left in.
        //
        // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
        String[] fixedRegexBits = regex.split("\\.[*+]\\??");

        // Create a pattern that will capture the bits in between the fixed parts of the regex
        //
        // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
        // DOTALL is needed because category examples may contain embedded newlines.
        Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL);

        List<Collection<String>> groupsMatchesFromExamples = new ArrayList<>(fixedRegexBits.length);
        for (int i = 0; i < fixedRegexBits.length; ++i) {
            groupsMatchesFromExamples.add(new ArrayList<>(examples.size()));
        }
        for (String example : examples) {
            Matcher matcher = exampleProcessor.matcher(example);
            if (matcher.matches()) {
                assert matcher.groupCount() == fixedRegexBits.length;
                // E.g., if the input regex was ".*?cat.+?sat.+?mat.*" then the example
                // "the cat sat on the mat" will result in "the ", " ", " on the ", and ""
                // being added to the 4 "in between" collections in that order
                for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) {
                    groupsMatchesFromExamples.get(groupNum - 1).add(matcher.group(groupNum));
                }
            } else {
                // We should never get here. If we do it implies a bug in the original categorization,
                // as it's produced a regex that doesn't match the examples.
                assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
                Loggers.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId,
                    exampleProcessor.pattern(), example);
            }
        }

        Map<String, Integer> fieldNameCountStore = new HashMap<>();
        StringBuilder overallGrokPatternBuilder = new StringBuilder();
        // Finally, for each collection of "in between" bits we look for the best Grok pattern and incorporate
        // it into the overall Grok pattern that will match the each example in its entirety
        for (int inBetweenBitNum = 0; inBetweenBitNum < groupsMatchesFromExamples.size(); ++inBetweenBitNum) {
            // Remember (from the first comment in this method) that the first element in this array is
            // always the empty string
            overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
                inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum));
        }
        return overallGrokPatternBuilder.toString();
    }

    /**
     * Given a collection of strings, work out which (if any) of the grok patterns we're allowed
     * to use matches it best. Then append the appropriate grok language to represent that finding
     * onto the supplied string builder.
     *
     * Recurses on the prefaces and epilogues of a successful match, so a single "in between"
     * section can yield multiple extracted fields.
     */
    static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
                                              boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {

        // First candidate that matches ALL the strings wins - hence the careful ordering of
        // ORDERED_CANDIDATE_GROK_PATTERNS from specific to generic.
        GrokPatternCandidate bestCandidate = null;
        if (mustMatchStrings.isEmpty() == false) {
            for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) {
                if (mustMatchStrings.stream().allMatch(candidate.grok::match)) {
                    bestCandidate = candidate;
                    break;
                }
            }
        }

        if (bestCandidate == null) {
            // No candidate matched - fall back to the appropriate non-capturing wildcard.
            // ".*?" (rather than ".+?") is needed when any string is empty, otherwise the
            // overall pattern would fail to match that example.
            if (isLast) {
                overallGrokPatternBuilder.append(".*");
            } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
                overallGrokPatternBuilder.append(".*?");
            } else {
                overallGrokPatternBuilder.append(".+?");
            }
        } else {
            // Recurse on the text before and after the match, sandwiching the Grok capture
            // expression (e.g. %{TIMESTAMP_ISO8601:timestamp}) between the two.
            Collection<String> prefaces = new ArrayList<>();
            Collection<String> epilogues = new ArrayList<>();
            populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
            overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
                .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
        }
    }

    /**
     * Given a collection of strings, and a grok pattern that matches some part of them all,
     * return collections of the bits that come before (prefaces) and after (epilogues) the
     * bit that matches.
     */
    static void populatePrefacesAndEpilogues(Collection<String> matchingStrings, Grok grok, Collection<String> prefaces,
                                             Collection<String> epilogues) {
        for (String s : matchingStrings) {
            Map<String, Object> captures = grok.captures(s);
            // If the pattern doesn't match then captures will be null. But we expect this
            // method to only be called after validating that the pattern does match.
            assert captures != null;
            prefaces.add(captures.getOrDefault(PREFACE, "").toString());
            epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
        }
    }

    /**
     * The first time a particular field name is passed, simply return it.
     * The second time return it with "2" appended.
     * The third time return it with "3" appended.
     * Etc.
     */
    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
        Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
        if (numberSeen > 1) {
            return fieldName + numberSeen;
        } else {
            return fieldName;
        }
    }

    /**
     * A named Grok pattern plus the field name to use for values it extracts, and the
     * break conditions required on either side of a match for it to be accepted.
     */
    static class GrokPatternCandidate {

        final String grokPatternName;
        final String fieldName;
        final Grok grok;

        /**
         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
         * end with a non "word" character (i.e. letter, number or underscore). For such patterns use one
         * of the other constructors.
         *
         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
         * come before and after the match, use one of the other constructors and specify an empty string
         * for the pre and/or post breaks.
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param fieldName       Name of the field to extract from the match.
         */
        GrokPatternCandidate(String grokPatternName, String fieldName) {
            this(grokPatternName, fieldName, "\\b", "\\b");
        }

        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak) {
            this(grokPatternName, fieldName, preBreak, "\\b");
        }

        /**
         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
         * @param fieldName       Name of the field to extract from the match.
         * @param preBreak        Only consider the match if it's broken from the previous text by this.
         * @param postBreak       Only consider the match if it's broken from the following text by this.
         */
        GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak, String postBreak) {
            this.grokPatternName = grokPatternName;
            this.fieldName = fieldName;
            // The DATA/GREEDYDATA captures pick up the preface and epilogue around the match
            this.grok = new Grok(Grok.getBuiltinPatterns(), "%{DATA:" + PREFACE + "}" + preBreak + "%{" + grokPatternName + ":this}" +
                postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
        }
    }
}

View File

@ -98,6 +98,7 @@ import org.elasticsearch.xpack.core.ml.job.results.Result;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.ml.utils.MlIndicesUtils;
import org.elasticsearch.xpack.core.security.support.Exceptions;
import org.elasticsearch.xpack.ml.job.categorization.GrokPatternCreator;
import org.elasticsearch.xpack.ml.job.persistence.InfluencersQueryBuilder.InfluencersQuery;
import org.elasticsearch.xpack.ml.job.process.autodetect.params.AutodetectParams;
@ -626,10 +627,11 @@ public class JobProvider {
* Get a page of {@linkplain CategoryDefinition}s for the given <code>jobId</code>.
* Uses a supplied client, so may run as the currently authenticated user
* @param jobId the job id
* @param augment Should the category definition be augmented with a Grok pattern?
* @param from Skip the first N categories. This parameter is for paging
* @param size Take only this number of categories
*/
public void categoryDefinitions(String jobId, Long categoryId, Integer from, Integer size,
public void categoryDefinitions(String jobId, Long categoryId, boolean augment, Integer from, Integer size,
Consumer<QueryPage<CategoryDefinition>> handler,
Consumer<Exception> errorHandler, Client client) {
if (categoryId != null && (from != null || size != null)) {
@ -663,6 +665,9 @@ public class JobProvider {
XContentParser parser = XContentFactory.xContent(XContentHelper.xContentType(source))
.createParser(NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, stream)) {
CategoryDefinition categoryDefinition = CategoryDefinition.LENIENT_PARSER.apply(parser, null);
if (augment) {
augmentWithGrokPattern(categoryDefinition);
}
results.add(categoryDefinition);
} catch (IOException e) {
throw new ElasticsearchParseException("failed to parse category definition", e);
@ -674,6 +679,17 @@ public class JobProvider {
}, e -> errorHandler.accept(mapAuthFailure(e, jobId, GetCategoriesAction.NAME))), client::search);
}
/**
 * Reverse engineer a Grok pattern for the given category definition and store it on
 * the definition itself. If there is not enough information to work with (no regex
 * or no examples) an empty Grok pattern is set instead.
 */
void augmentWithGrokPattern(CategoryDefinition categoryDefinition) {
    String regex = categoryDefinition.getRegex();
    List<String> examples = categoryDefinition.getExamples();
    if (regex.isEmpty() || examples.isEmpty()) {
        // Nothing to reverse engineer from - record an empty pattern rather than null
        categoryDefinition.setGrokPattern("");
        return;
    }
    categoryDefinition.setGrokPattern(
        GrokPatternCreator.findBestGrokMatchFromExamples(categoryDefinition.getJobId(), regex, examples));
}
/**
* Search for anomaly records with the parameters in the
* {@link RecordsQueryBuilder}

View File

@ -461,7 +461,7 @@ public class AutodetectResultProcessorIT extends MlSingleNodeTestCase {
AtomicReference<Exception> errorHolder = new AtomicReference<>();
AtomicReference<QueryPage<CategoryDefinition>> resultHolder = new AtomicReference<>();
CountDownLatch latch = new CountDownLatch(1);
jobProvider.categoryDefinitions(JOB_ID, categoryId, null, null, r -> {
jobProvider.categoryDefinitions(JOB_ID, categoryId, false, null, null, r -> {
resultHolder.set(r);
latch.countDown();
}, e -> {

View File

@ -0,0 +1,232 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;
import org.elasticsearch.grok.Grok;
import org.elasticsearch.test.ESTestCase;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.Matchers.containsInAnyOrder;
/**
 * Unit tests for {@link GrokPatternCreator}. Exercises the individual helper methods as
 * well as the end-to-end {@code findBestGrokMatchFromExamples} entry point using realistic
 * log message examples.
 */
public class GrokPatternCreatorTests extends ESTestCase {
// The count store is stateful across calls, so the order of these assertions matters:
// repeated names get an increasing numeric suffix starting from "2".
public void testBuildFieldName() {
Map<String, Integer> fieldNameCountStore = new HashMap<>();
assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp"));
assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
assertEquals("timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp"));
assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
}
// The preface/epilogue split should capture everything before/after the timestamp match.
public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {
Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
Grok grok = new GrokPatternCreator.GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp").grok;
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues);
assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
}
// A match at the very start/end of a string should yield an empty preface/epilogue.
public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {
Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
Grok grok = new GrokPatternCreator.GrokPatternCandidate("EMAILADDRESS", "email").grok;
Collection<String> prefaces = new ArrayList<>();
Collection<String> epilogues = new ArrayList<>();
GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues);
assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
}
// Recursion on prefaces/epilogues should extract both the timestamp and the log level.
public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
Collection<String> mustMatchStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
"[2018-01-24T12:33:23] ERROR ",
"junk [2018-01-22T07:33:23] INFO ",
"[2018-01-21T03:33:23] DEBUG ");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}
// Negative numbers after a bracket (a non-word character) should still be detected as NUMBER.
public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
Collection<String> mustMatchStrings = Arrays.asList("(-2)",
" (-3)",
" (4)",
" (-5) ");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {
Collection<String> mustMatchStrings = Arrays.asList("before-2 ",
"prior to-3",
"-4");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
// It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
assertEquals(".+?", overallGrokPatternBuilder.toString());
}
// Note ".*?" rather than ".+?" either side, because one example has the match at a string boundary.
public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
Collection<String> mustMatchStrings = Arrays.asList(" abc",
" 123",
" -123",
"1f is hex");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
Collection<String> mustMatchStrings = Arrays.asList("<host1.1.p2ps:",
"<host2.1.p2ps:");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
// We don't want the .1. in the middle to get detected as a hex number
assertEquals(".+?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
Collection<String> mustMatchStrings = Arrays.asList("before alice@acme.com after",
"abc bob@acme.com xyz",
"carol@acme.com");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString());
}
public void testAppendBestGrokMatchForStringsGivenUris() {
Collection<String> mustMatchStrings = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
"https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
"download today from https://www.elastic.co/downloads");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString());
}
// PATH uses custom (?<!\w)/(?!\w) breaks so both Unix and Windows paths should match.
public void testAppendBestGrokMatchForStringsGivenPaths() {
Collection<String> mustMatchStrings = Arrays.asList("on Mac /Users/dave",
"on Windows C:\\Users\\dave",
"on Linux /home/dave");
Map<String, Integer> fieldNameCountStore = new HashMap<>();
StringBuilder overallGrokPatternBuilder = new StringBuilder();
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString());
}
// End-to-end: typical syslog/named messages should yield timestamp, numbers, quoted string and IP.
public void testFindBestGrokMatchFromExamplesGivenNamedLogs() {
String regex = ".*?linux.+?named.+?error.+?unexpected.+?RCODE.+?REFUSED.+?resolving.*";
Collection<String> examples = Arrays.asList(
"Sep  8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
"Sep  8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
"Sep  8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
"Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
assertEquals(".*?%{SYSLOGTIMESTAMP:timestamp}.+?linux.+?named.+?%{NUMBER:field}.+?error.+?" +
"unexpected.+?RCODE.+?REFUSED.+?resolving.+?%{QUOTEDSTRING:field2}.+?%{IP:ipaddress}.+?%{NUMBER:field3}.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
public void testFindBestGrokMatchFromExamplesGivenCatalinaLogs() {
String regex = ".*?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?WARNING.+?Parameters.+?" +
"Invalid.+?chunk.+?ignored.*";
// The embedded newline ensures the regular expressions we're using are compiled with Pattern.DOTALL
Collection<String> examples = Arrays.asList(
"Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.",
"Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
"Invalid chunk ignored.");
assertEquals(".*?%{CATALINA_DATESTAMP:timestamp}.+?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?" +
"WARNING.+?Parameters.+?Invalid.+?chunk.+?ignored.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
// Repeated field names must be made unique (timestamp/timestamp2, field/field2, ...).
public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() {
String regex = ".*?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*";
// Two timestamps: one local, one UTC
Collection<String> examples = Arrays.asList(
"559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp",
"559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
"Info\tsshd\tsubsystem request for sftp");
assertEquals(".*?%{NUMBER:field}.+?%{TIMESTAMP_ISO8601:timestamp}.+?%{TIMESTAMP_ISO8601:timestamp2}.+?%{NUMBER:field2}.+?" +
"%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*",
GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
}
}

View File

@ -61,7 +61,6 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
@ -235,8 +234,7 @@ public class JobProviderTests extends ESTestCase {
});
}
public void testBuckets_OneBucketNoInterim()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_OneBucketNoInterim() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -268,8 +266,7 @@ public class JobProviderTests extends ESTestCase {
".*"));
}
public void testBuckets_OneBucketInterim()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_OneBucketInterim() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -302,8 +299,7 @@ public class JobProviderTests extends ESTestCase {
assertFalse(queryString.matches("(?s).*is_interim.*"));
}
public void testBuckets_UsingBuilder()
throws InterruptedException, ExecutionException, IOException {
public void testBuckets_UsingBuilder() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -339,8 +335,7 @@ public class JobProviderTests extends ESTestCase {
assertFalse(queryString.matches("(?s).*is_interim.*"));
}
public void testBucket_NoBucketNoExpand()
throws InterruptedException, ExecutionException, IOException {
public void testBucket_NoBucketNoExpand() throws IOException {
String jobId = "TestJobIdentification";
Long timestamp = 98765432123456789L;
List<Map<String, Object>> source = new ArrayList<>();
@ -357,8 +352,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(ResourceNotFoundException.class, holder[0].getClass());
}
public void testBucket_OneBucketNoExpand()
throws InterruptedException, ExecutionException, IOException {
public void testBucket_OneBucketNoExpand() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -384,7 +378,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(now, b.getTimestamp());
}
public void testRecords() throws InterruptedException, ExecutionException, IOException {
public void testRecords() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -431,8 +425,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testRecords_UsingBuilder()
throws InterruptedException, ExecutionException, IOException {
public void testRecords_UsingBuilder() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -485,7 +478,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testBucketRecords() throws InterruptedException, ExecutionException, IOException {
public void testBucketRecords() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
Bucket bucket = mock(Bucket.class);
@ -532,7 +525,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals("irrascible", records.get(1).getFunction());
}
public void testexpandBucket() throws InterruptedException, ExecutionException, IOException {
public void testexpandBucket() throws IOException {
String jobId = "TestJobIdentification";
Date now = new Date();
Bucket bucket = new Bucket("foo", now, 22);
@ -559,8 +552,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(400L, records);
}
public void testCategoryDefinitions()
throws InterruptedException, ExecutionException, IOException {
public void testCategoryDefinitions() throws IOException {
String jobId = "TestJobIdentification";
String terms = "the terms and conditions are not valid here";
List<Map<String, Object>> source = new ArrayList<>();
@ -580,15 +572,14 @@ public class JobProviderTests extends ESTestCase {
JobProvider provider = createProvider(client);
@SuppressWarnings({"unchecked", "rawtypes"})
QueryPage<CategoryDefinition>[] holder = new QueryPage[1];
provider.categoryDefinitions(jobId, null, from, size, r -> holder[0] = r,
provider.categoryDefinitions(jobId, null, false, from, size, r -> holder[0] = r,
e -> {throw new RuntimeException(e);}, client);
QueryPage<CategoryDefinition> categoryDefinitions = holder[0];
assertEquals(1L, categoryDefinitions.count());
assertEquals(terms, categoryDefinitions.results().get(0).getTerms());
}
public void testCategoryDefinition()
throws InterruptedException, ExecutionException, IOException {
public void testCategoryDefinition() throws IOException {
String jobId = "TestJobIdentification";
String terms = "the terms and conditions are not valid here";
@ -603,14 +594,14 @@ public class JobProviderTests extends ESTestCase {
JobProvider provider = createProvider(client);
@SuppressWarnings({"unchecked", "rawtypes"})
QueryPage<CategoryDefinition>[] holder = new QueryPage[1];
provider.categoryDefinitions(jobId, categoryId, null, null,
provider.categoryDefinitions(jobId, categoryId, false, null, null,
r -> holder[0] = r, e -> {throw new RuntimeException(e);}, client);
QueryPage<CategoryDefinition> categoryDefinitions = holder[0];
assertEquals(1L, categoryDefinitions.count());
assertEquals(terms, categoryDefinitions.results().get(0).getTerms());
}
public void testInfluencers_NoInterim() throws InterruptedException, ExecutionException, IOException {
public void testInfluencers_NoInterim() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -670,7 +661,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001);
}
public void testInfluencers_WithInterim() throws InterruptedException, ExecutionException, IOException {
public void testInfluencers_WithInterim() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -730,7 +721,7 @@ public class JobProviderTests extends ESTestCase {
assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001);
}
public void testModelSnapshots() throws InterruptedException, ExecutionException, IOException {
public void testModelSnapshots() throws IOException {
String jobId = "TestJobIdentificationForInfluencers";
Date now = new Date();
List<Map<String, Object>> source = new ArrayList<>();
@ -851,8 +842,7 @@ public class JobProviderTests extends ESTestCase {
return getResponse;
}
private static SearchResponse createSearchResponse(List<Map<String, Object>> source)
throws IOException {
private static SearchResponse createSearchResponse(List<Map<String, Object>> source) throws IOException {
SearchResponse response = mock(SearchResponse.class);
List<SearchHit> list = new ArrayList<>();

View File

@ -25,6 +25,9 @@ public class CategoryDefinitionTests extends AbstractSerializingTestCase<Categor
categoryDefinition.setRegex(randomAlphaOfLength(10));
categoryDefinition.setMaxMatchingLength(randomLong());
categoryDefinition.setExamples(Arrays.asList(generateRandomStringArray(10, 10, false)));
if (randomBoolean()) {
categoryDefinition.setGrokPattern(randomAlphaOfLength(50));
}
return categoryDefinition;
}