add ability to specify multiple grok patterns (#18074)

- you can now specify a list of grok patterns to match your field against, and
the first pattern in the list that successfully matches wins (see the sketch below).
- only non-null captures will be inserted into your matched document.

Fixes #17903.
Tal Levy 2016-05-25 12:20:39 -07:00
parent 7e5c12606c
commit edfbdf2748
6 changed files with 206 additions and 86 deletions
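To make the first-match-wins behaviour concrete, here is a rough standalone sketch using plain java.util.regex with simplified group names (the processor itself uses the joni engine, the grok pattern bank, and group names of the form `_ingest._grok_match_index.<i>`): each pattern becomes one branch of an alternation wrapped in a named group, and only the branch that matched produces a non-null capture.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FirstMatchWinsSketch {
    public static void main(String[] args) {
        // Two hypothetical "patterns" ("1" and "2") combined into one alternation,
        // each wrapped in its own named group (names simplified for this sketch).
        Pattern combined = Pattern.compile("(?<p0>1)|(?<p1>2)");
        Matcher m = combined.matcher("2");
        if (m.matches()) {
            System.out.println("p0 = " + m.group("p0")); // null -> would be dropped from the document
            System.out.println("p1 = " + m.group("p1")); // "2"  -> would be inserted into the document
        }
    }
}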


@@ -1038,8 +1038,9 @@ Grok expression.
|======
| Name | Required | Default | Description
| `field` | yes | - | The field to use for grok expression parsing
| `pattern` | yes | - | The grok expression to match and extract named captures with
| `patterns` | yes | - | An ordered list of grok expressions to match and extract named captures with. The processor returns on the first expression in the list that matches.
| `pattern_definitions` | no | - | A map of pattern-name and pattern tuples defining custom patterns to be used by the current processor. Patterns matching existing names will override the pre-existing definition.
| `trace_match` | no | false | When true, `_ingest._grok_match_index` will be inserted into your matched document's metadata with the index of the pattern in `patterns` that matched.
|======
Here is an example of using the provided patterns to extract out and name structured fields from a string field in
@@ -1069,7 +1070,7 @@ Here is an example pipeline for processing the above document by using Grok:
{
"grok": {
"field": "message",
"pattern": "%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}"
"patterns": ["%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}"]
}
}
]
@@ -1107,7 +1108,7 @@ Here is an example of a pipeline specifying custom pattern definitions:
{
"grok": {
"field": "message",
"pattern": "my %{FAVORITE_DOG:dog} is colored %{RGB:color}"
"patterns": ["my %{FAVORITE_DOG:dog} is colored %{RGB:color}"]
"pattern_definitions" : {
"FAVORITE_DOG" : "beagle",
"RGB" : "RED|GREEN|BLUE"


@@ -25,31 +25,50 @@ import org.elasticsearch.ingest.core.ConfigurationUtils;
import org.elasticsearch.ingest.core.IngestDocument;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import static org.elasticsearch.ingest.core.ConfigurationUtils.newConfigurationException;
public final class GrokProcessor extends AbstractProcessor {
public static final String TYPE = "grok";
private static final String PATTERN_MATCH_KEY = "_ingest._grok_match_index";
private final String matchField;
private final Grok grok;
private final boolean traceMatch;
public GrokProcessor(String tag, Grok grok, String matchField) {
public GrokProcessor(String tag, Map<String, String> patternBank, List<String> matchPatterns, String matchField) {
this(tag, patternBank, matchPatterns, matchField, false);
}
public GrokProcessor(String tag, Map<String, String> patternBank, List<String> matchPatterns, String matchField, boolean traceMatch) {
super(tag);
this.matchField = matchField;
this.grok = grok;
this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch));
this.traceMatch = traceMatch;
}
@Override
public void execute(IngestDocument ingestDocument) throws Exception {
String fieldValue = ingestDocument.getFieldValue(matchField, String.class);
Map<String, Object> matches = grok.captures(fieldValue);
if (matches != null) {
matches.forEach((k, v) -> ingestDocument.setFieldValue(k, v));
} else {
throw new IllegalArgumentException("Grok expression does not match field value: [" + fieldValue + "]");
if (matches == null) {
throw new IllegalArgumentException("Provided Grok expressions do not match field value: [" + fieldValue + "]");
}
matches.entrySet().stream()
.filter((e) -> Objects.nonNull(e.getValue()))
.forEach((e) -> ingestDocument.setFieldValue(e.getKey(), e.getValue()));
if (traceMatch) {
@SuppressWarnings("unchecked")
HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
matchMap.keySet().stream().findFirst().ifPresent((index) -> {
ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
});
}
}
@@ -58,12 +77,41 @@ public final class GrokProcessor extends AbstractProcessor {
return TYPE;
}
public Grok getGrok() {
return grok;
}
String getMatchField() {
return matchField;
}
Grok getGrok() {
return grok;
static String combinePatterns(List<String> patterns, boolean traceMatch) {
String combinedPattern;
if (patterns.size() > 1) {
if (traceMatch) {
combinedPattern = "";
for (int i = 0; i < patterns.size(); i++) {
String valueWrap = "(?<" + PATTERN_MATCH_KEY + "." + i + ">" + patterns.get(i) + ")";
if (combinedPattern.equals("")) {
combinedPattern = valueWrap;
} else {
combinedPattern = combinedPattern + "|" + valueWrap;
}
}
} else {
combinedPattern = patterns.stream().reduce("", (prefix, value) -> {
if (prefix.equals("")) {
return "(?:" + value + ")";
} else {
return prefix + "|" + "(?:" + value + ")";
}
});
}
} else {
combinedPattern = patterns.get(0);
}
return combinedPattern;
}
public final static class Factory extends AbstractProcessorFactory<GrokProcessor> {
@@ -77,22 +125,25 @@ public final class GrokProcessor extends AbstractProcessor {
@Override
public GrokProcessor doCreate(String processorTag, Map<String, Object> config) throws Exception {
String matchField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field");
String matchPattern = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "pattern");
List<String> matchPatterns = ConfigurationUtils.readList(TYPE, processorTag, config, "patterns");
boolean traceMatch = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_match", false);
if (matchPatterns.isEmpty()) {
throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty");
}
Map<String, String> customPatternBank = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "pattern_definitions");
Map<String, String> patternBank = new HashMap<>(builtinPatterns);
if (customPatternBank != null) {
patternBank.putAll(customPatternBank);
}
Grok grok;
try {
grok = new Grok(patternBank, matchPattern);
return new GrokProcessor(processorTag, patternBank, matchPatterns, matchField, traceMatch);
} catch (Exception e) {
throw newConfigurationException(TYPE, processorTag, "pattern", "Invalid regex pattern. " + e.getMessage());
throw newConfigurationException(TYPE, processorTag, "patterns",
"Invalid regex pattern found in: " + matchPatterns + ". " + e.getMessage());
}
return new GrokProcessor(processorTag, grok, matchField);
}
}
}
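One non-obvious detail in execute() above, spelled out as a sketch (an inference from the code, not text from the commit): when trace_match is enabled, the matching branch's capture is named `_ingest._grok_match_index.<i>`, so writing it into the document creates a nested single-entry map under `_ingest._grok_match_index`; the traceMatch block then collapses that map back to just the branch index.

import java.util.HashMap;

public class TraceMatchCollapseSketch {
    public static void main(String[] args) {
        // Stand-in for the value found under "_ingest._grok_match_index" after the
        // named capture "_ingest._grok_match_index.1" has been set on the document.
        HashMap<String, String> matchMap = new HashMap<>();
        matchMap.put("1", "2"); // branch index "1" captured the text "2"

        // Same collapse as in execute(): keep only the branch index.
        matchMap.keySet().stream().findFirst()
                .ifPresent(index -> System.out.println("_ingest._grok_match_index = " + index));
    }
}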


@@ -37,7 +37,7 @@ public class GrokProcessorFactoryTests extends ESTestCase {
Map<String, Object> config = new HashMap<>();
config.put("field", "_field");
config.put("pattern", "(?<foo>\\w+)");
config.put("patterns", Collections.singletonList("(?<foo>\\w+)"));
String processorTag = randomAsciiOfLength(10);
config.put(AbstractProcessorFactory.TAG_KEY, processorTag);
GrokProcessor processor = factory.create(config);
@@ -49,27 +49,26 @@ public class GrokProcessorFactoryTests extends ESTestCase {
public void testBuildMissingField() throws Exception {
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
Map<String, Object> config = new HashMap<>();
config.put("pattern", "(?<foo>\\w+)");
try {
factory.create(config);
fail("should fail");
} catch (ElasticsearchParseException e) {
assertThat(e.getMessage(), equalTo("[field] required property is missing"));
}
config.put("patterns", Collections.singletonList("(?<foo>\\w+)"));
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
assertThat(e.getMessage(), equalTo("[field] required property is missing"));
}
public void testBuildMissingPattern() throws Exception {
public void testBuildMissingPatterns() throws Exception {
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
Map<String, Object> config = new HashMap<>();
config.put("field", "foo");
try {
factory.create(config);
fail("should fail");
} catch (ElasticsearchParseException e) {
assertThat(e.getMessage(), equalTo("[pattern] required property is missing"));
}
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
assertThat(e.getMessage(), equalTo("[patterns] required property is missing"));
}
public void testBuildEmptyPatternsList() throws Exception {
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
Map<String, Object> config = new HashMap<>();
config.put("field", "foo");
config.put("patterns", Collections.emptyList());
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
assertThat(e.getMessage(), equalTo("[patterns] List of patterns must not be empty"));
}
public void testCreateWithCustomPatterns() throws Exception {
@@ -77,7 +76,7 @@ public class GrokProcessorFactoryTests extends ESTestCase {
Map<String, Object> config = new HashMap<>();
config.put("field", "_field");
config.put("pattern", "%{MY_PATTERN:name}!");
config.put("patterns", Collections.singletonList("%{MY_PATTERN:name}!"));
config.put("pattern_definitions", Collections.singletonMap("MY_PATTERN", "foo"));
GrokProcessor processor = factory.create(config);
assertThat(processor.getMatchField(), equalTo("_field"));
@@ -89,28 +88,19 @@ public class GrokProcessorFactoryTests extends ESTestCase {
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
Map<String, Object> config = new HashMap<>();
config.put("field", "_field");
config.put("pattern", "[");
try {
factory.create(config);
fail("should fail");
} catch (ElasticsearchParseException e) {
assertThat(e.getMessage(), equalTo("[pattern] Invalid regex pattern. premature end of char-class"));
}
config.put("patterns", Collections.singletonList("["));
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
assertThat(e.getMessage(), equalTo("[patterns] Invalid regex pattern found in: [[]. premature end of char-class"));
}
public void testCreateWithInvalidPatternDefinition() throws Exception {
GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
Map<String, Object> config = new HashMap<>();
config.put("field", "_field");
config.put("pattern", "%{MY_PATTERN:name}!");
config.put("patterns", Collections.singletonList("%{MY_PATTERN:name}!"));
config.put("pattern_definitions", Collections.singletonMap("MY_PATTERN", "["));
try {
factory.create(config);
fail("should fail");
} catch (ElasticsearchParseException e) {
assertThat(e.getMessage(), equalTo("[pattern] Invalid regex pattern. premature end of char-class"));
}
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
assertThat(e.getMessage(),
equalTo("[patterns] Invalid regex pattern found in: [%{MY_PATTERN:name}!]. premature end of char-class"));
}
}


@@ -21,14 +21,15 @@ package org.elasticsearch.ingest.grok;
import org.elasticsearch.ingest.RandomDocumentPicks;
import org.elasticsearch.ingest.core.IngestDocument;
import org.elasticsearch.ingest.grok.Grok;
import org.elasticsearch.ingest.grok.GrokProcessor;
import org.elasticsearch.test.ESTestCase;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.nullValue;
public class GrokProcessorTests extends ESTestCase {
@@ -37,8 +38,8 @@ public class GrokProcessorTests extends ESTestCase {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
doc.setFieldValue(fieldName, "1");
Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
Collections.singletonList("%{ONE:one}"), fieldName);
processor.execute(doc);
assertThat(doc.getFieldValue("one", String.class), equalTo("1"));
}
@@ -47,14 +48,10 @@ public class GrokProcessorTests extends ESTestCase {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
doc.setFieldValue(fieldName, "23");
Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
try {
processor.execute(doc);
fail();
} catch (Exception e) {
assertThat(e.getMessage(), equalTo("Grok expression does not match field value: [23]"));
}
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
Collections.singletonList("%{ONE:one}"), fieldName);
Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
assertThat(e.getMessage(), equalTo("Provided Grok expressions do not match field value: [23]"));
}
public void testMatchWithoutCaptures() throws Exception {
@@ -62,8 +59,8 @@ public class GrokProcessorTests extends ESTestCase {
IngestDocument originalDoc = new IngestDocument(new HashMap<>(), new HashMap<>());
originalDoc.setFieldValue(fieldName, fieldName);
IngestDocument doc = new IngestDocument(originalDoc);
Grok grok = new Grok(Collections.emptyMap(), fieldName);
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.emptyMap(),
Collections.singletonList(fieldName), fieldName);
processor.execute(doc);
assertThat(doc, equalTo(originalDoc));
}
@@ -72,26 +69,67 @@ public class GrokProcessorTests extends ESTestCase {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
doc.setFieldValue(fieldName, 1);
Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
try {
processor.execute(doc);
fail();
} catch (Exception e) {
assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]"));
}
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
Collections.singletonList("%{ONE:one}"), fieldName);
Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]"));
}
public void testMissingField() {
String fieldName = "foo.bar";
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
try {
processor.execute(doc);
fail();
} catch (Exception e) {
assertThat(e.getMessage(), equalTo("field [foo] not present as part of path [foo.bar]"));
}
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
Collections.singletonList("%{ONE:one}"), fieldName);
Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
assertThat(e.getMessage(), equalTo("field [foo] not present as part of path [foo.bar]"));
}
public void testMultiplePatternsWithMatchReturn() throws Exception {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
doc.setFieldValue(fieldName, "2");
Map<String, String> patternBank = new HashMap<>();
patternBank.put("ONE", "1");
patternBank.put("TWO", "2");
patternBank.put("THREE", "3");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank,
Arrays.asList("%{ONE:one}", "%{TWO:two}", "%{THREE:three}"), fieldName);
processor.execute(doc);
assertThat(doc.hasField("one"), equalTo(false));
assertThat(doc.getFieldValue("two", String.class), equalTo("2"));
assertThat(doc.hasField("three"), equalTo(false));
}
public void testSetMetadata() throws Exception {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
doc.setFieldValue(fieldName, "abc23");
Map<String, String> patternBank = new HashMap<>();
patternBank.put("ONE", "1");
patternBank.put("TWO", "2");
patternBank.put("THREE", "3");
GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank,
Arrays.asList("%{ONE:one}", "%{TWO:two}", "%{THREE:three}"), fieldName, true);
processor.execute(doc);
assertThat(doc.hasField("one"), equalTo(false));
assertThat(doc.getFieldValue("two", String.class), equalTo("2"));
assertThat(doc.hasField("three"), equalTo(false));
assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("1"));
}
public void testCombinedPatterns() {
String combined;
combined = GrokProcessor.combinePatterns(Arrays.asList(""), false);
assertThat(combined, equalTo(""));
combined = GrokProcessor.combinePatterns(Arrays.asList(""), true);
assertThat(combined, equalTo(""));
combined = GrokProcessor.combinePatterns(Arrays.asList("foo"), false);
assertThat(combined, equalTo("foo"));
combined = GrokProcessor.combinePatterns(Arrays.asList("foo"), true);
assertThat(combined, equalTo("foo"));
combined = GrokProcessor.combinePatterns(Arrays.asList("foo", "bar"), false);
assertThat(combined, equalTo("(?:foo)|(?:bar)"));
combined = GrokProcessor.combinePatterns(Arrays.asList("foo", "bar"), true);
assertThat(combined, equalTo("(?<_ingest._grok_match_index.0>foo)|(?<_ingest._grok_match_index.1>bar)"));
}
}


@@ -10,7 +10,7 @@
{
"grok" : {
"field" : "field1",
"pattern" : "%{NUMBER:val:float} %{NUMBER:status:int} <%{WORD:msg}>"
"patterns" : ["%{NUMBER:val:float} %{NUMBER:status:int} <%{WORD:msg}>"]
}
}
]
@@ -46,7 +46,7 @@
{
"grok" : {
"field" : "field1",
"pattern" : "<%{MY_PATTERN:msg}>",
"patterns" : ["<%{MY_PATTERN:msg}>"],
"pattern_definitions" : {
"MY_PATTERN" : "foo"
}
@@ -83,7 +83,7 @@
{
"grok" : {
"field" : "field1",
"pattern" : "<%{NUMBER:msg}>",
"patterns" : ["<%{NUMBER:msg}>"],
"pattern_definitions" : {
"NUMBER" : "foo"
}
@@ -107,3 +107,43 @@
type: test
id: 1
- match: { _source.msg: "foo" }
---
"Test simulate with grok debugging enabled":
- do:
ingest.simulate:
body: >
{
"pipeline": {
"description": "_description",
"processors": [
{
"grok" : {
"field" : "field",
"patterns" : ["%{ONE:one}", "%{TWO:two}"],
"pattern_definitions" : {
"ONE" : "1",
"TWO" : "2"
},
"trace_match" : true
}
}
]
},
"docs": [
{
"_index": "index",
"_type": "type",
"_id": "id",
"_source": {
"field": "abc2xyz"
}
}
]
}
- length: { docs: 1 }
- match: { docs.0.doc._source.field: "abc2xyz" }
- match: { docs.0.doc._source.two: "2" }
- length: { docs.0.doc._ingest: 2 }
- match: { docs.0.doc._ingest._grok_match_index: "1" }
- is_true: docs.0.doc._ingest.timestamp


@@ -9,7 +9,7 @@
{
"grok" : {
"field" : "log",
"pattern": "%{COMBINEDAPACHELOG}"
"patterns": ["%{COMBINEDAPACHELOG}"]
}
},
{
@@ -55,7 +55,7 @@
index: test
type: test
id: 1
- length: { _source: 14 }
- length: { _source: 13 }
- match: { _source.request: "/presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png" }
- match: { _source.agent: "\"Mozilla/5.0 (Linux; Android 4.2.2; VS980 4G Build/JDQ39B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.135 Mobile Safari/537.36\"" }
- match: { _source.auth: "-" }