From 04b712bdc5d603786dcd3c3d254468aff7f23bb9 Mon Sep 17 00:00:00 2001 From: Tal Levy Date: Wed, 16 Nov 2016 15:41:54 +0200 Subject: [PATCH] fix trace_match behavior for when there is only one grok pattern (#21413) There is an issue in the Grok Processor, where trace_match: true does not inject the _ingest._grok_match_index into the ingest-document when there is just one pattern provided. This is due to an optimization in the regex construction. This commit adds a check for when this is the case, and injects a static index value of "0", since there is only one pattern matched (at the first index into the patterns). To make this clearer, more documentation was added to the grok-processor docs. Fixes #21371. --- docs/reference/ingest/ingest-node.asciidoc | 137 +++++++++++++++++- .../ingest/common/GrokProcessor.java | 20 ++- .../ingest/common/GrokProcessorTests.java | 13 ++ 3 files changed, 163 insertions(+), 7 deletions(-) diff --git a/docs/reference/ingest/ingest-node.asciidoc b/docs/reference/ingest/ingest-node.asciidoc index 9f11caed1ac..1ac1134f779 100644 --- a/docs/reference/ingest/ingest-node.asciidoc +++ b/docs/reference/ingest/ingest-node.asciidoc @@ -931,14 +931,14 @@ and the result: "date1" : "2016-04-25T12:02:01.789Z" }, "_ingest" : { - "timestamp" : "2016-08-11T12:00:01.222Z" + "timestamp" : "2016-11-08T19:43:03.850+0000" } } } ] } -------------------------------------------------- -// TESTRESPONSE[s/2016-08-11T12:00:01.222Z/$body.docs.0.doc._ingest.timestamp/] +// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/] The above example shows that `_index` was set to ``. Elasticsearch understands this to mean `2016-04-01` as is explained in the <> @@ -1278,6 +1278,139 @@ Here is an example of a pipeline specifying custom pattern definitions: } -------------------------------------------------- +[[trace-match]] +==== Providing Multiple Match Patterns + +Sometimes one pattern is not enough to capture the potential structure of a field. Let's assume we +want to match all messages that contain your favorite pet breeds of either cats or dogs. One way to accomplish +this is to provide two distinct patterns that can be matched, instead of one really complicated expression capturing +the same `or` behavior. + +Here is an example of such a configuration executed against the simulate API: + +[source,js] +-------------------------------------------------- +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "description" : "parse multiple patterns", + "processors": [ + { + "grok": { + "field": "message", + "patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"], + "pattern_definitions" : { + "FAVORITE_DOG" : "beagle", + "FAVORITE_CAT" : "burmese" + } + } + } + ] +}, +"docs":[ + { + "_source": { + "message": "I love burmese cats!" + } + } + ] +} +-------------------------------------------------- +// CONSOLE + +response: + +[source,js] +-------------------------------------------------- +{ + "docs": [ + { + "doc": { + "_type": "_type", + "_index": "_index", + "_id": "_id", + "_source": { + "message": "I love burmese cats!", + "pet": "burmese" + }, + "_ingest": { + "timestamp": "2016-11-08T19:43:03.850+0000" + } + } + } + ] +} +-------------------------------------------------- +// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/] + +Both patterns will set the field `pet` with the appropriate match, but what if we want to trace which of our +patterns matched and populated our fields? We can do this with the `trace_match` parameter. Here is the output of +that same pipeline, but with `"trace_match": true` configured: + +//// +Hidden setup for example: +[source,js] +-------------------------------------------------- +POST _ingest/pipeline/_simulate +{ + "pipeline": { + "description" : "parse multiple patterns", + "processors": [ + { + "grok": { + "field": "message", + "patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"], + "trace_match": true, + "pattern_definitions" : { + "FAVORITE_DOG" : "beagle", + "FAVORITE_CAT" : "burmese" + } + } + } + ] +}, +"docs":[ + { + "_source": { + "message": "I love burmese cats!" + } + } + ] +} +-------------------------------------------------- +// CONSOLE +//// + +[source,js] +-------------------------------------------------- +{ + "docs": [ + { + "doc": { + "_type": "_type", + "_index": "_index", + "_id": "_id", + "_source": { + "message": "I love burmese cats!", + "pet": "burmese" + }, + "_ingest": { + "_grok_match_index": "1", + "timestamp": "2016-11-08T19:43:03.850+0000" + } + } + } + ] +} +-------------------------------------------------- +// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/] + +In the above response, you can see that the index of the pattern that matched was `"1"`. This is to say that it was the +second (index starts at zero) pattern in `patterns` to match. + +This trace metadata enables debugging which of the patterns matched. This information is stored in the ingest +metadata and will not be indexed. + [[gsub-processor]] === Gsub Processor Converts a string field by applying a regular expression and a replacement. diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/GrokProcessor.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/GrokProcessor.java index 4a4432a9bb2..22992a04076 100644 --- a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/GrokProcessor.java +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/GrokProcessor.java @@ -37,6 +37,7 @@ public final class GrokProcessor extends AbstractProcessor { private static final String PATTERN_MATCH_KEY = "_ingest._grok_match_index"; private final String matchField; + private final List matchPatterns; private final Grok grok; private final boolean traceMatch; private final boolean ignoreMissing; @@ -45,6 +46,7 @@ public final class GrokProcessor extends AbstractProcessor { boolean traceMatch, boolean ignoreMissing) { super(tag); this.matchField = matchField; + this.matchPatterns = matchPatterns; this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch)); this.traceMatch = traceMatch; this.ignoreMissing = ignoreMissing; @@ -79,11 +81,15 @@ public final class GrokProcessor extends AbstractProcessor { .forEach((e) -> ingestDocument.setFieldValue(e.getKey(), e.getValue())); if (traceMatch) { - @SuppressWarnings("unchecked") - HashMap matchMap = (HashMap) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class); - matchMap.keySet().stream().findFirst().ifPresent((index) -> { - ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index); - }); + if (matchPatterns.size() > 1) { + @SuppressWarnings("unchecked") + HashMap matchMap = (HashMap) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class); + matchMap.keySet().stream().findFirst().ifPresent((index) -> { + ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index); + }); + } else { + ingestDocument.setFieldValue(PATTERN_MATCH_KEY, "0"); + } } } @@ -104,6 +110,10 @@ public final class GrokProcessor extends AbstractProcessor { return matchField; } + List getMatchPatterns() { + return matchPatterns; + } + static String combinePatterns(List patterns, boolean traceMatch) { String combinedPattern; if (patterns.size() > 1) { diff --git a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/GrokProcessorTests.java b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/GrokProcessorTests.java index ce1507d8b61..25cdb91387d 100644 --- a/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/GrokProcessorTests.java +++ b/modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/GrokProcessorTests.java @@ -158,6 +158,19 @@ public class GrokProcessorTests extends ESTestCase { assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("1")); } + public void testTraceWithOnePattern() throws Exception { + String fieldName = RandomDocumentPicks.randomFieldName(random()); + IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>()); + doc.setFieldValue(fieldName, "first1"); + Map patternBank = new HashMap<>(); + patternBank.put("ONE", "1"); + GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank, + Arrays.asList("%{ONE:one}"), fieldName, true, false); + processor.execute(doc); + assertThat(doc.hasField("one"), equalTo(true)); + assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("0")); + } + public void testCombinedPatterns() { String combined; combined = GrokProcessor.combinePatterns(Arrays.asList(""), false);