fix trace_match behavior for when there is only one grok pattern (#21413)

There is an issue in the Grok Processor where `trace_match: true` does not inject `_ingest._grok_match_index` into the ingest document when only one pattern is provided. This is due to an optimization in the regex construction. This commit adds a check for that case and injects a static index value of "0", since the only pattern that can have matched is the one at the first index of the patterns list.

To make this clearer, more documentation was added to the grok-processor docs.

Fixes #21371.
This commit is contained in:
Tal Levy 2016-11-16 15:41:54 +02:00 committed by GitHub
parent 6baded8e7f
commit 04b712bdc5
3 changed files with 163 additions and 7 deletions

View File

@ -931,14 +931,14 @@ and the result:
"date1" : "2016-04-25T12:02:01.789Z"
},
"_ingest" : {
"timestamp" : "2016-08-11T12:00:01.222Z"
"timestamp" : "2016-11-08T19:43:03.850+0000"
}
}
}
]
}
--------------------------------------------------
// TESTRESPONSE[s/2016-08-11T12:00:01.222Z/$body.docs.0.doc._ingest.timestamp/]
// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
The above example shows that `_index` was set to `<myindex-{2016-04-25||/M{yyyy-MM-dd|UTC}}>`. Elasticsearch
understands this to mean `2016-04-01` as is explained in the <<date-math-index-names, date math index name documentation>>
@ -1278,6 +1278,139 @@ Here is an example of a pipeline specifying custom pattern definitions:
}
--------------------------------------------------
[[trace-match]]
==== Providing Multiple Match Patterns
Sometimes one pattern is not enough to capture the potential structure of a field. Let's assume we
want to match all messages that contain our favorite pet breeds of either cats or dogs. One way to accomplish
this is to provide two distinct patterns that can be matched, instead of one really complicated expression capturing
the same `or` behavior.
Here is an example of such a configuration executed against the simulate API:
[source,js]
--------------------------------------------------
POST _ingest/pipeline/_simulate
{
"pipeline": {
"description" : "parse multiple patterns",
"processors": [
{
"grok": {
"field": "message",
"patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"],
"pattern_definitions" : {
"FAVORITE_DOG" : "beagle",
"FAVORITE_CAT" : "burmese"
}
}
}
]
},
"docs":[
{
"_source": {
"message": "I love burmese cats!"
}
}
]
}
--------------------------------------------------
// CONSOLE
response:
[source,js]
--------------------------------------------------
{
"docs": [
{
"doc": {
"_type": "_type",
"_index": "_index",
"_id": "_id",
"_source": {
"message": "I love burmese cats!",
"pet": "burmese"
},
"_ingest": {
"timestamp": "2016-11-08T19:43:03.850+0000"
}
}
}
]
}
--------------------------------------------------
// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
Both patterns will set the field `pet` with the appropriate match, but what if we want to trace which of our
patterns matched and populated our fields? We can do this with the `trace_match` parameter. Here is the output of
that same pipeline, but with `"trace_match": true` configured:
////
Hidden setup for example:
[source,js]
--------------------------------------------------
POST _ingest/pipeline/_simulate
{
"pipeline": {
"description" : "parse multiple patterns",
"processors": [
{
"grok": {
"field": "message",
"patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"],
"trace_match": true,
"pattern_definitions" : {
"FAVORITE_DOG" : "beagle",
"FAVORITE_CAT" : "burmese"
}
}
}
]
},
"docs":[
{
"_source": {
"message": "I love burmese cats!"
}
}
]
}
--------------------------------------------------
// CONSOLE
////
[source,js]
--------------------------------------------------
{
"docs": [
{
"doc": {
"_type": "_type",
"_index": "_index",
"_id": "_id",
"_source": {
"message": "I love burmese cats!",
"pet": "burmese"
},
"_ingest": {
"_grok_match_index": "1",
"timestamp": "2016-11-08T19:43:03.850+0000"
}
}
}
]
}
--------------------------------------------------
// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
In the above response, you can see that the index of the pattern that matched was `"1"`. This is to say that the
second pattern in `patterns` (indices start at zero) was the one that matched.
This trace metadata enables debugging which of the patterns matched. This information is stored in the ingest
metadata and will not be indexed.
[[gsub-processor]]
=== Gsub Processor
Converts a string field by applying a regular expression and a replacement.

View File

@ -37,6 +37,7 @@ public final class GrokProcessor extends AbstractProcessor {
// Ingest-metadata path that holds the index of the matching pattern when tracing is enabled.
private static final String PATTERN_MATCH_KEY = "_ingest._grok_match_index";
// Field name supplied to the constructor and handed back unchanged by an accessor.
private final String matchField;
// Pattern list exactly as passed to the constructor; its size decides how the trace index is derived.
private final List<String> matchPatterns;
// Built from the pattern bank and the single expression produced by combinePatterns(...).
private final Grok grok;
// When true, the processor writes a match index to PATTERN_MATCH_KEY.
private final boolean traceMatch;
// NOTE(review): presumably governs what happens when matchField is absent from the document;
// the code that reads this flag is outside the visible hunk - confirm.
private final boolean ignoreMissing;
@ -45,6 +46,7 @@ public final class GrokProcessor extends AbstractProcessor {
boolean traceMatch, boolean ignoreMissing) {
super(tag);
this.matchField = matchField;
this.matchPatterns = matchPatterns;
this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch));
this.traceMatch = traceMatch;
this.ignoreMissing = ignoreMissing;
@ -79,11 +81,15 @@ public final class GrokProcessor extends AbstractProcessor {
.forEach((e) -> ingestDocument.setFieldValue(e.getKey(), e.getValue()));
if (traceMatch) {
@SuppressWarnings("unchecked")
HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
matchMap.keySet().stream().findFirst().ifPresent((index) -> {
ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
});
if (matchPatterns.size() > 1) {
@SuppressWarnings("unchecked")
HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
matchMap.keySet().stream().findFirst().ifPresent((index) -> {
ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
});
} else {
ingestDocument.setFieldValue(PATTERN_MATCH_KEY, "0");
}
}
}
@ -104,6 +110,10 @@ public final class GrokProcessor extends AbstractProcessor {
return matchField;
}
/**
 * Exposes the grok patterns this processor was constructed with.
 *
 * @return the list stored in {@code matchPatterns}, handed back as-is (no copy is made)
 */
List<String> getMatchPatterns() {
    return this.matchPatterns;
}
static String combinePatterns(List<String> patterns, boolean traceMatch) {
String combinedPattern;
if (patterns.size() > 1) {

View File

@ -158,6 +158,19 @@ public class GrokProcessorTests extends ESTestCase {
assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("1"));
}
/**
 * Checks that a processor configured with a single pattern and tracing enabled still records
 * a match index in the ingest metadata, and that the recorded index is {@code "0"}.
 */
public void testTraceWithOnePattern() throws Exception {
    // Random draws are kept in the same order as before: field name, document, then tag.
    final String targetField = RandomDocumentPicks.randomFieldName(random());
    final IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
    ingestDocument.setFieldValue(targetField, "first1");

    final Map<String, String> bank = new HashMap<>();
    bank.put("ONE", "1");

    final String tag = randomAsciiOfLength(10);
    final GrokProcessor singlePatternProcessor =
        new GrokProcessor(tag, bank, Arrays.asList("%{ONE:one}"), targetField, true, false);
    singlePatternProcessor.execute(ingestDocument);

    // The capture must be present before the trace metadata is inspected.
    assertThat(ingestDocument.hasField("one"), equalTo(true));
    final String recordedIndex = ingestDocument.getFieldValue("_ingest._grok_match_index", String.class);
    assertThat(recordedIndex, equalTo("0"));
}
public void testCombinedPatterns() {
String combined;
combined = GrokProcessor.combinePatterns(Arrays.asList(""), false);