Extract capture config from grok patterns up front (backport of #62706) (#62785)

This extracts the configuration for extracting values from a groked
string when building the grok expression to do two things:
1. Create a method exposing that configuration on `Grok` itself which
   will be used by `grok` flavored runtime fields.
2. Marginally speed up extracting grok values by skipping a little
   string manipulation.
This commit is contained in:
Nik Everett 2020-09-22 17:44:42 -04:00 committed by GitHub
parent fa13585fae
commit 7ffea4621d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 289 additions and 82 deletions

View File

@ -43,6 +43,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.function.Consumer;
import static java.util.Collections.unmodifiableList;
public final class Grok {
/**
* Patterns built in to the grok library.
@ -72,6 +74,7 @@ public final class Grok {
private final boolean namedCaptures;
private final Regex compiledExpression;
private final MatcherWatchdog matcherWatchdog;
private final List<GrokCaptureConfig> captureConfig;
public Grok(Map<String, String> patternBank, String grokPattern, Consumer<String> logCallBack) {
this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
@ -101,6 +104,12 @@ public final class Grok {
byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
this.compiledExpression = new Regex(expressionBytes, 0, expressionBytes.length, Option.DEFAULT, UTF8Encoding.INSTANCE,
message -> logCallBack.accept(message));
List<GrokCaptureConfig> captureConfig = new ArrayList<>();
for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
captureConfig.add(new GrokCaptureConfig(entry.next()));
}
this.captureConfig = unmodifiableList(captureConfig);
}
/**
@ -146,7 +155,7 @@ public final class Grok {
}
}
public String groupMatch(String name, Region region, String pattern) {
private String groupMatch(String name, Region region, String pattern) {
try {
int number = GROK_PATTERN_REGEX.nameToBackrefNumber(name.getBytes(StandardCharsets.UTF_8), 0,
name.getBytes(StandardCharsets.UTF_8).length, region);
@ -165,7 +174,7 @@ public final class Grok {
*
* @return named regex expression
*/
public String toRegex(String grokPattern) {
protected String toRegex(String grokPattern) {
StringBuilder res = new StringBuilder();
for (int i = 0; i < MAX_TO_REGEX_ITERATIONS; i++) {
byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8);
@ -255,19 +264,12 @@ public final class Grok {
// TODO: I think we should throw an error here?
return null;
} else if (compiledExpression.numberOfNames() > 0) {
Map<String, Object> fields = new HashMap<>();
Map<String, Object> fields = new HashMap<>(captureConfig.size());
Region region = matcher.getEagerRegion();
for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
NameEntry e = entry.next();
String groupName = new String(e.name, e.nameP, e.nameEnd - e.nameP, StandardCharsets.UTF_8);
for (int number : e.getBackRefs()) {
if (region.beg[number] >= 0) {
String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
StandardCharsets.UTF_8);
GrokMatchGroup match = new GrokMatchGroup(groupName, matchValue);
fields.put(match.getName(), match.getValue());
break;
}
for (GrokCaptureConfig config: captureConfig) {
Object v = config.extract(textAsBytes, region);
if (v != null) {
fields.put(config.name(), v);
}
}
return fields;
@ -276,6 +278,13 @@ public final class Grok {
}
}
/**
 * The capture configuration that was extracted from the compiled
 * expression when this {@linkplain Grok} was built.
 *
 * @return an unmodifiable list with one {@link GrokCaptureConfig} per
 *         named back reference entry of the compiled expression
 */
public List<GrokCaptureConfig> captureConfig() {
    return this.captureConfig;
}
/**
* Load built-in patterns.
*/

View File

@ -0,0 +1,67 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.grok;
import org.joni.NameEntry;
import org.joni.Region;
import java.nio.charset.StandardCharsets;
/**
 * Configuration for a value that {@link Grok} can capture.
 * <p>
 * The group name handed over by the regex engine is split on {@code ':'}.
 * When there are at least two segments the second one is the field name,
 * otherwise the first one is. A type is only read when there are exactly
 * three segments; in every other case the capture is a
 * {@link GrokCaptureType#STRING}.
 */
public final class GrokCaptureConfig {
    private final String name;
    private final GrokCaptureType type;
    private final int[] backRefs;

    /**
     * Build the configuration from a named group of the compiled expression.
     *
     * @param nameEntry the named group; its name bytes are decoded as UTF-8
     */
    GrokCaptureConfig(NameEntry nameEntry) {
        int nameLength = nameEntry.nameEnd - nameEntry.nameP;
        String[] segments = new String(nameEntry.name, nameEntry.nameP, nameLength, StandardCharsets.UTF_8).split(":");
        if (segments.length >= 2) {
            this.name = segments[1];
        } else {
            this.name = segments[0];
        }
        if (segments.length == 3) {
            this.type = GrokCaptureType.fromString(segments[2]);
        } else {
            this.type = GrokCaptureType.STRING;
        }
        this.backRefs = nameEntry.getBackRefs();
    }

    /**
     * The name defined for the field in the pattern.
     */
    public String name() {
        return this.name;
    }

    /**
     * The type defined for the field in the pattern.
     */
    public GrokCaptureType type() {
        return this.type;
    }

    /**
     * Read this capture's value out of a match.
     * <p>
     * The back references are tried in order and the first one whose start
     * offset in {@code region} is not negative wins (presumably a negative
     * offset means the group did not take part in the match).
     *
     * @param textAsBytes the UTF-8 bytes of the text that was matched
     * @param region the offsets of every group for the match
     * @return the captured text converted by {@link #type()}, or
     *         {@code null} if no back reference has a non-negative start
     */
    Object extract(byte[] textAsBytes, Region region) {
        for (int ref : backRefs) {
            int start = region.beg[ref];
            if (start < 0) {
                continue;
            }
            int length = region.end[ref] - start;
            String captured = new String(textAsBytes, start, length, StandardCharsets.UTF_8);
            return type.parse(captured);
        }
        return null;
    }
}

View File

@ -0,0 +1,90 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.grok;
/**
 * The type defined for the field in the pattern.
 * <p>
 * Every constant knows how to turn the text captured for a field into the
 * Java value of that type.
 */
public enum GrokCaptureType {
    /** Keeps the captured text as it is. */
    STRING {
        @Override
        protected Object parseValue(String text) {
            return text;
        }
    },
    /** Reads the captured text as an {@link Integer}. */
    INTEGER {
        @Override
        protected Object parseValue(String text) {
            return Integer.valueOf(text);
        }
    },
    /** Reads the captured text as a {@link Long}. */
    LONG {
        @Override
        protected Object parseValue(String text) {
            return Long.valueOf(text);
        }
    },
    /** Reads the captured text as a {@link Double}. */
    DOUBLE {
        @Override
        protected Object parseValue(String text) {
            return Double.valueOf(text);
        }
    },
    /** Reads the captured text as a {@link Float}. */
    FLOAT {
        @Override
        protected Object parseValue(String text) {
            return Float.valueOf(text);
        }
    },
    /** Reads the captured text as a {@link Boolean}. */
    BOOLEAN {
        @Override
        protected Object parseValue(String text) {
            return Boolean.valueOf(text);
        }
    };

    /**
     * Convert captured text into a value of this type.
     *
     * @param str the captured text, may be {@code null}
     * @return {@code null} when {@code str} is {@code null}, otherwise the
     *         result of {@link #parseValue(String)}
     */
    final Object parse(String str) {
        return str == null ? null : parseValue(str);
    }

    /**
     * Convert non-null captured text into a value of this type.
     *
     * @param str the captured text, never {@code null} when called from
     *            {@link #parse(String)}
     */
    protected abstract Object parseValue(String str);

    /**
     * Look up the type for the name used in a grok pattern.
     *
     * @param str the type name as written in the pattern
     * @return the matching type; any name that is not recognized maps to
     *         {@link #STRING}
     */
    static GrokCaptureType fromString(String str) {
        if (str.equals("int")) {
            return INTEGER;
        }
        if (str.equals("long")) {
            return LONG;
        }
        if (str.equals("double")) {
            return DOUBLE;
        }
        if (str.equals("float")) {
            return FLOAT;
        }
        if (str.equals("boolean")) {
            return BOOLEAN;
        }
        // "string" and every unknown name end up here
        return STRING;
    }
}

View File

@ -1,68 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.grok;
/**
 * A value matched by a named group of a grok expression.
 * <p>
 * The group name is split on {@code ':'}. The first segment is the pattern
 * name, an optional second segment is the field name, and a type is only
 * read when there are exactly three segments. In every other case the type
 * is {@code "string"}.
 */
final class GrokMatchGroup {
    private static final String DEFAULT_TYPE = "string";
    private final String patternName;
    private final String fieldName;
    private final String type;
    private final String groupValue;

    /**
     * @param groupName the name of the group as it appears in the expression
     * @param groupValue the text matched by the group, may be {@code null}
     */
    GrokMatchGroup(String groupName, String groupValue) {
        String[] segments = groupName.split(":");
        this.patternName = segments[0];
        this.fieldName = segments.length >= 2 ? segments[1] : null;
        this.type = segments.length == 3 ? segments[2] : DEFAULT_TYPE;
        this.groupValue = groupValue;
    }

    /**
     * The field name if the group defined one, the pattern name otherwise.
     */
    public String getName() {
        if (fieldName != null) {
            return fieldName;
        }
        return patternName;
    }

    /**
     * The matched text converted to the group's type.
     *
     * @return {@code null} if there is no matched text; a boxed number or
     *         boolean for the types {@code int}, {@code long},
     *         {@code double}, {@code float} and {@code boolean}; the
     *         matched text itself for any other type
     */
    public Object getValue() {
        if (groupValue == null) {
            return null;
        }
        if ("int".equals(type)) {
            return Integer.valueOf(groupValue);
        }
        if ("long".equals(type)) {
            return Long.valueOf(groupValue);
        }
        if ("double".equals(type)) {
            return Double.valueOf(groupValue);
        }
        if ("float".equals(type)) {
            return Float.valueOf(groupValue);
        }
        if ("boolean".equals(type)) {
            return Boolean.valueOf(groupValue);
        }
        return groupValue;
    }
}

View File

@ -31,6 +31,12 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import static org.elasticsearch.grok.GrokCaptureType.BOOLEAN;
import static org.elasticsearch.grok.GrokCaptureType.DOUBLE;
import static org.elasticsearch.grok.GrokCaptureType.FLOAT;
import static org.elasticsearch.grok.GrokCaptureType.INTEGER;
import static org.elasticsearch.grok.GrokCaptureType.LONG;
import static org.elasticsearch.grok.GrokCaptureType.STRING;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
@ -53,6 +59,19 @@ public class GrokTests extends ESTestCase {
public void testSimpleSyslogLine() {
String line = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]";
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{SYSLOGLINE}", logger::warn);
assertCaptureConfig(
grok,
org.elasticsearch.common.collect.Map.ofEntries(
org.elasticsearch.common.collect.Map.entry("facility", STRING),
org.elasticsearch.common.collect.Map.entry("logsource", STRING),
org.elasticsearch.common.collect.Map.entry("message", STRING),
org.elasticsearch.common.collect.Map.entry("pid", STRING),
org.elasticsearch.common.collect.Map.entry("priority", STRING),
org.elasticsearch.common.collect.Map.entry("program", STRING),
org.elasticsearch.common.collect.Map.entry("timestamp", STRING),
org.elasticsearch.common.collect.Map.entry("timestamp8601", STRING)
)
);
Map<String, Object> matches = grok.captures(line);
assertEquals("evita", matches.get("logsource"));
assertEquals("Mar 16 00:01:25", matches.get("timestamp"));
@ -65,6 +84,20 @@ public class GrokTests extends ESTestCase {
String line = "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - [id1 foo=\\\"bar\\\"][id2 baz=\\\"something\\\"] " +
"Hello, syslog.";
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{SYSLOG5424LINE}", logger::warn);
assertCaptureConfig(
grok,
org.elasticsearch.common.collect.Map.ofEntries(
org.elasticsearch.common.collect.Map.entry("syslog5424_app", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_host", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_msg", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_msgid", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_pri", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_proc", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_sd", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_ts", STRING),
org.elasticsearch.common.collect.Map.entry("syslog5424_ver", STRING)
)
);
Map<String, Object> matches = grok.captures(line);
assertEquals("191", matches.get("syslog5424_pri"));
assertEquals("1", matches.get("syslog5424_ver"));
@ -80,12 +113,14 @@ public class GrokTests extends ESTestCase {
/**
 * An inline named group wrapping two bank patterns is exposed as a single
 * string capture and returns the text matched by the whole group.
 */
public void testDatePattern() {
    Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "(?<timestamp>%{DATE_EU} %{TIME})", logger::warn);
    assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("timestamp", STRING));

    Map<String, Object> captured = grok.captures("fancy 12-12-12 12:12:12");
    assertEquals("12-12-12 12:12:12", captured.get("timestamp"));
}
public void testNilCoercedValues() {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "test (N/A|%{BASE10NUM:duration:float}ms)", logger::warn);
assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("duration", FLOAT));
Map<String, Object> matches = grok.captures("test 28.4ms");
assertEquals(28.4f, matches.get("duration"));
matches = grok.captures("test N/A");
@ -94,6 +129,7 @@ public class GrokTests extends ESTestCase {
public void testNilWithNoCoercion() {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "test (N/A|%{BASE10NUM:duration}ms)", logger::warn);
assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("duration", STRING));
Map<String, Object> matches = grok.captures("test 28.4ms");
assertEquals("28.4", matches.get("duration"));
matches = grok.captures("test N/A");
@ -104,6 +140,17 @@ public class GrokTests extends ESTestCase {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "<%{POSINT:syslog_pri}>%{SPACE}%{SYSLOGTIMESTAMP:syslog_timestamp} " +
"%{SYSLOGHOST:syslog_hostname} %{PROG:syslog_program}(:?)(?:\\[%{GREEDYDATA:syslog_pid}\\])?(:?) " +
"%{GREEDYDATA:syslog_message}", logger::warn);
assertCaptureConfig(
grok,
org.elasticsearch.common.collect.Map.ofEntries(
org.elasticsearch.common.collect.Map.entry("syslog_hostname", STRING),
org.elasticsearch.common.collect.Map.entry("syslog_message", STRING),
org.elasticsearch.common.collect.Map.entry("syslog_pid", STRING),
org.elasticsearch.common.collect.Map.entry("syslog_pri", STRING),
org.elasticsearch.common.collect.Map.entry("syslog_program", STRING),
org.elasticsearch.common.collect.Map.entry("syslog_timestamp", STRING)
)
);
Map<String, Object> matches = grok.captures("<22>Jan 4 07:50:46 mailmaster postfix/policy-spf[9454]: : " +
"SPF permerror (Junk encountered in record 'v=spf1 mx a:mail.domain.no ip4:192.168.0.4 <20>all'): Envelope-from: " +
"email@domain.no");
@ -114,18 +161,21 @@ public class GrokTests extends ESTestCase {
/**
 * A named field whose pattern matches the entire input captures the whole
 * input.
 */
public void testNamedFieldsWithWholeTextMatch() {
    String wholeText = "11/01/01";
    Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{DATE_EU:stimestamp}", logger::warn);
    assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("stimestamp", STRING));

    Map<String, Object> captured = grok.captures(wholeText);
    assertThat(captured.get("stimestamp"), equalTo(wholeText));
}
/**
 * A plain regex named group (no grok pattern reference) is reported as a
 * string capture and captures the first word of the input.
 */
public void testWithOniguramaNamedCaptures() {
    Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "(?<foo>\\w+)", logger::warn);
    assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("foo", STRING));

    Object captured = grok.captures("hello world").get("foo");
    assertThat(captured, equalTo("hello"));
}
public void testISO8601() {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "^%{TIMESTAMP_ISO8601}$", logger::warn);
assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of());
List<String> timeMessages = Arrays.asList(
"2001-01-01T00:00:00",
"1974-03-02T04:09:09",
@ -150,6 +200,7 @@ public class GrokTests extends ESTestCase {
public void testNotISO8601() {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "^%{TIMESTAMP_ISO8601}$", logger::warn);
assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of());
List<String> timeMessages = Arrays.asList(
"2001-13-01T00:00:00", // invalid month
"2001-00-01T00:00:00", // invalid month
@ -189,6 +240,7 @@ public class GrokTests extends ESTestCase {
String text = "wowza !!!Tal!!! - Tal";
String pattern = "%{EXCITED_NAME} - %{NAME}";
Grok g = new Grok(bank, pattern, false, logger::warn);
assertCaptureConfig(g, org.elasticsearch.common.collect.Map.of("EXCITED_NAME_0", STRING, "NAME_21", STRING, "NAME_22", STRING));
assertEquals("(?<EXCITED_NAME_0>!!!(?<NAME_21>Tal)!!!) - (?<NAME_22>Tal)", g.toRegex(pattern));
assertEquals(true, g.match(text));
@ -263,6 +315,7 @@ public class GrokTests extends ESTestCase {
public void testBooleanCaptures() {
String pattern = "%{WORD:name}=%{WORD:status:boolean}";
Grok g = new Grok(Grok.BUILTIN_PATTERNS, pattern, logger::warn);
assertCaptureConfig(g, org.elasticsearch.common.collect.Map.of("name", STRING, "status", BOOLEAN));
String text = "active=true";
Map<String, Object> expected = new HashMap<>();
@ -280,6 +333,7 @@ public class GrokTests extends ESTestCase {
String pattern = "%{NUMBER:bytes:float} %{NUMBER:id:long} %{NUMBER:rating:double}";
Grok g = new Grok(bank, pattern, logger::warn);
assertCaptureConfig(g, org.elasticsearch.common.collect.Map.of("bytes", FLOAT, "id", LONG, "rating", DOUBLE));
String text = "12009.34 20000000000 4820.092";
Map<String, Object> expected = new HashMap<>();
@ -298,6 +352,7 @@ public class GrokTests extends ESTestCase {
String pattern = "%{NUMBER:bytes:float} %{NUMBER:status} %{NUMBER}";
Grok g = new Grok(bank, pattern, logger::warn);
assertCaptureConfig(g, org.elasticsearch.common.collect.Map.of("bytes", FLOAT, "status", STRING));
String text = "12009.34 200 9032";
Map<String, Object> expected = new HashMap<>();
@ -308,11 +363,39 @@ public class GrokTests extends ESTestCase {
assertEquals(expected, actual);
}
/**
 * A type name that is not one of the known types is treated as a string,
 * both in the capture configuration and in the captured value.
 */
public void testGarbageTypeNameBecomesString() {
    Map<String, String> patterns = new HashMap<>();
    patterns.put("BASE10NUM", "(?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\\.[0-9]+)?)|(?:\\.[0-9]+)))");
    patterns.put("NUMBER", "(?:%{BASE10NUM})");

    Grok grok = new Grok(patterns, "%{NUMBER:f:not_a_valid_type}", logger::warn);
    assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("f", STRING));

    Map<String, Object> captured = grok.captures("12009.34");
    assertThat(captured, equalTo(org.elasticsearch.common.collect.Map.of("f", "12009.34")));
}
public void testApacheLog() {
String logLine = "31.184.238.164 - - [24/Jul/2014:05:35:37 +0530] \"GET /logs/access.log HTTP/1.0\" 200 69849 " +
"\"http://8rursodiol.enjin.com\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " +
"Chrome/30.0.1599.12785 YaBrowser/13.12.1599.12785 Safari/537.36\" \"www.dlwindianrailways.com\"";
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{COMBINEDAPACHELOG}", logger::warn);
assertCaptureConfig(
grok,
org.elasticsearch.common.collect.Map.ofEntries(
org.elasticsearch.common.collect.Map.entry("agent", STRING),
org.elasticsearch.common.collect.Map.entry("auth", STRING),
org.elasticsearch.common.collect.Map.entry("bytes", STRING),
org.elasticsearch.common.collect.Map.entry("clientip", STRING),
org.elasticsearch.common.collect.Map.entry("httpversion", STRING),
org.elasticsearch.common.collect.Map.entry("ident", STRING),
org.elasticsearch.common.collect.Map.entry("rawrequest", STRING),
org.elasticsearch.common.collect.Map.entry("referrer", STRING),
org.elasticsearch.common.collect.Map.entry("request", STRING),
org.elasticsearch.common.collect.Map.entry("response", STRING),
org.elasticsearch.common.collect.Map.entry("timestamp", STRING),
org.elasticsearch.common.collect.Map.entry("verb", STRING)
)
);
Map<String, Object> matches = grok.captures(logLine);
assertEquals("31.184.238.164", matches.get("clientip"));
@ -373,6 +456,22 @@ public class GrokTests extends ESTestCase {
"HTTP/%{NUMBER:httpversion}\" %{NUMBER:response:int} (?:-|%{NUMBER:bytes:int}) %{QS:referrer} %{QS:agent}";
Grok grok = new Grok(bank, pattern, logger::warn);
assertCaptureConfig(
grok,
org.elasticsearch.common.collect.Map.ofEntries(
org.elasticsearch.common.collect.Map.entry("agent", STRING),
org.elasticsearch.common.collect.Map.entry("auth", STRING),
org.elasticsearch.common.collect.Map.entry("bytes", INTEGER),
org.elasticsearch.common.collect.Map.entry("clientip", STRING),
org.elasticsearch.common.collect.Map.entry("httpversion", STRING),
org.elasticsearch.common.collect.Map.entry("ident", STRING),
org.elasticsearch.common.collect.Map.entry("referrer", STRING),
org.elasticsearch.common.collect.Map.entry("request", STRING),
org.elasticsearch.common.collect.Map.entry("response", INTEGER),
org.elasticsearch.common.collect.Map.entry("timestamp", STRING),
org.elasticsearch.common.collect.Map.entry("verb", STRING)
)
);
Map<String, Object> expected = new HashMap<>();
expected.put("clientip", "83.149.9.216");
@ -404,6 +503,7 @@ public class GrokTests extends ESTestCase {
Map<String, String> bank = new HashMap<>();
bank.put("SINGLEDIGIT", "[0-9]");
Grok grok = new Grok(bank, "%{SINGLEDIGIT:num}%{SINGLEDIGIT:num}", logger::warn);
assertCaptureConfig(grok, org.elasticsearch.common.collect.Map.of("num", STRING));
Map<String, Object> expected = new HashMap<>();
expected.put("num", "1");
@ -500,4 +600,13 @@ public class GrokTests extends ESTestCase {
Map<String, Object> matches = grok.captures(line);
assertEquals(line, matches.get(fieldName));
}
/**
 * Assert that {@code grok} exposes exactly the expected captures.
 * <p>
 * Fails if the same name is reported more than once or if the names and
 * types differ from {@code nameToType}. Both sides are compared as sorted
 * maps.
 *
 * @param grok the compiled grok expression to inspect
 * @param nameToType the expected type for every expected capture name
 */
private void assertCaptureConfig(Grok grok, Map<String, GrokCaptureType> nameToType) {
    Map<String, GrokCaptureType> actual = new TreeMap<>();
    for (GrokCaptureConfig capture : grok.captureConfig()) {
        GrokCaptureType previous = actual.put(capture.name(), capture.type());
        assertThat("duplicates not allowed", previous, nullValue());
    }
    Map<String, GrokCaptureType> expected = new TreeMap<>(nameToType);
    assertThat(actual, equalTo(expected));
}
}