Grok: "native" results (backport of #62843) (#62886)

This adds the ability to fetch Java primitives like `long` and `float`
from grok matches rather than their boxed versions. It also allows
customizing which fields are extracted and how they are extracted.
By default we continue to fetch a `Map<String, Object>`, but runtime
fields will be able to catch *just* the fields they are interested
in, and the values will be primitives.
Nik Everett 2020-09-24 11:47:13 -04:00 committed by GitHub
parent 3590a77b2b
commit 719a76e4bd
6 changed files with 365 additions and 63 deletions
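
To make the new flow concrete before the diffs, here is a minimal sketch (not part of this commit) of the intended use: build an extracter for a single float-typed capture and drive Grok#match with it. It assumes the constructor and methods visible below (Grok#captureConfig, GrokCaptureConfig#nativeExtracter, Grok#match) and, like the tests, lives in the org.elasticsearch.grok package; the class name, pattern, field name, and input are illustrative only.

package org.elasticsearch.grok;

import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
import java.util.function.DoubleConsumer;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;

public class NativeCaptureExample {
    public static void main(String[] args) {
        Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{NUMBER:bytes:float}", warning -> {});

        // Build an extracter for just the "bytes" capture; forFloat hands us the
        // value as a primitive float, the other handlers reject unexpected types.
        float[] bytes = new float[1];
        GrokCaptureConfig config = grok.captureConfig().stream()
            .filter(c -> c.name().equals("bytes"))
            .findFirst()
            .get();
        GrokCaptureExtracter extracter = config.nativeExtracter(
            new GrokCaptureConfig.NativeExtracterMap<GrokCaptureExtracter>() {
                @Override
                public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> build) {
                    throw new IllegalArgumentException("expected a float capture");
                }
                @Override
                public GrokCaptureExtracter forInt(Function<IntConsumer, GrokCaptureExtracter> build) {
                    throw new IllegalArgumentException("expected a float capture");
                }
                @Override
                public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> build) {
                    throw new IllegalArgumentException("expected a float capture");
                }
                @Override
                public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> build) {
                    return build.apply(f -> bytes[0] = f); // primitive float, no boxing
                }
                @Override
                public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> build) {
                    throw new IllegalArgumentException("expected a float capture");
                }
                @Override
                public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> build) {
                    throw new IllegalArgumentException("expected a float capture");
                }
            });

        byte[] utf8 = "12009.34".getBytes(StandardCharsets.UTF_8);
        if (grok.match(utf8, 0, utf8.length, extracter)) {
            System.out.println(bytes[0]); // 12009.34 as a primitive float
        }
    }
}

Exactly one of the for* handlers is invoked, chosen by the capture's declared type, so the unused handlers can simply throw; the tests below take the same approach with ThrowingNativeExtracterMap.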

View File

@@ -0,0 +1,33 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.grok;
import java.util.function.Consumer;
/**
* Primitive {@link Consumer} for floats.
*/
@FunctionalInterface
public interface FloatConsumer {
/**
* Consumes the {@code value}.
*/
void accept(float value);
}

View File

@@ -35,7 +35,6 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -244,21 +243,27 @@ public final class Grok {
* Matches and returns any named captures.
*
* @param text the text to match and extract values from.
* @return a map containing field names and their respective coerced values that matched.
* @return a map containing field names and their respective coerced values that matched, or null if the pattern didn't match
*/
public Map<String, Object> captures(String text) {
byte[] utf8Bytes = text.getBytes(StandardCharsets.UTF_8);
return captures(utf8Bytes, 0, utf8Bytes.length);
GrokCaptureExtracter.MapExtracter extracter = new GrokCaptureExtracter.MapExtracter(captureConfig);
if (match(utf8Bytes, 0, utf8Bytes.length, extracter)) {
return extracter.result();
}
return null;
}
/**
* Matches and returns any named captures.
* Matches and collects any named captures.
* @param utf8Bytes array containing the text to match against encoded in utf-8
* @param offset offset within {@code utf8Bytes} of the start of the text
* @param length length of the text to match
* @return a map containing field names and their respective coerced values that matched.
* @param extracter collector for captures. {@link GrokCaptureConfig#nativeExtracter} can build these.
* @return true if there was a match, false otherwise
* @throws RuntimeException if there was a timeout
*/
public Map<String, Object> captures(byte[] utf8Bytes, int offset, int length) {
public boolean match(byte[] utf8Bytes, int offset, int length, GrokCaptureExtracter extracter) {
Matcher matcher = compiledExpression.matcher(utf8Bytes, offset, offset + length);
int result;
try {
@@ -270,22 +275,12 @@ public final class Grok {
if (result == Matcher.INTERRUPTED) {
throw new RuntimeException("grok pattern matching was interrupted after [" +
matcherWatchdog.maxExecutionTimeInMillis() + "] ms");
} else if (result == Matcher.FAILED) {
// TODO: I think we should throw an error here?
return null;
} else if (compiledExpression.numberOfNames() > 0) {
Map<String, Object> fields = new HashMap<>(captureConfig.size());
Region region = matcher.getEagerRegion();
for (GrokCaptureConfig config: captureConfig) {
Object v = config.extract(utf8Bytes, offset, region);
if (v != null) {
fields.put(config.name(), v);
}
}
return fields;
} else {
return Collections.emptyMap();
}
if (result == Matcher.FAILED) {
return false;
}
extracter.extract(utf8Bytes, offset, matcher.getEagerRegion());
return true;
}
/**

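For callers of the plain map-based API the visible change is small: captures(String) returns null when the pattern does not match (now documented on the method), and the old captures(byte[], int, int) entry point is replaced by match(byte[], int, int, GrokCaptureExtracter), which reports success as a boolean and leaves collection to the extracter. A small sketch of that contract, assuming same-package access as the tests use (MapExtracter and captureConfig() are package visible); the class name, pattern, and inputs are illustrative only.

package org.elasticsearch.grok;

import java.nio.charset.StandardCharsets;
import java.util.Map;

public class MatchContractExample {
    public static void main(String[] args) {
        Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{NUMBER:n:int}", warning -> {});

        // No match, so null - the behaviour the updated javadoc now documents.
        Map<String, Object> fields = grok.captures("no digits here");
        System.out.println(fields); // null

        // The byte-array path: match(...) returns true or false and the extracter
        // collects whatever it was built to collect.
        byte[] utf8 = "42".getBytes(StandardCharsets.UTF_8);
        GrokCaptureExtracter.MapExtracter extracter =
            new GrokCaptureExtracter.MapExtracter(grok.captureConfig());
        if (grok.match(utf8, 0, utf8.length, extracter)) {
            System.out.println(extracter.result()); // {n=42}
        }
    }
}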
View File

@@ -20,9 +20,13 @@
package org.elasticsearch.grok;
import org.joni.NameEntry;
import org.joni.Region;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
import java.util.function.DoubleConsumer;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
/**
* Configuration for a value that {@link Grok} can capture.
@@ -50,18 +54,105 @@ public final class GrokCaptureConfig {
/**
* The type defined for the field in the pattern.
*/
public GrokCaptureType type() {
GrokCaptureType type() {
return type;
}
Object extract(byte[] utf8Bytes, int offset, Region region) {
for (int number : backRefs) {
if (region.beg[number] >= 0) {
String matchValue = new String(utf8Bytes, offset + region.beg[number], region.end[number] - region.beg[number],
StandardCharsets.UTF_8);
return type.parse(matchValue);
/**
* Build a {@linkplain GrokCaptureExtracter} that will call {@code emit} when
* it extracts text, boxed if the "native" representation is a primitive type.
* Extracters returned from this method are stateless and can be reused.
*/
public GrokCaptureExtracter objectExtracter(Consumer<Object> emit) {
// We could probably write this code a little more concisely but this makes it clear where we are boxing
return nativeExtracter(new NativeExtracterMap<GrokCaptureExtracter>() {
@Override
public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(str -> emit.accept(str));
}
}
return null;
@Override
public GrokCaptureExtracter forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(i -> emit.accept(Integer.valueOf(i)));
}
@Override
public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(l -> emit.accept(Long.valueOf(l)));
}
@Override
public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(f -> emit.accept(Float.valueOf(f)));
}
@Override
public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(d -> emit.accept(Double.valueOf(d)));
}
@Override
public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(b -> emit.accept(b));
}
});
}
/**
* Build an extracter that has access to the "native" type of the
* match. This means that patterns like {@code %{NUMBER:bytes:float}} have
* access to an actual {@code float}. Extracters returned from this method
* should be stateless and can be reused. Pathological implementations
* of the {@code map} parameter could violate this, but the caller should
* take care to stay sane.
* <p>
* While the goal is to produce a {@link GrokCaptureExtracter} that provides
* a primitive, the caller can produce whatever type-safe constructs it
* needs and return them from this method. Thus the {@code <T>} in the type
* signature.
*
* @param <T> The type of the result.
* @param map Collection of handlers for each native type. Only one method
* will be called but well behaved implementers are stateless.
* @return whatever was returned by the handler.
*/
public <T> T nativeExtracter(NativeExtracterMap<T> map) {
return type.nativeExtracter(backRefs, map);
}
/**
* Collection of handlers for each native type. Well behaved implementations
* are stateless and produce stateless results.
*/
public interface NativeExtracterMap<T> {
/**
* Called when the native type is a {@link String}.
*/
T forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter);
/**
* Called when the native type is an {@code int}.
*/
T forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter);
/**
* Called when the native type is a {@code long}.
*/
T forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter);
/**
* Called when the native type is a {@code float}.
*/
T forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter);
/**
* Called when the native type is a {@code double}.
*/
T forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter);
/**
* Called when the native type is a {@code boolean}.
*/
T forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter);
}
}
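
The {@code <T>} in nativeExtracter need not be a GrokCaptureExtracter at all; the map can also serve as a plain type-safe switch over the capture's declared type. A small sketch under the same same-package assumption as above; the helper class and its name are hypothetical.

package org.elasticsearch.grok;

import java.util.function.Consumer;
import java.util.function.DoubleConsumer;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;

class NativeTypeName {
    // Hypothetical helper: report the "native" Java type of a capture without
    // building an extracter; the build functions are simply ignored.
    static String of(GrokCaptureConfig config) {
        return config.nativeExtracter(new GrokCaptureConfig.NativeExtracterMap<String>() {
            @Override
            public String forString(Function<Consumer<String>, GrokCaptureExtracter> build) { return "String"; }
            @Override
            public String forInt(Function<IntConsumer, GrokCaptureExtracter> build) { return "int"; }
            @Override
            public String forLong(Function<LongConsumer, GrokCaptureExtracter> build) { return "long"; }
            @Override
            public String forFloat(Function<FloatConsumer, GrokCaptureExtracter> build) { return "float"; }
            @Override
            public String forDouble(Function<DoubleConsumer, GrokCaptureExtracter> build) { return "double"; }
            @Override
            public String forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> build) { return "boolean"; }
        });
    }
}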

View File

@@ -0,0 +1,64 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.grok;
import org.joni.Region;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Collections.emptyMap;
/**
* How to extract matches.
*/
public abstract class GrokCaptureExtracter {
/**
* Extract {@link Map} results. This implementation of {@link GrokCaptureExtracter}
* is mutable and should be discarded after collecting a single result.
*/
static class MapExtracter extends GrokCaptureExtracter {
private final Map<String, Object> result;
private final List<GrokCaptureExtracter> fieldExtracters;
MapExtracter(List<GrokCaptureConfig> captureConfig) {
result = captureConfig.isEmpty() ? emptyMap() : new HashMap<>();
fieldExtracters = new ArrayList<>(captureConfig.size());
for (GrokCaptureConfig config : captureConfig) {
fieldExtracters.add(config.objectExtracter(v -> result.put(config.name(), v)));
}
}
@Override
void extract(byte[] utf8Bytes, int offset, Region region) {
for (GrokCaptureExtracter extracter : fieldExtracters) {
extracter.extract(utf8Bytes, offset, region);
}
}
Map<String, Object> result() {
return result;
}
}
abstract void extract(byte[] utf8Bytes, int offset, Region region);
}

View File

@@ -19,55 +19,54 @@
package org.elasticsearch.grok;
import org.elasticsearch.grok.GrokCaptureConfig.NativeExtracterMap;
import org.joni.Region;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
/**
* The type defined for the field in the pattern.
*/
public enum GrokCaptureType {
enum GrokCaptureType {
STRING {
@Override
protected Object parseValue(String str) {
return str;
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forString(emit -> rawExtracter(backRefs, emit));
}
},
INTEGER {
@Override
protected Object parseValue(String str) {
return Integer.parseInt(str);
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forInt(emit -> rawExtracter(backRefs, str -> emit.accept(Integer.parseInt(str))));
}
},
LONG {
@Override
protected Object parseValue(String str) {
return Long.parseLong(str);
}
},
DOUBLE {
@Override
protected Object parseValue(String str) {
return Double.parseDouble(str);
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forLong(emit -> rawExtracter(backRefs, str -> emit.accept(Long.parseLong(str))));
}
},
FLOAT {
@Override
protected Object parseValue(String str) {
return Float.parseFloat(str);
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forFloat(emit -> rawExtracter(backRefs, str -> emit.accept(Float.parseFloat(str))));
}
},
DOUBLE {
@Override
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forDouble(emit -> rawExtracter(backRefs, str -> emit.accept(Double.parseDouble(str))));
}
},
BOOLEAN {
@Override
protected Object parseValue(String str) {
return Boolean.parseBoolean(str);
<T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
return map.forBoolean(emit -> rawExtracter(backRefs, str -> emit.accept(Boolean.parseBoolean(str))));
}
};
final Object parse(String str) {
if (str == null) {
return null;
}
return parseValue(str);
}
protected abstract Object parseValue(String str);
abstract <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map);
static GrokCaptureType fromString(String str) {
switch (str) {
@@ -77,14 +76,30 @@ public enum GrokCaptureType {
return INTEGER;
case "long":
return LONG;
case "double":
return DOUBLE;
case "float":
return FLOAT;
case "double":
return DOUBLE;
case "boolean":
return BOOLEAN;
default:
return STRING;
}
}
protected final GrokCaptureExtracter rawExtracter(int[] backRefs, Consumer<? super String> emit) {
return new GrokCaptureExtracter() {
@Override
void extract(byte[] utf8Bytes, int offset, Region region) {
for (int number : backRefs) {
if (region.beg[number] >= 0) {
int matchOffset = offset + region.beg[number];
int matchLength = region.end[number] - region.beg[number];
emit.accept(new String(utf8Bytes, matchOffset, matchLength, StandardCharsets.UTF_8));
return; // Capture only the first value.
}
}
}
};
}
}

View File

@@ -19,6 +19,7 @@
package org.elasticsearch.grok;
import org.elasticsearch.grok.GrokCaptureConfig.NativeExtracterMap;
import org.elasticsearch.test.ESTestCase;
import java.nio.charset.StandardCharsets;
@@ -31,6 +32,11 @@ import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.DoubleConsumer;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
import static org.elasticsearch.grok.GrokCaptureType.BOOLEAN;
import static org.elasticsearch.grok.GrokCaptureType.DOUBLE;
@@ -55,12 +61,20 @@ public class GrokTests extends ESTestCase {
public void testCaputuresBytes() {
Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{NUMBER:n:int}", logger::warn);
byte[] utf8 = "10".getBytes(StandardCharsets.UTF_8);
assertThat(grok.captures(utf8, 0, utf8.length), equalTo(org.elasticsearch.common.collect.Map.of("n", 10)));
assertThat(grok.captures(utf8, 0, 1), equalTo(org.elasticsearch.common.collect.Map.of("n", 1)));
assertThat(captureBytes(grok, utf8, 0, utf8.length), equalTo(org.elasticsearch.common.collect.Map.of("n", 10)));
assertThat(captureBytes(grok, utf8, 0, 1), equalTo(org.elasticsearch.common.collect.Map.of("n", 1)));
utf8 = "10 11 12".getBytes(StandardCharsets.UTF_8);
assertThat(grok.captures(utf8, 0, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 10)));
assertThat(grok.captures(utf8, 3, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 11)));
assertThat(grok.captures(utf8, 6, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 12)));
assertThat(captureBytes(grok, utf8, 0, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 10)));
assertThat(captureBytes(grok, utf8, 3, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 11)));
assertThat(captureBytes(grok, utf8, 6, 2), equalTo(org.elasticsearch.common.collect.Map.of("n", 12)));
}
private Map<String, Object> captureBytes(Grok grok, byte[] utf8, int offset, int length) {
GrokCaptureExtracter.MapExtracter extracter = new GrokCaptureExtracter.MapExtracter(grok.captureConfig());
if (grok.match(utf8, offset, length, extracter)) {
return extracter.result();
}
return null;
}
public void testNoMatchingPatternInDictionary() {
@@ -90,6 +104,16 @@ public class GrokTests extends ESTestCase {
assertEquals("connect from camomile.cloud9.net[168.100.1.3]", matches.get("message"));
assertEquals("postfix/smtpd", matches.get("program"));
assertEquals("1713", matches.get("pid"));
String[] logsource = new String[1];
GrokCaptureExtracter logsourceExtracter = namedConfig(grok, "logsource").nativeExtracter(new ThrowingNativeExtracterMap() {
@Override
public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(str -> logsource[0] = str);
}
});
assertThat(specificCapture(grok, line, logsourceExtracter), is(true));
assertThat(logsource[0], equalTo("evita"));
}
public void testSyslog5424Line() {
@@ -336,6 +360,16 @@ public class GrokTests extends ESTestCase {
Map<String, Object> actual = g.captures(text);
assertEquals(expected, actual);
boolean[] status = new boolean[1];
GrokCaptureExtracter statusExtracter = namedConfig(g, "status").nativeExtracter(new ThrowingNativeExtracterMap() {
@Override
public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(b -> status[0] = b);
}
});
assertThat(specificCapture(g, text, statusExtracter), is(true));
assertThat(status[0], equalTo(true));
}
public void testNumericCaptures() {
@@ -355,6 +389,35 @@ public class GrokTests extends ESTestCase {
Map<String, Object> actual = g.captures(text);
assertEquals(expected, actual);
float[] bytes = new float[1];
GrokCaptureExtracter bytesExtracter = namedConfig(g, "bytes").nativeExtracter(new ThrowingNativeExtracterMap() {
@Override
public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(f -> bytes[0] = f);
}
});
assertThat(specificCapture(g, text, bytesExtracter), is(true));
assertThat(bytes[0], equalTo(12009.34f));
long[] id = new long[1];
GrokCaptureExtracter idExtracter = namedConfig(g, "id").nativeExtracter(new ThrowingNativeExtracterMap() {
@Override
public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(l -> id[0] = l);
}
});
assertThat(specificCapture(g, text, idExtracter), is(true));
assertThat(id[0], equalTo(20000000000L));
double[] rating = new double[1];
GrokCaptureExtracter ratingExtracter = namedConfig(g, "rating").nativeExtracter(new ThrowingNativeExtracterMap() {
@Override
public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter) {
return buildExtracter.apply(d -> rating[0] = d);
}
});
assertThat(specificCapture(g, text, ratingExtracter), is(true));
assertThat(rating[0], equalTo(4820.092));
}
public void testNumericCapturesCoercion() {
@@ -621,4 +684,45 @@ public class GrokTests extends ESTestCase {
}
assertThat(fromGrok, equalTo(new TreeMap<>(nameToType)));
}
private GrokCaptureConfig namedConfig(Grok grok, String name) {
return grok.captureConfig().stream().filter(i -> i.name().equals(name)).findFirst().get();
}
private boolean specificCapture(Grok grok, String str, GrokCaptureExtracter extracter) {
byte[] utf8 = str.getBytes(StandardCharsets.UTF_8);
return grok.match(utf8, 0, utf8.length, extracter);
}
private abstract class ThrowingNativeExtracterMap implements NativeExtracterMap<GrokCaptureExtracter> {
@Override
public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
@Override
public GrokCaptureExtracter forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
@Override
public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
@Override
public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
@Override
public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
@Override
public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
throw new IllegalArgumentException();
}
}
}