Merge pull request #15167 from talevy/custom_grok_patterns

add ability to define custom grok patterns within processor config
This commit is contained in:
Tal Levy 2015-12-03 08:27:35 -08:00
commit 41a953bf8b
4 changed files with 140 additions and 0 deletions

View File

@ -245,6 +245,7 @@ TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])
| Name | Required | Default | Description
| `match_field` | yes | - | The field to use for grok expression parsing
| `match_pattern` | yes | - | The grok expression to match and extract named captures with
| `pattern_definitions` | no | - | A map of pattern names to pattern definitions, declaring custom patterns to be used by the current processor. A custom pattern whose name matches an existing pattern overrides the pre-existing definition.
|======
Here is an example of using the provided patterns to extract out and name structured fields from a string field in
@ -295,6 +296,28 @@ This pipeline will insert these named captures as new fields within the document
}
--------------------------------------------------
An example of a pipeline specifying custom pattern definitions:
[source,js]
--------------------------------------------------
{
"description" : "...",
"processors": [
{
"grok": {
"match_field": "message",
"match_pattern": "my %{FAVORITE_DOG:dog} is colored %{RGB:color}",
"pattern_definitions" : {
"FAVORITE_DOG" : "beagle",
"RGB" : "RED|GREEN|BLUE"
}
}
}
]
}
--------------------------------------------------
==== Geoip processor
The GeoIP processor adds information about the geographical location of IP addresses, based on data from the Maxmind databases.

View File

@ -98,7 +98,10 @@ public final class GrokProcessor implements Processor {
public GrokProcessor create(Map<String, Object> config) throws Exception {
String matchField = ConfigurationUtils.readStringProperty(config, "field");
String matchPattern = ConfigurationUtils.readStringProperty(config, "pattern");
Map<String, String> customPatternBank = ConfigurationUtils.readOptionalMap(config, "pattern_definitions");
Map<String, String> patternBank = new HashMap<>();
Path patternsDirectory = grokConfigDirectory.resolve("patterns");
try (DirectoryStream<Path> stream = Files.newDirectoryStream(patternsDirectory)) {
for (Path patternFilePath : stream) {
@ -110,6 +113,10 @@ public final class GrokProcessor implements Processor {
}
}
if (customPatternBank != null) {
patternBank.putAll(customPatternBank);
}
Grok grok = new Grok(patternBank, matchPattern);
return new GrokProcessor(grok, matchField);
}

View File

@ -24,6 +24,7 @@ import org.junit.Before;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@ -53,4 +54,16 @@ public class GrokProcessorFactoryTests extends ESTestCase {
assertThat(processor.getGrok(), notNullValue());
}
/**
 * Builds a processor from a config that carries {@code pattern_definitions} and checks
 * that the resulting grok expression matches input using the custom pattern name.
 */
public void testCreateWithCustomPatterns() throws Exception {
    Map<String, String> customPatterns = Collections.singletonMap("MY_PATTERN", "foo");
    Map<String, Object> settings = new HashMap<>();
    settings.put("field", "_field");
    settings.put("pattern", "%{MY_PATTERN:name}!");
    settings.put("pattern_definitions", customPatterns);

    GrokProcessor grokProcessor = new GrokProcessor.Factory(configDir).create(settings);

    assertThat(grokProcessor.getMatchField(), equalTo("_field"));
    assertThat(grokProcessor.getGrok(), notNullValue());
    // "foo!" is the MY_PATTERN definition ("foo") followed by the literal "!" of the expression.
    assertThat(grokProcessor.getGrok().match("foo!"), equalTo(true));
}
}

View File

@ -46,3 +46,100 @@
- match: { _source.status: 400 }
- match: { _source.msg: "foo" }
---
# Stores a pipeline whose grok processor defines MY_PATTERN inline via
# "pattern_definitions", indexes a document through it, and checks that the
# named capture ends up in the `msg` field of the stored document.
"Test Grok Pipeline With Custom Pattern":
- do:
cluster.health:
wait_for_status: green
- do:
ingest.put_pipeline:
id: "my_pipeline"
body: >
{
"description": "_description",
"processors": [
{
"grok" : {
"field" : "field1",
"pattern" : "<%{MY_PATTERN:msg}>",
"pattern_definitions" : {
"MY_PATTERN" : "foo"
}
}
}
]
}
- match: { _id: "my_pipeline" }
# Simulate a Thread.sleep(), because pipelines are updated in the background.
# The health request below waits for 99 nodes with a 2s timeout and is expected
# to time out (the step catches request_timeout and asserts timed_out is true).
- do:
catch: request_timeout
cluster.health:
wait_for_nodes: 99
timeout: 2s
- match: { "timed_out": true }
# Index a document through the pipeline; field1 carries the text to be grokked.
- do:
ingest.index:
index: test
type: test
id: 1
pipeline_id: "my_pipeline"
body: {field1: "<foo>"}
# Fetch the document back and verify the value captured by MY_PATTERN.
- do:
get:
index: test
type: test
id: 1
- match: { _source.msg: "foo" }
---
# Same flow as a plain custom-pattern pipeline, but the custom definition is
# named NUMBER, which (per the test name) shares its name with another pattern.
# The final assertion expects the custom definition ("foo") to be the one applied.
"Test Grok Pipeline With Custom Pattern Sharing Same Name As Another":
- do:
cluster.health:
wait_for_status: green
- do:
ingest.put_pipeline:
id: "my_pipeline"
body: >
{
"description": "_description",
"processors": [
{
"grok" : {
"field" : "field1",
"pattern" : "<%{NUMBER:msg}>",
"pattern_definitions" : {
"NUMBER" : "foo"
}
}
}
]
}
- match: { _id: "my_pipeline" }
# Simulate a Thread.sleep(), because pipelines are updated in the background.
# The health request below waits for 99 nodes with a 2s timeout and is expected
# to time out (the step catches request_timeout and asserts timed_out is true).
- do:
catch: request_timeout
cluster.health:
wait_for_nodes: 99
timeout: 2s
- match: { "timed_out": true }
# Index a document through the pipeline; "<foo>" is not numeric, so a match
# here relies on the custom NUMBER definition.
- do:
ingest.index:
index: test
type: test
id: 1
pipeline_id: "my_pipeline"
body: {field1: "<foo>"}
# Fetch the document back and verify the value captured under the overridden name.
- do:
get:
index: test
type: test
id: 1
- match: { _source.msg: "foo" }