introduce a "tree" type to the flattenSpec (#12177)

* introduce a "tree" type to the flattenSpec

* feedback - rename exprs to nodes, use CollectionUtils.isNullOrEmpty for guard

* feedback - expand docs to more clearly capture limitations of "tree" flattenSpec

* feedback - fix for typo on docs

* introduce a comment to explain defensive copy, tweak null handling

* fix: part of rebase

* mark ObjectFlatteners.FlattenerMaker as an ExtensionPoint and provide default for new tree type

* fix: objectflattener restore previous behavior to call getRootField for root type

* docs: ingestion/data-formats add note that ORC only supports path expressions

* chore: linter remove unused import

* fix: use correct newer form for empty DimensionsSpec in FlattenJSONBenchmark
This commit is contained in:
Jason Koch 2022-10-31 23:49:30 -07:00 committed by GitHub
parent 675fd982fb
commit 0d03ce435f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 311 additions and 44 deletions

View File

@ -56,6 +56,8 @@ public class FlattenJSONBenchmark
Parser flatParser;
Parser nestedParser;
Parser jqParser;
Parser treeJqParser;
Parser treeTreeParser;
Parser fieldDiscoveryParser;
Parser forcedPathParser;
int flatCounter = 0;
@ -82,6 +84,8 @@ public class FlattenJSONBenchmark
flatParser = gen.getFlatParser();
nestedParser = gen.getNestedParser();
jqParser = gen.getJqParser();
treeJqParser = gen.getTreeJqParser();
treeTreeParser = gen.getTreeTreeParser();
fieldDiscoveryParser = gen.getFieldDiscoveryParser();
forcedPathParser = gen.getForcedPathParser();
}
@ -112,6 +116,32 @@ public class FlattenJSONBenchmark
return parsed;
}
/**
 * Measures the average time to parse one nested event with {@code treeJqParser} and read back
 * every flattened value.
 */
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public Map<String, Object> treejqflatten(final Blackhole blackhole)
{
  final Map<String, Object> flattened = treeJqParser.parseToMap(nestedInputs.get(jqCounter));
  // Hand every value to the Blackhole so the lookup of each key is part of the measured work.
  for (String fieldName : flattened.keySet()) {
    blackhole.consume(flattened.get(fieldName));
  }
  // Move on to the next input, wrapping around after NUM_EVENTS events.
  jqCounter = (jqCounter + 1) % NUM_EVENTS;
  return flattened;
}
/**
 * Measures the average time to parse one nested event with {@code treeTreeParser} and read back
 * every flattened value.
 */
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public Map<String, Object> treetreeflatten(final Blackhole blackhole)
{
  final Map<String, Object> flattened = treeTreeParser.parseToMap(nestedInputs.get(jqCounter));
  // Hand every value to the Blackhole so the lookup of each key is part of the measured work.
  for (String fieldName : flattened.keySet()) {
    blackhole.consume(flattened.get(fieldName));
  }
  // Move on to the next input, wrapping around after NUM_EVENTS events.
  jqCounter = (jqCounter + 1) % NUM_EVENTS;
  return flattened;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)

View File

@ -35,6 +35,7 @@ import org.apache.druid.java.util.common.parsers.JSONPathSpec;
import org.apache.druid.java.util.common.parsers.Parser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
@ -209,6 +210,69 @@ public class FlattenJSONBenchmarkUtil
return spec.makeParser();
}
/**
 * Builds a JSON parser whose flattenSpec reads "ts", "d1" and "m3" as root fields and every nested
 * "e1", "e2" and "e3" value through a "jq" expression. Field discovery is disabled.
 */
public Parser getTreeJqParser()
{
  // Same fields, same order; kept in an ArrayList as before.
  final List<JSONPathFieldSpec> fieldSpecs = new ArrayList<>(
      Arrays.asList(
          JSONPathFieldSpec.createRootField("ts"),
          JSONPathFieldSpec.createRootField("d1"),
          JSONPathFieldSpec.createJqField("e1.d1", ".e1.d1"),
          JSONPathFieldSpec.createJqField("e1.d2", ".e1.d2"),
          JSONPathFieldSpec.createJqField("e2.d3", ".e2.d3"),
          JSONPathFieldSpec.createJqField("e2.d4", ".e2.d4"),
          JSONPathFieldSpec.createJqField("e2.d5", ".e2.d5"),
          JSONPathFieldSpec.createJqField("e2.d6", ".e2.d6"),
          JSONPathFieldSpec.createRootField("m3"),
          JSONPathFieldSpec.createJqField("e3.m1", ".e3.m1"),
          JSONPathFieldSpec.createJqField("e3.m2", ".e3.m2"),
          JSONPathFieldSpec.createJqField("e3.m3", ".e3.m3"),
          JSONPathFieldSpec.createJqField("e3.m4", ".e3.m4")
      )
  );

  final JSONParseSpec parseSpec = new JSONParseSpec(
      new TimestampSpec("ts", "iso", null),
      DimensionsSpec.EMPTY,
      new JSONPathSpec(false, fieldSpecs),
      null,
      null
  );
  return parseSpec.makeParser();
}
/**
 * Builds a JSON parser whose flattenSpec reads "ts", "d1" and "m3" as root fields and every nested
 * "e1", "e2" and "e3" value through a "tree" field (a list of key names). Field discovery is
 * disabled.
 */
public Parser getTreeTreeParser()
{
  // Same fields, same order; kept in an ArrayList as before.
  final List<JSONPathFieldSpec> fieldSpecs = new ArrayList<>(
      Arrays.asList(
          JSONPathFieldSpec.createRootField("ts"),
          JSONPathFieldSpec.createRootField("d1"),
          JSONPathFieldSpec.createTreeField("e1.d1", Arrays.asList("e1", "d1")),
          JSONPathFieldSpec.createTreeField("e1.d2", Arrays.asList("e1", "d2")),
          JSONPathFieldSpec.createTreeField("e2.d3", Arrays.asList("e2", "d3")),
          JSONPathFieldSpec.createTreeField("e2.d4", Arrays.asList("e2", "d4")),
          JSONPathFieldSpec.createTreeField("e2.d5", Arrays.asList("e2", "d5")),
          JSONPathFieldSpec.createTreeField("e2.d6", Arrays.asList("e2", "d6")),
          JSONPathFieldSpec.createRootField("m3"),
          JSONPathFieldSpec.createTreeField("e3.m1", Arrays.asList("e3", "m1")),
          JSONPathFieldSpec.createTreeField("e3.m2", Arrays.asList("e3", "m2")),
          JSONPathFieldSpec.createTreeField("e3.m3", Arrays.asList("e3", "m3")),
          JSONPathFieldSpec.createTreeField("e3.m4", Arrays.asList("e3", "m4"))
      )
  );

  final JSONParseSpec parseSpec = new JSONParseSpec(
      new TimestampSpec("ts", "iso", null),
      DimensionsSpec.EMPTY,
      new JSONPathSpec(false, fieldSpecs),
      null,
      null
  );
  return parseSpec.makeParser();
}
public String generateFlatEvent() throws Exception
{
String nestedEvent = generateNestedEvent();

View File

@ -109,6 +109,24 @@ public class JSONFlattenerMaker implements ObjectFlatteners.FlattenerMaker<JsonN
}
}
/**
 * Builds an extractor that descends from the given node through each key in {@code nodes}, in
 * order, and converts whatever it reaches with {@code finalizeConversionForMap}.
 *
 * The extractor returns {@code null} as soon as a lookup yields {@code null} while keys remain.
 *
 * @param nodes key names to follow, outermost first
 * @return function mapping a record to the converted value at that path
 */
@Override
public Function<JsonNode, Object> makeJsonTreeExtractor(final List<String> nodes)
{
  // Snapshot the keys so later changes to the caller's list cannot affect this extractor.
  final String[] path = nodes.toArray(new String[0]);
  return root -> {
    JsonNode cursor = root;
    int depth = 0;
    while (depth < path.length && cursor != null) {
      cursor = cursor.get(path[depth]);
      depth++;
    }
    if (depth < path.length) {
      // The walk stopped on a missing node before every key was consumed.
      return null;
    }
    return finalizeConversionForMap(cursor);
  };
}
@Override
public JsonProvider getJsonProvider()
{

View File

@ -22,7 +22,9 @@ package org.apache.druid.java.util.common.parsers;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import org.apache.druid.utils.CollectionUtils;
import java.util.List;
import java.util.Objects;
public class JSONPathFieldSpec
@ -30,25 +32,49 @@ public class JSONPathFieldSpec
private final JSONPathFieldType type;
private final String name;
private final String expr;
private final List<String> nodes;
@JsonCreator
public JSONPathFieldSpec(
@JsonProperty("type") JSONPathFieldType type,
@JsonProperty("name") String name,
@JsonProperty("expr") String expr
@JsonProperty("expr") String expr,
@JsonProperty("nodes") List<String> nodes
)
{
this.type = type;
this.name = Preconditions.checkNotNull(name, "Missing 'name' in field spec");
// If expr is null and type is root, use the name as the expr too.
if (expr == null && type == JSONPathFieldType.ROOT) {
this.expr = name;
} else {
// Validate required fields are present
switch (type) {
case ROOT:
this.expr = (expr == null) ? name : expr;
this.nodes = null;
break;
case TREE:
this.expr = null;
Preconditions.checkArgument(
!CollectionUtils.isNullOrEmpty(nodes),
"Missing 'nodes' for field[%s], was [%s]", name, nodes);
this.nodes = nodes;
break;
default:
this.expr = Preconditions.checkNotNull(expr, "Missing 'expr' for field[%s]", name);
this.nodes = null;
}
}
/**
 * Convenience constructor for field specs without a {@code nodes} list. Delegates to the
 * four-argument constructor with {@code nodes} set to {@code null}, so it cannot be used for
 * {@code TREE} fields, which require a non-empty {@code nodes} list.
 *
 * @param type field type
 * @param name name of the flattened field; must not be null
 * @param expr expression for the field; may be null for {@code ROOT}, where the name is used instead
 */
public JSONPathFieldSpec(
JSONPathFieldType type,
String name,
String expr
)
{
this(type, name, expr, null);
}
@JsonProperty
public JSONPathFieldType getType()
{
@ -67,6 +93,12 @@ public class JSONPathFieldSpec
return expr;
}
/**
 * Returns the key names configured for a {@code TREE} field.
 *
 * @return the list supplied at construction (not copied) for {@code TREE} specs; {@code null} for
 *         every other field type
 */
@JsonProperty
public List<String> getNodes()
{
return nodes;
}
@JsonCreator
public static JSONPathFieldSpec fromString(String name)
{
@ -88,6 +120,11 @@ public class JSONPathFieldSpec
return new JSONPathFieldSpec(JSONPathFieldType.ROOT, name, null);
}
/**
 * Creates a {@code TREE} field spec with no expression.
 *
 * @param name  name of the flattened field
 * @param nodes key names identifying the value to read; must not be null or empty
 * @return the new field spec
 */
public static JSONPathFieldSpec createTreeField(String name, List<String> nodes)
{
return new JSONPathFieldSpec(JSONPathFieldType.TREE, name, null, nodes);
}
@Override
public boolean equals(final Object o)
{
@ -100,13 +137,14 @@ public class JSONPathFieldSpec
final JSONPathFieldSpec that = (JSONPathFieldSpec) o;
return type == that.type &&
Objects.equals(name, that.name) &&
Objects.equals(expr, that.expr);
Objects.equals(expr, that.expr) &&
Objects.equals(nodes, that.nodes);
}
@Override
public int hashCode()
{
return Objects.hash(type, name, expr);
return Objects.hash(type, name, expr, nodes);
}
@Override
@ -116,6 +154,7 @@ public class JSONPathFieldSpec
"type=" + type +
", name='" + name + '\'' +
", expr='" + expr + '\'' +
", nodes='" + nodes + '\'' +
'}';
}
}

View File

@ -27,7 +27,8 @@ public enum JSONPathFieldType
{
ROOT,
PATH,
JQ;
JQ,
TREE;
@JsonValue
@Override

View File

@ -21,6 +21,7 @@ package org.apache.druid.java.util.common.parsers;
import com.google.common.collect.Iterables;
import com.jayway.jsonpath.spi.json.JsonProvider;
import org.apache.druid.guice.annotations.ExtensionPoint;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.UOE;
@ -64,6 +65,9 @@ public class ObjectFlatteners
case JQ:
extractor = flattenerMaker.makeJsonQueryExtractor(fieldSpec.getExpr());
break;
case TREE:
extractor = flattenerMaker.makeJsonTreeExtractor(fieldSpec.getNodes());
break;
default:
throw new UOE("Unsupported field type[%s]", fieldSpec.getType());
}
@ -208,6 +212,7 @@ public class ObjectFlatteners
};
}
@ExtensionPoint
public interface FlattenerMaker<T>
{
JsonProvider getJsonProvider();
@ -231,6 +236,14 @@ public class ObjectFlatteners
*/
Function<T, Object> makeJsonQueryExtractor(String expr);
/**
 * Create a "field" extractor for "tree" field specs, which identify a nested value by a list of
 * key names instead of an expression.
 *
 * The default implementation throws {@link UOE}, so a flattener maker that does not override this
 * method rejects "tree" fields when the extractor is requested.
 *
 * @param nodes key names taken from the field spec
 * @return function extracting the addressed value from a record
 */
default Function<T, Object> makeJsonTreeExtractor(List<String> nodes)
{
throw new UOE("makeJsonTreeExtractor has not been implemented.");
}
/**
* Convert object to Java {@link Map} using {@link #getJsonProvider()} and {@link #finalizeConversionForMap} to
* extract and convert data

View File

@ -32,6 +32,7 @@ import org.junit.Assert;
import org.junit.Test;
import java.io.IOException;
import java.util.Arrays;
public class JsonInputFormatTest
{
@ -48,7 +49,9 @@ public class JsonInputFormatTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
ImmutableMap.of(Feature.ALLOW_COMMENTS.name(), true, Feature.ALLOW_UNQUOTED_FIELD_NAMES.name(), false),

View File

@ -52,7 +52,11 @@ public class JsonLineReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -83,12 +87,16 @@ public class JsonLineReaderTest
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("tree_baz")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("jq_omg")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("tree_omg")));
Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
Assert.assertTrue(row.getDimension("tree_baz2").isEmpty());
Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
Assert.assertTrue(row.getDimension("tree_omg2").isEmpty());
numActualIterations++;
}
Assert.assertEquals(numExpectedIterations, numActualIterations);
@ -148,7 +156,8 @@ public class JsonLineReaderTest
new JSONPathSpec(
true,
ImmutableList.of(
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg")
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg"))
)
),
null,
@ -175,10 +184,11 @@ public class JsonLineReaderTest
int numActualIterations = 0;
while (iterator.hasNext()) {
final InputRow row = iterator.next();
Assert.assertEquals(Arrays.asList("path_omg", "timestamp", "bar", "foo"), row.getDimensions());
Assert.assertEquals(Arrays.asList("path_omg", "tree_omg", "timestamp", "bar", "foo"), row.getDimensions());
Assert.assertTrue(row.getDimension("bar").isEmpty());
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertTrue(row.getDimension("path_omg").isEmpty());
Assert.assertTrue(row.getDimension("tree_omg").isEmpty());
numActualIterations++;
}
Assert.assertEquals(numExpectedIterations, numActualIterations);
@ -192,7 +202,8 @@ public class JsonLineReaderTest
new JSONPathSpec(
true,
ImmutableList.of(
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg")
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg"))
)
),
null,
@ -219,10 +230,11 @@ public class JsonLineReaderTest
int numActualIterations = 0;
while (iterator.hasNext()) {
final InputRow row = iterator.next();
Assert.assertEquals(Arrays.asList("path_omg", "timestamp", "bar", "foo"), row.getDimensions());
Assert.assertEquals(Arrays.asList("path_omg", "tree_omg", "timestamp", "bar", "foo"), row.getDimensions());
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("bar")));
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("a", Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals("a", Iterables.getOnlyElement(row.getDimension("tree_omg")));
numActualIterations++;
}
Assert.assertEquals(numExpectedIterations, numActualIterations);
@ -236,7 +248,8 @@ public class JsonLineReaderTest
new JSONPathSpec(
true,
ImmutableList.of(
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg")
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg"))
)
),
null,
@ -263,10 +276,11 @@ public class JsonLineReaderTest
int numActualIterations = 0;
while (iterator.hasNext()) {
final InputRow row = iterator.next();
Assert.assertEquals(Arrays.asList("path_omg", "timestamp", "foo"), row.getDimensions());
Assert.assertEquals(Arrays.asList("path_omg", "tree_omg", "timestamp", "foo"), row.getDimensions());
Assert.assertTrue(row.getDimension("bar").isEmpty());
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("a", Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals("a", Iterables.getOnlyElement(row.getDimension("tree_omg")));
numActualIterations++;
}
Assert.assertEquals(numExpectedIterations, numActualIterations);

View File

@ -39,6 +39,8 @@ import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
public class JsonReaderTest
{
@ -57,7 +59,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -95,12 +101,16 @@ public class JsonReaderTest
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("tree_baz")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("jq_omg")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("tree_omg")));
Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
Assert.assertTrue(row.getDimension("tree_baz2").isEmpty());
Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
Assert.assertTrue(row.getDimension("tree_omg2").isEmpty());
}
Assert.assertEquals(numExpectedIterations, numActualIterations);
@ -119,7 +129,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -162,12 +176,16 @@ public class JsonReaderTest
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("tree_baz")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("jq_omg")));
Assert.assertEquals("1", Iterables.getOnlyElement(row.getDimension("tree_omg")));
Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
Assert.assertTrue(row.getDimension("tree_baz2").isEmpty());
Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
Assert.assertTrue(row.getDimension("tree_omg2").isEmpty());
numActualIterations++;
}
@ -188,7 +206,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -244,7 +266,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -287,12 +313,16 @@ public class JsonReaderTest
Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("tree_baz")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("path_omg")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("jq_omg")));
Assert.assertEquals(msgId, Iterables.getOnlyElement(row.getDimension("tree_omg")));
Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
Assert.assertTrue(row.getDimension("tree_baz2").isEmpty());
Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
Assert.assertTrue(row.getDimension("tree_omg2").isEmpty());
}
}
}
@ -312,7 +342,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -370,7 +404,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,
@ -428,7 +466,11 @@ public class JsonReaderTest
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz", null, Collections.singletonList("baz")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_baz2", null, Collections.singletonList("baz2")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg", null, Arrays.asList("o", "mg")),
new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree_omg2", null, Arrays.asList("o", "mg2"))
)
),
null,

View File

@ -28,6 +28,7 @@ import org.junit.rules.ExpectedException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -173,6 +174,10 @@ public class JSONPathParserTest
fields.add(new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq-nested-foo.bar2", ".foo.bar2"));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq-heybarx0", ".hey[0].barx"));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq-met-array", ".met.a"));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree-simpleVal", null, Collections.singletonList("simpleVal")));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree-timestamp", null, Collections.singletonList("timestamp")));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree-nested-foo.bar2", null, Arrays.asList("foo", "bar2")));
fields.add(new JSONPathFieldSpec(JSONPathFieldType.TREE, "tree-met-array", null, Arrays.asList("met", "a")));
final Parser<String, Object> jsonParser = new JSONPathParser(new JSONPathSpec(false, fields), null, false);
final Map<String, Object> jsonMap = jsonParser.parseToMap(NESTED_JSON);
@ -180,6 +185,8 @@ public class JSONPathParserTest
// Root fields
Assert.assertEquals("text", jsonMap.get("simpleVal"));
Assert.assertEquals("2999", jsonMap.get("timestamp"));
Assert.assertEquals("text", jsonMap.get("tree-simpleVal"));
Assert.assertEquals("2999", jsonMap.get("tree-timestamp"));
// Nested fields
Assert.assertEquals("bbb", jsonMap.get("nested-foo.bar2"));
@ -189,6 +196,9 @@ public class JSONPathParserTest
Assert.assertEquals("asdf", jsonMap.get("jq-heybarx0"));
Assert.assertEquals(ImmutableList.of(7L, 8L, 9L), jsonMap.get("jq-met-array"));
Assert.assertEquals(ImmutableList.of(7L, 8L, 9L), jsonMap.get("tree-met-array"));
Assert.assertEquals("bbb", jsonMap.get("tree-nested-foo.bar2"));
// Fields that should not be discovered
Assert.assertFalse(jsonMap.containsKey("newmet"));
Assert.assertFalse(jsonMap.containsKey("foo.bar1"));

View File

@ -231,7 +231,7 @@ Configure the ORC `inputFormat` to load ORC data as follows:
| Field | Type | Description | Required |
|-------|------|-------------|----------|
| type | String | Set value to `orc`. | yes |
| flattenSpec | JSON Object | Specifies flattening configuration for nested ORC data. See [`flattenSpec`](#flattenspec) for more info. | no |
| flattenSpec | JSON Object | Specifies flattening configuration for nested ORC data. Only 'path' expressions are supported ('jq' and 'tree' are unavailable). See [`flattenSpec`](#flattenspec) for more info. | no |
| binaryAsString | Boolean | Specifies if the binary orc column which is not logically marked as a string should be treated as a UTF-8 encoded string. | no (default = false) |
For example:
@ -262,9 +262,9 @@ To use the Parquet input format load the Druid Parquet extension ([`druid-parque
Configure the Parquet `inputFormat` to load Parquet data as follows:
| Field | Type | Description | Required |
|-------|------|-------------|----------|
|-------|------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
|type| String| Set value to `parquet`. | yes |
|flattenSpec| JSON Object | Define a [`flattenSpec`](#flattenspec) to extract nested values from a Parquet file. Only 'path' expressions are supported ('jq' is unavailable).| no (default will auto-discover 'root' level properties) |
|flattenSpec| JSON Object | Define a [`flattenSpec`](#flattenspec) to extract nested values from a Parquet file. Only 'path' expressions are supported ('jq' and 'tree' are unavailable). | no (default will auto-discover 'root' level properties) |
| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no (default = false) |
For example:
@ -510,9 +510,9 @@ See the [Avro Types](../development/extensions-core/avro.md#avro-types) section
Configure the Avro OCF `inputFormat` to load Avro OCF data as follows:
| Field | Type | Description | Required |
|-------|------|-------------|----------|
|-------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
|type| String| Set value to `avro_ocf`. | yes |
|flattenSpec| JSON Object |Define a [`flattenSpec`](#flattenspec) to extract nested values from Avro records. Only 'path' expressions are supported ('jq' is unavailable).| no (default will auto-discover 'root' level properties) |
|flattenSpec| JSON Object | Define a [`flattenSpec`](#flattenspec) to extract nested values from Avro records. Only 'path' expressions are supported ('jq' and 'tree' are unavailable). | no (default will auto-discover 'root' level properties) |
|schema| JSON Object | Define a reader schema to be used when parsing Avro records. This is useful when parsing multiple versions of Avro OCF file data. | no (default will decode using the writer schema contained in the OCF file) |
| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no (default = false) |
@ -559,9 +559,9 @@ For example:
Configure the Protobuf `inputFormat` to load Protobuf data as follows:
| Field | Type | Description | Required |
|-------|------|-------------|----------|
|-------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
|type| String| Set value to `protobuf`. | yes |
|flattenSpec| JSON Object |Define a [`flattenSpec`](#flattenspec) to extract nested values from a Protobuf record. Note that only 'path' expression are supported ('jq' is unavailable).| no (default will auto-discover 'root' level properties) |
|flattenSpec| JSON Object | Define a [`flattenSpec`](#flattenspec) to extract nested values from a Protobuf record. Note that only 'path' expressions are supported ('jq' and 'tree' are unavailable). | no (default will auto-discover 'root' level properties) |
|`protoBytesDecoder`| JSON Object | Specifies how to decode bytes to Protobuf record. | yes |
For example:
@ -609,6 +609,7 @@ For example:
"fields": [
{ "name": "baz", "type": "root" },
{ "name": "foo_bar", "type": "path", "expr": "$.foo.bar" },
{ "name": "foo_other_bar", "type": "tree", "nodes": ["foo", "other", "bar"] },
{ "name": "first_food", "type": "jq", "expr": ".thing.food[1]" }
]
}
@ -623,9 +624,10 @@ Each entry in the `fields` list can have the following components:
| Field | Description | Default |
|-------|-------------|---------|
| type | Options are as follows:<br /><br /><ul><li>`root`, referring to a field at the root level of the record. Only really useful if `useFieldDiscovery` is false.</li><li>`path`, referring to a field using [JsonPath](https://github.com/jayway/JsonPath) notation. Supported by most data formats that offer nesting, including `avro`, `json`, `orc`, and `parquet`.</li><li>`jq`, referring to a field using [jackson-jq](https://github.com/eiiches/jackson-jq) notation. Only supported for the `json` format.</li></ul> | none (required) |
| type | Options are as follows:<br /><br /><ul><li>`root`, referring to a field at the root level of the record. Only really useful if `useFieldDiscovery` is false.</li><li>`path`, referring to a field using [JsonPath](https://github.com/jayway/JsonPath) notation. Supported by most data formats that offer nesting, including `avro`, `json`, `orc`, and `parquet`.</li><li>`jq`, referring to a field using [jackson-jq](https://github.com/eiiches/jackson-jq) notation. Only supported for the `json` format.</li><li>`tree`, referring to a nested field from the root level of the record. Useful and more efficient than `path` or `jq` if a simple hierarchical fetch is required. Only supported for the `json` format.</li></ul> | none (required) |
| name | Name of the field after flattening. This name can be referred to by the [`timestampSpec`](./ingestion-spec.md#timestampspec), [`transformSpec`](./ingestion-spec.md#transformspec), [`dimensionsSpec`](./ingestion-spec.md#dimensionsspec), and [`metricsSpec`](./ingestion-spec.md#metricsspec).| none (required) |
| expr | Expression for accessing the field while flattening. For type `path`, this should be [JsonPath](https://github.com/jayway/JsonPath). For type `jq`, this should be [jackson-jq](https://github.com/eiiches/jackson-jq) notation. For other types, this parameter is ignored. | none (required for types `path` and `jq`) |
| nodes | For `tree` only. Multiple-expression field for accessing the field while flattening, representing the hierarchy of field names to read. For other types, this parameter must not be provided. | none (required for type `tree`) |
#### Notes on flattening
@ -690,7 +692,8 @@ See [Avro specification](http://avro.apache.org/docs/1.7.7/spec.html#Schema+Reso
| fromPigAvroStorage | Boolean | Specifies whether the data file is stored using AvroStorage. | no(default == false) |
An Avro parseSpec can contain a [`flattenSpec`](#flattenspec) using either the "root" or "path"
field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro.
field types, which can be used to read nested Avro records. The "jq" and "tree" field types are not currently supported
for Avro.
For example, using Avro Hadoop parser with custom reader's schema file:
@ -1208,7 +1211,7 @@ This parser is for [stream ingestion](./index.md#streaming) and reads Avro data
| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. Should be an "avro" parseSpec. | yes |
An Avro parseSpec can contain a [`flattenSpec`](#flattenspec) using either the "root" or "path"
field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro.
field types, which can be used to read nested Avro records. The "jq" and "tree" field types are not currently supported for Avro.
For example, using Avro stream parser with schema repo Avro bytes decoder:

View File

@ -140,6 +140,16 @@ public class AvroFlattenerMaker implements ObjectFlatteners.FlattenerMaker<Gener
throw new UnsupportedOperationException("Avro + JQ not supported");
}
/**
 * Creates an extractor for a "tree" flattenSpec field over Avro records.
 *
 * Only a node list with exactly one entry is handled; in that case the returned function reads the
 * named field through {@code getRootField}. Any other list size is rejected.
 *
 * @param nodes the hierarchy of field names to read
 * @return a function that fetches the single named field from a record
 * @throws UnsupportedOperationException if {@code nodes} does not contain exactly one element
 */
@Override
public Function<GenericRecord, Object> makeJsonTreeExtractor(List<String> nodes)
{
  // Reject anything other than a single-level lookup up front.
  if (nodes.size() != 1) {
    throw new UnsupportedOperationException("Avro + nested tree extraction not supported");
  }
  return genericRecord -> getRootField(genericRecord, nodes.get(0));
}
@Override
public JsonProvider getJsonProvider()
{

View File

@ -91,6 +91,16 @@ public class OrcStructFlattenerMaker implements ObjectFlatteners.FlattenerMaker<
throw new UnsupportedOperationException("ORC flattener does not support JQ");
}
/**
 * Creates an extractor for a "tree" flattenSpec field over ORC structs.
 *
 * Only a node list with exactly one entry is handled; in that case the returned function reads the
 * named field through {@code getRootField}. Any other list size is rejected.
 *
 * @param nodes the hierarchy of field names to read
 * @return a function that fetches the single named field from a struct
 * @throws UnsupportedOperationException if {@code nodes} does not contain exactly one element
 */
@Override
public Function<OrcStruct, Object> makeJsonTreeExtractor(List<String> nodes)
{
  // Reject anything other than a single-level lookup up front.
  if (nodes.size() != 1) {
    throw new UnsupportedOperationException("ORC flattener does not support nested root queries");
  }
  return struct -> getRootField(struct, nodes.get(0));
}
@Override
public JsonProvider getJsonProvider()
{

View File

@ -88,6 +88,16 @@ public class ParquetGroupFlattenerMaker implements ObjectFlatteners.FlattenerMak
throw new UnsupportedOperationException("Parquet does not support JQ");
}
/**
 * Creates an extractor for a "tree" flattenSpec field over Parquet groups.
 *
 * Only a node list with exactly one entry is handled; in that case the returned function reads the
 * named field through {@code getRootField}. Any other list size is rejected.
 *
 * @param nodes the hierarchy of field names to read
 * @return a function that fetches the single named field from a group
 * @throws UnsupportedOperationException if {@code nodes} does not contain exactly one element
 */
@Override
public Function<Group, Object> makeJsonTreeExtractor(List<String> nodes)
{
  if (nodes.size() == 1) {
    // Resolve the field name once rather than on every extracted record.
    final String fieldName = nodes.get(0);
    return (Group group) -> getRootField(group, fieldName);
  }

  throw new UnsupportedOperationException("Parquet does not support nested tree extraction");
}
@Override
public JsonProvider getJsonProvider()
{