From 74ff848ce54cd389bdae6153322ba8cbc32d5cb2 Mon Sep 17 00:00:00 2001 From: somu-imply <93540295+somu-imply@users.noreply.github.com> Date: Wed, 1 Feb 2023 04:15:08 -0800 Subject: [PATCH] Fixing incorrect filtering of nulls in an array when ingesting for JSON and Avro (#13712) --- .../common/parsers/JSONFlattenerMaker.java | 5 +- .../data/input/impl/JSONParseSpecTest.java | 39 ++++++++++++++- .../data/input/impl/JsonLineReaderTest.java | 6 ++- .../data/input/avro/AvroFlattenerMaker.java | 3 +- .../src/test/avro/some-datum.avsc | 12 +++-- .../data/input/AvroStreamInputFormatTest.java | 7 ++- .../input/AvroStreamInputRowParserTest.java | 7 ++- .../input/avro/AvroFlattenerMakerTest.java | 48 ++++++++++++++++++- 8 files changed, 106 insertions(+), 21 deletions(-) diff --git a/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java b/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java index 0b8244e29b7..5df98199080 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java +++ b/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java @@ -179,12 +179,11 @@ public class JSONFlattenerMaker implements ObjectFlatteners.FlattenerMaker newList = new ArrayList<>(); for (JsonNode entry : val) { - if (!entry.isNull()) { - newList.add(convertJsonNode(entry, enc)); - } + newList.add(convertJsonNode(entry, enc)); } return newList; } diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java index d5728f79886..b89c11258b0 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java @@ -102,7 +102,7 @@ public class JSONParseSpecTest ); final Map expected = new HashMap<>(); - expected.put("foo", new ArrayList()); + expected.put("foo", Collections.singletonList(null)); expected.put("baz", null); expected.put("bar", Collections.singletonList("test")); @@ -113,6 +113,43 @@ public class JSONParseSpecTest Assert.assertEquals(expected, parsedRow); } + @Test + public void testParseRowWithNullsInArrays() + { + final JSONParseSpec parseSpec = new JSONParseSpec( + new TimestampSpec("timestamp", "iso", null), + new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("foo"))), + new JSONPathSpec( + true, + ImmutableList.of( + // https://github.com/apache/druid/issues/6653 $.x.y.z where y is missing + new JSONPathFieldSpec(JSONPathFieldType.PATH, "foo", "$.baz.[?(@.maybe_object)].maybe_object"), + // https://github.com/apache/druid/issues/6653 $.x.y.z where y is from an array and is null + new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullFoo", "$.nullFoo.[?(@.value)][0].foo"), + new JSONPathFieldSpec(JSONPathFieldType.PATH, "baz", "$.baz"), + // $.x.y.z where x is from an array and is null + new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullBaz", "$.baz[1].foo.maybe_object"), + new JSONPathFieldSpec(JSONPathFieldType.PATH, "bar", "$.[?(@.something_else)].something_else.foo") + ) + ), + null, + false + ); + + final Map expected = new HashMap<>(); + expected.put("foo", new ArrayList<>()); + expected.put("baz", Arrays.asList("1", null, "2", null)); + expected.put("bar", Collections.singletonList("test")); + expected.put("nullFoo", new ArrayList<>()); + expected.put("nullBaz", null); + + final Parser parser = parseSpec.makeParser(); + final Map parsedRow = parser.parseToMap("{\"baz\":[\"1\",null,\"2\",null],\"nullFoo\":{\"value\":[null,null]},\"something_else\": {\"foo\": \"test\"}}"); + + Assert.assertNotNull(parsedRow); + Assert.assertEquals(expected, parsedRow); + } + @Test public void testSerde() throws IOException { diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java index 3ea56aa71f1..5b0a3a391ab 100644 --- a/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java +++ b/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java @@ -141,7 +141,11 @@ public class JsonLineReaderTest while (iterator.hasNext()) { final InputRow row = iterator.next(); Assert.assertEquals("test", Iterables.getOnlyElement(row.getDimension("bar"))); - Assert.assertEquals(Collections.emptyList(), row.getDimension("foo")); + // Since foo is in the JSONPathSpec it comes as an array of [null] + // row.getRaw("foo") comes out as an array of nulls but the + // row.getDimension("foo") stringifies it as "null". A future developer should aim to relieve this + Assert.assertEquals(Collections.singletonList(null), row.getRaw("foo")); + Assert.assertEquals(Collections.singletonList("null"), row.getDimension("foo")); Assert.assertTrue(row.getDimension("baz").isEmpty()); numActualIterations++; } diff --git a/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java b/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java index ba9d895b1f9..d557b65c847 100644 --- a/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java +++ b/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java @@ -39,7 +39,6 @@ import java.util.EnumSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -194,7 +193,7 @@ public class AvroFlattenerMaker implements ObjectFlatteners.FlattenerMaker) field).stream().filter(Objects::nonNull).map(this::transformValue).collect(Collectors.toList()); + return ((List) field).stream().map(this::transformValue).collect(Collectors.toList()); } else if (field instanceof GenericEnumSymbol) { return field.toString(); } else if (field instanceof GenericFixed) { diff --git a/extensions-core/avro-extensions/src/test/avro/some-datum.avsc b/extensions-core/avro-extensions/src/test/avro/some-datum.avsc index f03cb708df0..fe43d719ee1 100644 --- a/extensions-core/avro-extensions/src/test/avro/some-datum.avsc +++ b/extensions-core/avro-extensions/src/test/avro/some-datum.avsc @@ -33,10 +33,14 @@ }, { "name": "someStringArray", - "type": { - "type": "array", - "items": "string" - } + "type": [ + "null", + { + "type": "array", + "items": ["null","string"] + } + ], + "default":null }, { "name": "someIntValueMap", diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java index a59c32b8100..177766510d3 100644 --- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java +++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java @@ -81,7 +81,7 @@ import static org.apache.druid.data.input.AvroStreamInputRowParserTest.buildSome * "someOtherId": 6568719896, * "isValid": true, * "someIntArray": [1, 2, 4, 8], - * "someStringArray": ["8", "4", "2", "1"], + * "someStringArray": ["8", "4", "2", "1", null], * "someIntValueMap": {"8": 8, "1": 1, "2": 2, "4": 4}, * "someStringValueMap": {"8": "8", "1": "1", "2": "2", "4": "4"}, * "someUnion": "string as union", @@ -108,14 +108,13 @@ public class AvroStreamInputFormatTest extends InitializedNullHandlingTest private static final List DIMENSIONS_SCHEMALESS = Arrays.asList( "nested", SOME_OTHER_ID, - "someStringArray", "someIntArray", "someFloat", + "someUnion", EVENT_TYPE, + ID, "someFixed", "someBytes", - "someUnion", - ID, "someEnum", "someLong", "someInt", diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java index 3f206a77a0f..3bcec4e3c16 100644 --- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java +++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java @@ -88,14 +88,13 @@ public class AvroStreamInputRowParserTest private static final List DIMENSIONS_SCHEMALESS = Arrays.asList( "nested", SOME_OTHER_ID, - "someStringArray", "someIntArray", "someFloat", + "someUnion", EVENT_TYPE, + ID, "someFixed", "someBytes", - "someUnion", - ID, "someEnum", "someLong", "someInt", @@ -128,7 +127,7 @@ public class AvroStreamInputRowParserTest .setSubInt(SUB_INT_VALUE) .setSubLong(SUB_LONG_VALUE) .build(); - private static final List SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1"); + private static final List SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1", null); private static final List SOME_INT_ARRAY_VALUE = Arrays.asList(1, 2, 4, 8); static final Map SOME_INT_VALUE_MAP_VALUE = Maps.asMap( new HashSet<>(Arrays.asList("8", "2", "4", "1")), new Function() diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java index 28caea3049b..59e15deeb74 100644 --- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java +++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java @@ -164,11 +164,12 @@ public class AvroFlattenerMakerTest final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true); SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum(); - + // isFieldPrimitive on someStringArray is false + // as it contains items as nulls and strings + // so flattenerNested should only be able to discover it Assert.assertEquals( ImmutableSet.of( "someOtherId", - "someStringArray", "someIntArray", "someFloat", "eventType", @@ -210,6 +211,49 @@ public class AvroFlattenerMakerTest ); } + + @Test + public void testNullsInStringArray() + { + final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true); + + SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum(); + + Assert.assertEquals( + ImmutableSet.of( + "someStringValueMap", + "someOtherId", + "someStringArray", + "someIntArray", + "someFloat", + "isValid", + "someIntValueMap", + "eventType", + "someFixed", + "someBytes", + "someRecord", + "someMultiMemberUnion", + "someNull", + "someRecordArray", + "someUnion", + "id", + "someEnum", + "someLong", + "someInt", + "timestamp" + ), + ImmutableSet.copyOf(flattenerNested.discoverRootFields(input)) + ); + + ArrayList results = (ArrayList) flattenerNested.getRootField(input, "someStringArray"); + // 4 strings a 1 null for a total of 5 + Assert.assertEquals("8", results.get(0).toString()); + Assert.assertEquals("4", results.get(1).toString()); + Assert.assertEquals("2", results.get(2).toString()); + Assert.assertEquals("1", results.get(3).toString()); + Assert.assertEquals(null, results.get(4)); + } + private void getRootField_common(final SomeAvroDatum record, final AvroFlattenerMaker flattener) { Assert.assertEquals(