Fixing incorrect filtering of nulls in an array when ingesting for JSON and Avro (#13712)

This commit is contained in:
somu-imply 2023-02-01 04:15:08 -08:00 committed by GitHub
parent c95a26cae3
commit 74ff848ce5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 106 additions and 21 deletions

View File

@ -179,12 +179,11 @@ public class JSONFlattenerMaker implements ObjectFlatteners.FlattenerMaker<JsonN
return ((BinaryNode) val).binaryValue();
}
if (val.isArray()) {
List<Object> newList = new ArrayList<>();
for (JsonNode entry : val) {
if (!entry.isNull()) {
newList.add(convertJsonNode(entry, enc));
}
newList.add(convertJsonNode(entry, enc));
}
return newList;
}

View File

@ -102,7 +102,7 @@ public class JSONParseSpecTest
);
final Map<String, Object> expected = new HashMap<>();
expected.put("foo", new ArrayList());
expected.put("foo", Collections.singletonList(null));
expected.put("baz", null);
expected.put("bar", Collections.singletonList("test"));
@ -113,6 +113,43 @@ public class JSONParseSpecTest
Assert.assertEquals(expected, parsedRow);
}
@Test
public void testParseRowWithNullsInArrays()
{
final JSONParseSpec parseSpec = new JSONParseSpec(
new TimestampSpec("timestamp", "iso", null),
new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("foo"))),
new JSONPathSpec(
true,
ImmutableList.of(
// https://github.com/apache/druid/issues/6653 $.x.y.z where y is missing
new JSONPathFieldSpec(JSONPathFieldType.PATH, "foo", "$.baz.[?(@.maybe_object)].maybe_object"),
// https://github.com/apache/druid/issues/6653 $.x.y.z where y is from an array and is null
new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullFoo", "$.nullFoo.[?(@.value)][0].foo"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "baz", "$.baz"),
// $.x.y.z where x is from an array and is null
new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullBaz", "$.baz[1].foo.maybe_object"),
new JSONPathFieldSpec(JSONPathFieldType.PATH, "bar", "$.[?(@.something_else)].something_else.foo")
)
),
null,
false
);
final Map<String, Object> expected = new HashMap<>();
expected.put("foo", new ArrayList<>());
expected.put("baz", Arrays.asList("1", null, "2", null));
expected.put("bar", Collections.singletonList("test"));
expected.put("nullFoo", new ArrayList<>());
expected.put("nullBaz", null);
final Parser<String, Object> parser = parseSpec.makeParser();
final Map<String, Object> parsedRow = parser.parseToMap("{\"baz\":[\"1\",null,\"2\",null],\"nullFoo\":{\"value\":[null,null]},\"something_else\": {\"foo\": \"test\"}}");
Assert.assertNotNull(parsedRow);
Assert.assertEquals(expected, parsedRow);
}
@Test
public void testSerde() throws IOException
{

View File

@ -141,7 +141,11 @@ public class JsonLineReaderTest
while (iterator.hasNext()) {
final InputRow row = iterator.next();
Assert.assertEquals("test", Iterables.getOnlyElement(row.getDimension("bar")));
Assert.assertEquals(Collections.emptyList(), row.getDimension("foo"));
// Since foo is in the JSONPathSpec it comes as an array of [null]
// row.getRaw("foo") comes out as an array of nulls but the
// row.getDimension("foo") stringifies it as "null". A future developer should aim to relieve this
Assert.assertEquals(Collections.singletonList(null), row.getRaw("foo"));
Assert.assertEquals(Collections.singletonList("null"), row.getDimension("foo"));
Assert.assertTrue(row.getDimension("baz").isEmpty());
numActualIterations++;
}

View File

@ -39,7 +39,6 @@ import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
@ -194,7 +193,7 @@ public class AvroFlattenerMaker implements ObjectFlatteners.FlattenerMaker<Gener
} else if (field instanceof Utf8) {
return field.toString();
} else if (field instanceof List) {
return ((List<?>) field).stream().filter(Objects::nonNull).map(this::transformValue).collect(Collectors.toList());
return ((List<?>) field).stream().map(this::transformValue).collect(Collectors.toList());
} else if (field instanceof GenericEnumSymbol) {
return field.toString();
} else if (field instanceof GenericFixed) {

View File

@ -33,10 +33,14 @@
},
{
"name": "someStringArray",
"type": {
"type": "array",
"items": "string"
}
"type": [
"null",
{
"type": "array",
"items": ["null","string"]
}
],
"default":null
},
{
"name": "someIntValueMap",

View File

@ -81,7 +81,7 @@ import static org.apache.druid.data.input.AvroStreamInputRowParserTest.buildSome
* "someOtherId": 6568719896,
* "isValid": true,
* "someIntArray": [1, 2, 4, 8],
* "someStringArray": ["8", "4", "2", "1"],
* "someStringArray": ["8", "4", "2", "1", null],
* "someIntValueMap": {"8": 8, "1": 1, "2": 2, "4": 4},
* "someStringValueMap": {"8": "8", "1": "1", "2": "2", "4": "4"},
* "someUnion": "string as union",
@ -108,14 +108,13 @@ public class AvroStreamInputFormatTest extends InitializedNullHandlingTest
private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
"nested",
SOME_OTHER_ID,
"someStringArray",
"someIntArray",
"someFloat",
"someUnion",
EVENT_TYPE,
ID,
"someFixed",
"someBytes",
"someUnion",
ID,
"someEnum",
"someLong",
"someInt",

View File

@ -88,14 +88,13 @@ public class AvroStreamInputRowParserTest
private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
"nested",
SOME_OTHER_ID,
"someStringArray",
"someIntArray",
"someFloat",
"someUnion",
EVENT_TYPE,
ID,
"someFixed",
"someBytes",
"someUnion",
ID,
"someEnum",
"someLong",
"someInt",
@ -128,7 +127,7 @@ public class AvroStreamInputRowParserTest
.setSubInt(SUB_INT_VALUE)
.setSubLong(SUB_LONG_VALUE)
.build();
private static final List<CharSequence> SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1");
private static final List<CharSequence> SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1", null);
private static final List<Integer> SOME_INT_ARRAY_VALUE = Arrays.asList(1, 2, 4, 8);
static final Map<CharSequence, Integer> SOME_INT_VALUE_MAP_VALUE = Maps.asMap(
new HashSet<>(Arrays.asList("8", "2", "4", "1")), new Function<CharSequence, Integer>()

View File

@ -164,11 +164,12 @@ public class AvroFlattenerMakerTest
final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true);
SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum();
// isFieldPrimitive on someStringArray is false
// as it contains items as nulls and strings
// so flattenerNested should only be able to discover it
Assert.assertEquals(
ImmutableSet.of(
"someOtherId",
"someStringArray",
"someIntArray",
"someFloat",
"eventType",
@ -210,6 +211,49 @@ public class AvroFlattenerMakerTest
);
}
@Test
public void testNullsInStringArray()
{
final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true);
SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum();
Assert.assertEquals(
ImmutableSet.of(
"someStringValueMap",
"someOtherId",
"someStringArray",
"someIntArray",
"someFloat",
"isValid",
"someIntValueMap",
"eventType",
"someFixed",
"someBytes",
"someRecord",
"someMultiMemberUnion",
"someNull",
"someRecordArray",
"someUnion",
"id",
"someEnum",
"someLong",
"someInt",
"timestamp"
),
ImmutableSet.copyOf(flattenerNested.discoverRootFields(input))
);
ArrayList<Object> results = (ArrayList<Object>) flattenerNested.getRootField(input, "someStringArray");
// 4 strings a 1 null for a total of 5
Assert.assertEquals("8", results.get(0).toString());
Assert.assertEquals("4", results.get(1).toString());
Assert.assertEquals("2", results.get(2).toString());
Assert.assertEquals("1", results.get(3).toString());
Assert.assertEquals(null, results.get(4));
}
private void getRootField_common(final SomeAvroDatum record, final AvroFlattenerMaker flattener)
{
Assert.assertEquals(