fix issue with nested virtual column index supplier for partial paths when processing from raw (#15643)

Clint Wylie 2024-01-08 18:25:08 -08:00 committed by GitHub
parent 468b99e608
commit 911941b4a6
2 changed files with 124 additions and 1 deletion


@@ -1170,10 +1170,15 @@ public class NestedFieldVirtualColumn implements VirtualColumn
    if (theColumn instanceof CompressedNestedDataComplexColumn) {
      final CompressedNestedDataComplexColumn<?> nestedColumn = (CompressedNestedDataComplexColumn<?>) theColumn;
      final ColumnIndexSupplier nestedColumnPathIndexSupplier = nestedColumn.getColumnIndexSupplier(parts);
      if (nestedColumnPathIndexSupplier == null && processFromRaw) {
        // if processing from raw, a non-existent path from parts doesn't mean the path doesn't really exist in the raw data,
        // so fall back to using no indexes
        return NoIndexesColumnIndexSupplier.getInstance();
      }
      if (expectedType != null) {
        final Set<ColumnType> types = nestedColumn.getColumnTypes(parts);
        // if the expected output type is numeric but not all of the input types are numeric, we might have more
        // null values than the null value bitmap is tracking, wrap it
        // null values than the null value bitmap is tracking, fall back to not using indexes
        if (expectedType.isNumeric() && (types == null || types.stream().anyMatch(t -> !t.isNumeric()))) {
          return NoIndexesColumnIndexSupplier.getInstance();
        }
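
To see the shape of the change in isolation: when the virtual column processes from raw input, a missing index supplier for a partial path only means there is no dedicated nested-field column for that path, not that the path is absent from the raw data, so the filter must fall back to cursor-based value matching instead of relying on indexes. Below is a minimal, self-contained sketch of that guard; IndexSupplier, NoIndexes, PartialPathIndexes, and resolve are illustrative stand-ins, not Druid's actual ColumnIndexSupplier API.

// Illustrative stand-ins for the types used in the hunk above.
interface IndexSupplier
{
}

final class NoIndexes implements IndexSupplier
{
  private static final NoIndexes INSTANCE = new NoIndexes();

  static NoIndexes instance()
  {
    return INSTANCE;
  }
}

final class PartialPathIndexes
{
  /**
   * Mirrors the guard added above: a null supplier for a partial path while processing from raw
   * falls back to "no indexes" (forcing value matching at the cursor) rather than being treated
   * as proof that the path does not exist.
   */
  static IndexSupplier resolve(IndexSupplier pathIndexSupplier, boolean processFromRaw)
  {
    if (pathIndexSupplier == null && processFromRaw) {
      return NoIndexes.instance();
    }
    return pathIndexSupplier;
  }
}

The tests added below exercise exactly this path: the same scan with a NOT (v0 IS NULL) filter over partial paths ($.y[0] and $.b) is run against both persisted segments and a realtime incremental index, and the results are asserted to match.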


@@ -37,6 +37,8 @@ import org.apache.druid.query.aggregation.AggregationTestHelper;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.CountAggregatorFactory;
import org.apache.druid.query.filter.BoundDimFilter;
import org.apache.druid.query.filter.NotDimFilter;
import org.apache.druid.query.filter.NullFilter;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.query.ordering.StringComparators;
import org.apache.druid.query.spec.MultipleIntervalSegmentSpec;
@@ -799,6 +801,122 @@ public class NestedDataScanQueryTest extends InitializedNullHandlingTest
    Assert.assertEquals(resultsSegments.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
  }

  @Test
  public void testIngestAndScanSegmentsAndFilterPartialPathArrayIndex() throws Exception
  {
    // filter on a nested virtual column reading the partial path $.y[0] of complexObj, comparing
    // persisted segments against a realtime incremental index
    Query<ScanResultValue> scanQuery = Druids.newScanQueryBuilder()
        .dataSource("test_datasource")
        .intervals(
            new MultipleIntervalSegmentSpec(
                Collections.singletonList(Intervals.ETERNITY)
            )
        )
        .filters(
            NotDimFilter.of(NullFilter.forColumn("v0"))
        )
        .virtualColumns(
            new NestedFieldVirtualColumn(
                "complexObj",
                "v0",
                ColumnType.NESTED_DATA,
                null,
                true,
                "$.y[0]",
                false
            )
        )
        .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_COMPACTED_LIST)
        .limit(100)
        .context(ImmutableMap.of())
        .build();
    List<Segment> segs = NestedDataTestUtils.createSegmentsForJsonInput(
        tempFolder,
        closer,
        NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
        Granularities.HOUR,
        true,
        IndexSpec.DEFAULT
    );
    List<Segment> realtimeSegs = ImmutableList.of(
        NestedDataTestUtils.createIncrementalIndexForJsonInput(
            tempFolder,
            NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
            Granularities.NONE,
            true
        )
    );

    final Sequence<ScanResultValue> seq = helper.runQueryOnSegmentsObjs(segs, scanQuery);
    final Sequence<ScanResultValue> seqRealtime = helper.runQueryOnSegmentsObjs(realtimeSegs, scanQuery);
    List<ScanResultValue> results = seq.toList();
    List<ScanResultValue> resultsRealtime = seqRealtime.toList();
    logResults(results);
    logResults(resultsRealtime);
    Assert.assertEquals(1, results.size());
    Assert.assertEquals(4, ((List) results.get(0).getEvents()).size());
    Assert.assertEquals(results.size(), resultsRealtime.size());
    Assert.assertEquals(results.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
  }

  @Test
  public void testIngestAndScanSegmentsAndFilterPartialPath() throws Exception
  {
    // same as above, but for the partial path $.b of obj
    Query<ScanResultValue> scanQuery = Druids.newScanQueryBuilder()
        .dataSource("test_datasource")
        .intervals(
            new MultipleIntervalSegmentSpec(
                Collections.singletonList(Intervals.ETERNITY)
            )
        )
        .filters(
            NotDimFilter.of(NullFilter.forColumn("v0"))
        )
        .virtualColumns(
            new NestedFieldVirtualColumn(
                "obj",
                "v0",
                ColumnType.NESTED_DATA,
                null,
                true,
                "$.b",
                false
            )
        )
        .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_COMPACTED_LIST)
        .limit(100)
        .context(ImmutableMap.of())
        .build();
    List<Segment> segs = NestedDataTestUtils.createSegmentsForJsonInput(
        tempFolder,
        closer,
        NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
        Granularities.HOUR,
        true,
        IndexSpec.DEFAULT
    );
    List<Segment> realtimeSegs = ImmutableList.of(
        NestedDataTestUtils.createIncrementalIndexForJsonInput(
            tempFolder,
            NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
            Granularities.NONE,
            true
        )
    );

    final Sequence<ScanResultValue> seq = helper.runQueryOnSegmentsObjs(segs, scanQuery);
    final Sequence<ScanResultValue> seqRealtime = helper.runQueryOnSegmentsObjs(realtimeSegs, scanQuery);
    List<ScanResultValue> results = seq.toList();
    List<ScanResultValue> resultsRealtime = seqRealtime.toList();
    logResults(results);
    logResults(resultsRealtime);
    Assert.assertEquals(1, results.size());
    Assert.assertEquals(6, ((List) results.get(0).getEvents()).size());
    Assert.assertEquals(results.size(), resultsRealtime.size());
    Assert.assertEquals(results.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
  }

  private static void logResults(List<ScanResultValue> results)
  {
    StringBuilder bob = new StringBuilder();